Source code for reader.hdf5_reader

#!/usr/bin/python

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""

import os
import h5py
import numpy as np

from dmp import dmp


[docs]class hdf5_reader(object): # pylint: disable=invalid-name """ Class related to handling the functions for interacting directly with the HDF5 files. All required information should be passed to this class. """ def __init__(self, user_id, file_id, cnf_loc=''): """ Initialise the module and set default values Parameters ---------- user_id : str Identifier to uniquely locate the users files. Can be set to "common" if the files can be shared between users file_id : str Location of the file in the file system Example ------- .. code-block:: python :linenos: from hdf5_reader import hdf5_reader h5r = hdf5_reader('test') """ self.test_file = os.path.join(os.path.dirname(__file__), "../tests/data/region_idx.hdf5") self.user_id = user_id # Open the hdf5 file if user_id == 'test': resource_path = os.path.join(os.path.dirname(__file__), self.test_file) self.file_handle = h5py.File(resource_path, "r") else: self.user_id = user_id dm_handle = dmp(cnf_loc) file_obj = dm_handle.get_file_by_id(user_id, file_id) self.file_handle = h5py.File(file_obj['file_path'], 'r')
[docs] def close(self): """ Tidy function to close file handles Example ------- .. code-block:: python :linenos: from hdf5_reader import hdf5_reader h5r = hdf5_reader('test') h5r.close() """ self.file_handle.close()
[docs] def get_assemblies(self): """ List all assemblies for which there are files that have been indexed Returns ------- assembly : list List of assemblies in the index Example ------- .. code-block:: python :linenos: from hdf5_reader import hdf5_reader h5r = hdf5_reader('test') h5r.assemblies() """ return [asm for asm in self.file_handle if asm != 'meta']
[docs] def get_chromosomes(self, assembly): """ List all chromosomes that are covered by the index Parameters ---------- assembly : str Genome assembly ID Returns ------- chromosomes : list List of the chromosomes for a given assembly in the index Example ------- .. code-block:: python :linenos: from hdf5_reader import hdf5_reader h5r = hdf5_reader('test') asm = h5r.assemblies() chr_list = h5r.get_chromosomes(asm[0]) """ grp = self.file_handle[assembly] cid = list(np.nonzero(grp['chromosomes'])) return [grp['chromosomes'][i].decode('utf-8') for i in cid[0]]
[docs] def get_files(self, assembly): """ List all files for an assembly. If files are missing they can either get loaded or the search can be performed directly on the bigBed files Parameters ---------- assembly : str Genome assembly ID Returns ------- file_ids : list List of file ids for a given assembly in the index Example ------- .. code-block:: python :linenos: from hdf5_reader import hdf5_reader h5r = hdf5_reader('test') asm = h5r.assemblies() file_list = h5r.get_files(asm[0]) """ grp = self.file_handle[assembly] fid1 = list(np.nonzero(grp['files'][0])) fid1k = list(np.nonzero(grp['files'][1])) return { 1: [grp['files'][i] for i in fid1[0]], 1000: [grp['files'][i] for i in fid1k[0]] }
[docs] def get_regions(self, assembly, chromosome_id, start, end): """ List files that have data in a given region. Parameters ---------- assembly : str Genome assembly ID chromosome_id : str Chromosome names as listed by the get_files function start : int Start position for the region of interest end : int End position for the region of interest Returns ------- file_ids : list List of the file_ids that have sequence features within the region of interest Example ------- .. code-block:: python :linenos: from hdf5_reader import hdf5_reader h5r = hdf5_reader('test') asm = h5r.assemblies() file_list = h5r.get_chromosomes(asm[0], 1, 1000000, 1100000) """ file_idx = self.get_files(assembly) chrom_idx = self.get_chromosomes(assembly) grp = self.file_handle[assembly] dset1 = grp['data1'] dset1k = grp['data1k'] chromosome = str(chromosome_id) start = int(start) end = int(end) # chr X file X position dnp1 = dset1[chrom_idx.index(chromosome), :, start:end] dnp1k = dset1k[chrom_idx.index(chromosome), :, start:end] f_idx = [] dnp_size = len(dnp1k) for i in range(dnp_size): if np.any(dnp1[i]) is True: f_idx.append(file_idx[1][i]) dnp_size = len(dnp1k) for i in range(dnp_size): if np.any(dnp1k[i]) is True: f_idx.append(file_idx[1000][i]) return f_idx