Codebase list ariba / HEAD ariba / read_store.py
HEAD

Tree @HEAD (Download .tar.gz)

read_store.py @HEADraw · history · blame

import pyfastaq
import os
import pysam
from ariba import common

class Error (Exception): pass

class ReadStore:
    def __init__(self, infile, outprefix, log_fh=None):
        assert infile != outprefix
        self.infile = os.path.abspath(infile)
        self.outprefix = os.path.abspath(outprefix)
        self.outfile = os.path.abspath(outprefix) + '.gz'

        if not os.path.exists(self.infile):
            raise Error('File not found ' + self.infile + '. Cannot continue')

        self._sort_file(self.infile, self.outprefix, log_fh)
        self._compress_and_index_file(self.outprefix, log_fh)
        os.unlink(self.outprefix)


    @staticmethod
    def _sort_file(infile, outfile, log_fh=None):
        cmd = 'sort -k1,1 -k 2,2n ' + infile + ' > ' + outfile
        verbose = log_fh is not None
        common.syscall(cmd, verbose=verbose, verbose_filehandle=log_fh)


    @staticmethod
    def _compress_and_index_file(infile, log_fh=None):
        if log_fh is not None:
            print('Compressing file', infile, file=log_fh, flush=True)
        pysam.tabix_compress(infile, infile + '.gz')
        pysam.tabix_index(infile + '.gz', seq_col=0, start_col=1, end_col=1)


    def get_reads(self, cluster_name, out1, out2=None, fasta=False, log_fh=None, wanted_ids=None):
        total_reads = 0
        total_bases = 0

        if log_fh is not None:
            print('Getting reads for', cluster_name, 'from', self.outfile, file=log_fh)
        tabix_file = pysam.TabixFile(self.outfile)
        f_out1 = pyfastaq.utils.open_file_write(out1)
        if out2 is None:
            f_out2 = f_out1
        else:
            f_out2 = pyfastaq.utils.open_file_write(out2)

        for line in tabix_file.fetch(reference=cluster_name):
            cluster, number, seq, qual = line.rstrip().split()
            number = int(number)
            if wanted_ids is not None:
                new_number = number if number % 2 else number - 1
                if new_number not in wanted_ids:
                    continue

            if number % 2 == 0:
                if fasta:
                    print('>' + str(number - 1) + '/2', seq, sep='\n', file=f_out2)
                else:
                    print('@' + str(number - 1) + '/2', seq, '+', qual, sep='\n', file=f_out2)
            else:
                if fasta:
                    print('>' + str(number) + '/1', seq, sep='\n', file=f_out1)
                else:
                    print('@' + str(number) + '/1', seq, '+', qual, sep='\n', file=f_out1)

            total_reads += 1
            total_bases += len(qual)

        pyfastaq.utils.close(f_out1)
        if out2 is not None:
            pyfastaq.utils.close(f_out2)
        if log_fh is not None:
            print('Finished getting reads for', cluster_name, 'from', self.outfile, file=log_fh)

        return total_reads, total_bases

    def clean(self):
        os.unlink(self.outfile)
        os.unlink(self.outfile + '.tbi')