Codebase list ariba / HEAD ariba / assembly.py
HEAD

Tree @HEAD (Download .tar.gz)

assembly.py @HEADraw · history · blame

import os
import sys
import pyfastaq
import pymummer
import fermilite_ariba
from ariba import common, mapping, bam_parse, external_progs, ref_seq_chooser
import shlex

class Error (Exception): pass

class Assembly:
    def __init__(self,
      reads1,
      reads2,
      ref_fasta,
      ref_fastas,
      working_dir,
      final_assembly_fa,
      final_assembly_bam,
      log_fh,
      all_reference_fasta,
      contig_name_prefix='ctg',
      assembler='fermilite',
      max_insert=1000,
      min_scaff_depth=10,
      min_scaff_length=50,
      nucmer_min_id=90,
      nucmer_min_len=20,
      nucmer_breaklen=200,
      extern_progs=None,
      clean=True,
      spades_mode="wgs",
      spades_options=None,
      threads=1
    ):
        self.reads1 = os.path.abspath(reads1)
        self.reads2 = os.path.abspath(reads2)
        self.ref_fasta = os.path.abspath(ref_fasta)
        self.ref_fastas = os.path.abspath(ref_fastas)
        self.working_dir = os.path.abspath(working_dir)
        self.final_assembly_fa = os.path.abspath(final_assembly_fa)
        self.final_assembly_bam = os.path.abspath(final_assembly_bam)
        self.log_fh = log_fh
        self.all_reference_fasta = os.path.abspath(all_reference_fasta)
        self.contig_name_prefix = contig_name_prefix

        self.ref_seq_name = None
        self.assembler = assembler
        self.max_insert = max_insert
        self.min_scaff_depth = min_scaff_depth
        self.min_scaff_length = min_scaff_length
        self.nucmer_min_id = nucmer_min_id
        self.nucmer_min_len = nucmer_min_len
        self.nucmer_breaklen = nucmer_breaklen
        self.clean = clean
        self.spades_mode = spades_mode
        self.spades_options = spades_options
        self.threads = threads

        if extern_progs is None:
            self.extern_progs = external_progs.ExternalProgs(using_spades=self.assembler == 'spades')
        else:
            self.extern_progs = extern_progs

        try:
            os.mkdir(self.working_dir)
        except:
            raise Error('Error mkdir ' + self.working_dir)

        self.assembler_dir = os.path.join(self.working_dir, 'Assemble')
        self.all_assembly_contigs_fa = os.path.join(self.working_dir, 'debug_all_contigs.fa')
        self.best_assembly_fa = os.path.join(self.working_dir, 'debug_best_assembly.fa')
        self.final_assembly_fa = os.path.abspath(final_assembly_fa)


    @staticmethod
    def _run_fermilite(reads_in, fasta_out, log_out, name_prefix):
        return fermilite_ariba.fermilite_ariba(reads_in, fasta_out, log_out, name_prefix)


    def _assemble_with_fermilite(self):
        cwd = os.getcwd()
        try:
            os.chdir(self.working_dir)
        except:
            raise Error('Error chdir ' + self.working_dir)

        interleaved_reads = 'reads.fq'
        pyfastaq.tasks.interleave(self.reads1, self.reads2, interleaved_reads)
        fermilite_log = self.all_assembly_contigs_fa + '.log'
        got_from_fermilite = self._run_fermilite(interleaved_reads, self.all_assembly_contigs_fa, fermilite_log, self.contig_name_prefix)
        os.unlink(interleaved_reads)
        if os.path.exists(fermilite_log):
            with open(fermilite_log) as f:
                for line in f:
                    print(line, end='', file=self.log_fh)

            os.unlink(fermilite_log)

        self.assembled_ok = (got_from_fermilite == 0)
        os.chdir(cwd)

    @staticmethod
    def _check_spades_log_file(logfile):
        '''SPAdes can fail with a strange error. Stop everything if this happens'''
        f = pyfastaq.utils.open_file_read(logfile)

        for line in f:
            if line.startswith('== Error ==  system call for:') and line.rstrip().endswith('finished abnormally, err code: -7'):
                pyfastaq.utils.close(f)
                print('Error running SPAdes. Cannot continue. This is the error from the log file', logfile, '...', file=sys.stderr)
                print(line, file=sys.stderr)
                raise Error('Fatal error ("err code: -7") running spades. Cannot continue')

        pyfastaq.utils.close(f)
        return True

    def _assemble_with_spades(self):
        cwd = os.getcwd()
        self.assembled_ok = False
        try:
            try:
                os.chdir(self.working_dir)
            except:
                raise Error('Error chdir ' + self.working_dir)
            spades_exe = self.extern_progs.exe('spades')
            if not spades_exe:
                raise Error("Spades executable has not been found")
            spades_options = self.spades_options
            if spades_options is not None:
                spades_options = shlex.split(self.spades_options)
            if self.spades_mode == "rna":
                spades_options = ["--rna"] + (["-k","127"] if spades_options is None else spades_options)
                spades_out_seq_base = "transcripts.fasta"
            elif self.spades_mode == "sc":
                spades_options = ["--sc"] + (["-k", "33,55,77,99,127","--careful"] if spades_options is None else spades_options)
                spades_out_seq_base = "contigs.fasta"
            elif self.spades_mode == "wgs":
                spades_options = ["-k", "33,55,77,99,127","--careful"] if spades_options is None else spades_options
                spades_out_seq_base = "contigs.fasta"
            else:
                raise ValueError("Unknown spades_mode value: {}".format(self.spades_mode))
            asm_cmd = ['python3', spades_exe, "-t", str(self.threads), "--pe1-1", self.reads1, "--pe1-2", self.reads2, "-o", self.assembler_dir] + \
                spades_options
            asm_ok,err = common.syscall(asm_cmd, verbose=True, verbose_filehandle=self.log_fh, shell=False, allow_fail=True)
            if not asm_ok:
                print('Assembly finished with errors. These are the errors:', file=self.log_fh)
                print(err, file=self.log_fh)
                print('\nEnd of spades errors\n', file=self.log_fh)
            else:

                spades_log = os.path.join(self.assembler_dir, 'spades.log')
                if os.path.exists(spades_log):
                    self._check_spades_log_file(spades_log)

                    with open(spades_log) as f:
                        print('\n______________ SPAdes log ___________________\n', file=self.log_fh)
                        for line in f:
                            print(line.rstrip(), file=self.log_fh)
                        print('\n______________ End of SPAdes log _________________\n', file=self.log_fh)

                spades_warnings = os.path.join(self.assembler_dir, 'warnings.log')
                if os.path.exists(spades_warnings):
                    with open(spades_warnings) as f:
                        print('\n______________ SPAdes warnings ___________________\n', file=self.log_fh)
                        for line in f:
                            print(line.rstrip(), file=self.log_fh)
                        print('\n______________ End of SPAdes warnings _________________\n', file=self.log_fh)

                ## fermilight module generates contig names that look like `cluster_1.l15.c17.ctg.1` where 'cluster_1'==self.contig_name_prefix
                ## the whole structure of the contig name is expected in several places downstream where it is parsed into individual components.
                ## For example, it is parsed into to l and c parts in ref_seq_chooser (although the parts are not actually used).
                ## This is the code from fermilight module that generates the contig ID string:
                ## ofs << ">" << namePrefix << ".l" << overlap << ".c" << minCount << ".ctg." << i + 1 << '\n'
                ##
                ## We generate the same contig name structure here using dummy values for overlap and minCount, in order
                ## to avoid distrupting the downstream code.
                ## Note that the fermilight module generates multiple versions of the assembly on a grid of l and c values,
                ## and ref_seq_chooser then picks a single "best" (l,c) version based on coverage/identity of the nucmer
                ## alignment to the reference. Spades generates a single version of the assembly, so ref_seq_chooser
                ## can only pick that one version.

                spades_out_seq = os.path.join(self.assembler_dir,spades_out_seq_base)
                ## No need really to use general-purpose pyfastaq.sequences.file_reader here and pay performance cost for
                ## its multi-format line tests since we are only replacing the IDs in a pre-defined format
                if os.path.exists(spades_out_seq):
                    with open(spades_out_seq,"r") as inp, open(self.all_assembly_contigs_fa,"w") as out:
                        pref = self.contig_name_prefix
                        i_cont = 0
                        for line in inp:
                            if line.startswith(">"):
                                i_cont += 1
                                line = ">{}.l15.c17.ctg.{}\n".format(pref,i_cont)
                            out.write(line)
                        if i_cont > 0:
                            self.assembled_ok = True
            if self.clean:
                print('Deleting assembly directory', self.assembler_dir, file=self.log_fh)
                common.rmtree(self.assembler_dir)
        finally:
            os.chdir(cwd)


    @staticmethod
    def _fix_contig_orientation(contigs_fa, ref_fa, outfile, min_id=90, min_length=20, breaklen=200):
        '''Changes orientation of each contig to match the reference, when possible.
           Returns a set of names of contigs that had hits in both orientations to the reference'''
        if not os.path.exists(contigs_fa):
            raise Error('Cannot fix orientation of assembly contigs because file not found: ' + contigs_fa)

        tmp_coords = os.path.join(outfile + '.tmp.rename.coords')
        pymummer.nucmer.Runner(
            ref_fa,
            contigs_fa,
            tmp_coords,
            min_id=min_id,
            min_length=min_length,
            breaklen=breaklen,
            maxmatch=True,
        ).run()

        to_revcomp = set()
        not_revcomp = set()
        file_reader = pymummer.coords_file.reader(tmp_coords)
        for hit in file_reader:
            if hit.on_same_strand():
                not_revcomp.add(hit.qry_name)
            else:
                to_revcomp.add(hit.qry_name)

        os.unlink(tmp_coords)
        in_both = to_revcomp.intersection(not_revcomp)

        f = pyfastaq.utils.open_file_write(outfile)
        seq_reader = pyfastaq.sequences.file_reader(contigs_fa)
        for seq in seq_reader:
            if seq.id in to_revcomp and seq.id not in in_both:
                seq.revcomp()
            print(seq, file=f)
        pyfastaq.utils.close(f)

        return in_both


    @staticmethod
    def _parse_bam(sequences, bam, min_scaff_depth, max_insert):
        if not os.path.exists(bam):
            raise Error('File not found: ' + bam)

        bam_parser = bam_parse.Parser(bam, sequences)
        bam_parser.parse()
        bam_parser.write_files(bam)
        return bam_parser.scaff_graph_is_consistent(min_scaff_depth, max_insert)


    def run(self):
        if self.assembler == 'fermilite':
            self._assemble_with_fermilite()
        elif self.assembler == "spades":
            self._assemble_with_spades()
        print('Finished running assemblies', flush=True, file=self.log_fh)
        self.sequences = {}

        # double-check we got some contigs
        number_of_contigs = pyfastaq.tasks.count_sequences(self.all_assembly_contigs_fa) if os.path.exists(self.all_assembly_contigs_fa) else 0
        if number_of_contigs == 0:
            self.assembled_ok = False
            # This is to make this object picklable, to keep multithreading happy
            self.log_fh = None
            return
        else:
            self.assembled_ok = True

        if self.assembled_ok:
            ref_chooser = ref_seq_chooser.RefSeqChooser(
                self.ref_fastas,
                self.all_reference_fasta,
                self.all_assembly_contigs_fa,
                self.best_assembly_fa,
                self.log_fh,
                nucmer_min_id=self.nucmer_min_id,
                nucmer_min_len=self.nucmer_min_len,
                nucmer_breaklen=self.nucmer_breaklen,
            )
            ref_chooser.run()

            if ref_chooser.closest_ref_from_all_refs is None:
                print('Could not find match to reference sequences', file=self.log_fh)
                self.ref_seq_name = None
                self.log_fh = None
                return
            elif not ref_chooser.closest_ref_is_in_cluster:
                print('Closest reference', ref_chooser.closest_ref_from_all_refs, 'was not in cluster', file=self.log_fh)
                self.ref_seq_name = None
                self.log_fh = None
                return
            else:
                assert ref_chooser.closest_ref_from_all_refs is not None
                self.ref_seq_name = ref_chooser.closest_ref_from_all_refs

            print('Closest reference sequence:', self.ref_seq_name, file=self.log_fh)

            file_reader = pyfastaq.sequences.file_reader(self.ref_fastas)
            for ref_seq in file_reader:
                if self.ref_seq_name == ref_seq.id:
                    f_out = pyfastaq.utils.open_file_write(self.ref_fasta)
                    print(ref_seq, file=f_out)
                    pyfastaq.utils.close(f_out)
                    break

            contigs_both_strands = self._fix_contig_orientation(self.best_assembly_fa, self.ref_fasta, self.final_assembly_fa, min_id=self.nucmer_min_id, min_length=self.nucmer_min_len, breaklen=self.nucmer_breaklen)
            self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
            pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

            mapping.run_bowtie2(
                self.reads1,
                self.reads2,
                self.final_assembly_fa,
                self.final_assembly_bam[:-4],
                threads=self.threads,
                sort=True,
                bowtie2=self.extern_progs.exe('bowtie2'),
                bowtie2_version=self.extern_progs.version('bowtie2'),
                verbose=True,
                verbose_filehandle=self.log_fh
            )

            self.scaff_graph_ok = self._parse_bam(self.sequences, self.final_assembly_bam, self.min_scaff_depth, self.max_insert)
            print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)

            if self.clean:
                for suffix in ['soft_clipped', 'unmapped_mates', 'scaff']:
                    filename = self.final_assembly_bam + '.' + suffix
                    print('Deleting file', filename, file=self.log_fh)
                    os.unlink(filename)


        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None