Codebase list ariba / b119ad2
Updated version 2.5.1+ds from 'upstream/2.5.1+ds' with Debian dir 7cd9ceeb23c31014e13f838f1e0a37e23df8920d Sascha Steinbiss 7 years ago
8 changed file(s) with 92 addition(s) and 10 deletion(s). Raw diff Collapse all Expand all
9292
9393
9494 @classmethod
95 def _best_of_two_hits(cls, hit1, hit2, use_qry_length=False):
95 def _best_of_two_hits(cls, hit1, hit2, use_qry_length=False, check_flanking=False):
9696 if use_qry_length:
9797 qry_length_percent1 = hit1.hit_length_qry / hit1.qry_length
9898 qry_length_percent2 = hit2.hit_length_qry / hit2.qry_length
106106 elif hit1.percent_identity != hit2.percent_identity:
107107 return hit1 if hit1.percent_identity > hit2.percent_identity else hit2
108108 else:
109 if check_flanking:
110 flank1 = min(min(hit1.qry_start, hit1.qry_end), hit1.qry_length - 1 - max(hit1.qry_start, hit1.qry_end))
111 flank2 = min(min(hit2.qry_start, hit2.qry_end), hit2.qry_length - 1 - max(hit2.qry_start, hit2.qry_end))
112 if flank1 != flank2:
113 return hit1 if flank1 > flank2 else hit2
109114 l1, c1 = RefSeqChooser._l_and_c_from_contig_name(hit1.qry_name)
110115 l2, c2 = RefSeqChooser._l_and_c_from_contig_name(hit2.qry_name)
111116 if l1 != l2:
115120
116121
117122 @classmethod
118 def _choose_best_nucmer_match(cls, matches, use_qry_length=False):
123 def _choose_best_nucmer_match(cls, matches, use_qry_length=False, check_flanking=False):
119124 best_match = None
120125 for ref_name in matches:
121126 for hit in matches[ref_name]:
122127 if best_match is None:
123128 best_match = hit
124129 else:
125 best_match = RefSeqChooser._best_of_two_hits(best_match, hit, use_qry_length=use_qry_length)
130 best_match = RefSeqChooser._best_of_two_hits(best_match, hit, use_qry_length=use_qry_length, check_flanking=check_flanking)
126131
127132 return best_match
128133
129134
130135 @classmethod
131 def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen, use_qry_length):
136 def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen, use_qry_length, check_flanking):
132137 tmpdir = tempfile.mkdtemp(prefix='tmp.closest_nucmer_match.', dir=os.getcwd())
133138 coords_file = os.path.join(tmpdir, 'nucmer_vs_cluster_refs.coords')
134139 pymummer.nucmer.Runner(
146151 if len(nucmer_matches) == 0:
147152 return None, {}
148153 else:
149 best_hit = RefSeqChooser._choose_best_nucmer_match(nucmer_matches, use_qry_length=use_qry_length)
154 best_hit = RefSeqChooser._choose_best_nucmer_match(nucmer_matches, use_qry_length=use_qry_length, check_flanking=check_flanking)
150155 return best_hit, nucmer_matches
151156
152157
153158 def run(self):
154159 print('Looking for closest match from sequences within cluster', file=self.log_fh)
155 best_hit_from_cluster, nucmer_matches = RefSeqChooser._closest_nucmer_match_between_fastas(self.cluster_fasta, self.assembly_fasta_in, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, False)
160 best_hit_from_cluster, nucmer_matches = RefSeqChooser._closest_nucmer_match_between_fastas(self.cluster_fasta, self.assembly_fasta_in, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, False, True)
156161 if best_hit_from_cluster is None:
157162 return
158163
165170 RefSeqChooser._make_matching_contig_pieces_fasta(self.assembly_fasta_in, pieces_coords, pieces_fasta_file)
166171
167172 print('Checking for a better match to a ref sequence outside the cluster', file=self.log_fh)
168 best_hit_from_all_seqs, not_needed = RefSeqChooser._closest_nucmer_match_between_fastas(self.all_refs_fasta, pieces_fasta_file, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, True)
173 best_hit_from_all_seqs, not_needed = RefSeqChooser._closest_nucmer_match_between_fastas(self.all_refs_fasta, pieces_fasta_file, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, True, False)
169174 shutil.rmtree(tmpdir)
170175 self.closest_ref_from_all_refs = best_hit_from_all_seqs.ref_name
171176 if self.closest_ref_from_all_refs is None:
276276 c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=112, total_reads_bases=1080)
277277 c.run()
278278 expected = [
279 'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t364\t27.0\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene'
279 'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene'
280280 ]
281281 self.assertEqual(expected, c.report_lines)
282282 shutil.rmtree(tmpdir)
489489 c.run()
490490
491491 expected = [
492 'presence_absence1\tpresence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.l15.c30.ctg.1\t807\t22.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1'
492 'presence_absence1\tpresence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.l15.c17.ctg.1\t949\t20.5\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1'
493493 ]
494494 self.assertEqual(expected, c.report_lines)
495495 shutil.rmtree(tmpdir)
0 >ref1
1 GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG
2 AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG
3 ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA
4 TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG
5 AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC
6 ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA
7 ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA
8 TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC
9 CAGATCTCAATCTGCAGCTA
10 >ref2
11 CACGCGTCGTGGCCAACCACGCGTTCGTTGGCAGATGCCTTTACGATCACTACCCAAAAT
12 AAAGAGCAGTGTGTGTATGTTACCTAACTACGTAGTAAGCGCTAGAGTAGGCAGTGGCCT
13 AAGTGACACCTGTTCCGTGTTGCCCTGGCAGCAGCACACCGCATTCTAAGGACCGTCGCG
14 TCGTATTCTTCCAGCTAAATCACCCTAAGTGCTATAATTTGGAGGAGTGAAGAGTTTGAT
15 GCCAAGCTGACGTCAGGCGGGGATTGCCATTGATCTTGGCTCTCAGCCAGAGAAAGTACA
16 TAACAGGAAAATTCAGCCCTTGGGTCTGTGCTCAACGATGGTTTGGAGACTCCTAGAATA
17 ATAGCACCTCAGGGACCTTTTCCTAGGAACTGTCCACGGTCGCCACGACTGGAGCTGAAA
18 TTTAGTACACAGAGCACCGCCTGTAGATTGCTCCTCGGTCCGGCTGTCTATAGACCGTCA
19 CAGAATTCTAGAGCAACCGT
0 >ref1
1 GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG
2 AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG
3 ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA
4 TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG
5 AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC
6 ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA
7 ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA
8 TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC
9 CAGATCTCAATCTGCAGCTA
0 >cluster.l15.c17.ctg.1
1 ATCATCATCTGACTGATCGTACGTACGTGTCGTCAGTCAGCTAGCTGTCAGTAAGAAAAC
2 GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG
3 AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG
4 ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA
5 TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG
6 AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC
7 ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA
8 ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA
9 TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC
10 CAGATCTCAATCTGTACCTA
11 >cluster.l6.c4.ctg.1
12 ATCATCATCTGACTGATCGTACGTACGTGTCGTCAGTCAGCTAGCTGTCAGTAAGAAAAC
13 GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG
14 AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG
15 ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA
16 TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG
17 AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC
18 ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA
19 ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA
20 TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC
21 CAGATCTCAATCTGTACCTACTGACGTATCATCTGCGTACTGCGTCGTATGCATGAAAAC
0 >cluster.l6.c4.ctg.1
1 ATCATCATCTGACTGATCGTACGTACGTGTCGTCAGTCAGCTAGCTGTCAGTAAGAAAAC
2 GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG
3 AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG
4 ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA
5 TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG
6 AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC
7 ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA
8 ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA
9 TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC
10 CAGATCTCAATCTGTACCTACTGACGTATCATCTGCGTACTGCGTCGTATGCATGAAAAC
9595 self.assertTrue(os.path.exists(tmp_out))
9696 os.unlink(tmp_out)
9797
98
99 def test_run_flanking_different(self):
100 '''Test full run where amount of flanking seq varies'''
101 all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.all_refs.fa')
102 cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.cluster_refs.fa')
103 contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.contigs.fa')
104 expected_fa = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.expected_contigs.fa')
105 tmp_out = 'tmp.ref_seq_chooser_test_flanking.fa'
106 refchooser = ref_seq_chooser.RefSeqChooser(cluster_fasta, all_ref_fasta, contig_fasta, tmp_out, sys.stdout)
107 refchooser.run()
108 self.assertEqual('ref1', refchooser.closest_ref_from_all_refs)
109 self.assertTrue(refchooser.closest_ref_is_in_cluster)
110 self.assertTrue(filecmp.cmp(expected_fa, tmp_out, shallow=False))
111 os.unlink(tmp_out)
5454 setup(
5555 ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
5656 name='ariba',
57 version='2.5.0',
57 version='2.5.1',
5858 description='ARIBA: Antibiotic Resistance Identification By Assembly',
5959 packages = find_packages(),
6060 package_data={'ariba': ['test_run_data/*']},