Updated version 2.5.1+ds from 'upstream/2.5.1+ds'
with Debian dir 7cd9ceeb23c31014e13f838f1e0a37e23df8920d
Sascha Steinbiss
7 years ago
92 | 92 | |
93 | 93 | |
94 | 94 | @classmethod |
95 | def _best_of_two_hits(cls, hit1, hit2, use_qry_length=False): | |
95 | def _best_of_two_hits(cls, hit1, hit2, use_qry_length=False, check_flanking=False): | |
96 | 96 | if use_qry_length: |
97 | 97 | qry_length_percent1 = hit1.hit_length_qry / hit1.qry_length |
98 | 98 | qry_length_percent2 = hit2.hit_length_qry / hit2.qry_length |
106 | 106 | elif hit1.percent_identity != hit2.percent_identity: |
107 | 107 | return hit1 if hit1.percent_identity > hit2.percent_identity else hit2 |
108 | 108 | else: |
109 | if check_flanking: | |
110 | flank1 = min(min(hit1.qry_start, hit1.qry_end), hit1.qry_length - 1 - max(hit1.qry_start, hit1.qry_end)) | |
111 | flank2 = min(min(hit2.qry_start, hit2.qry_end), hit2.qry_length - 1 - max(hit2.qry_start, hit2.qry_end)) | |
112 | if flank1 != flank2: | |
113 | return hit1 if flank1 > flank2 else hit2 | |
109 | 114 | l1, c1 = RefSeqChooser._l_and_c_from_contig_name(hit1.qry_name) |
110 | 115 | l2, c2 = RefSeqChooser._l_and_c_from_contig_name(hit2.qry_name) |
111 | 116 | if l1 != l2: |
115 | 120 | |
116 | 121 | |
117 | 122 | @classmethod |
118 | def _choose_best_nucmer_match(cls, matches, use_qry_length=False): | |
123 | def _choose_best_nucmer_match(cls, matches, use_qry_length=False, check_flanking=False): | |
119 | 124 | best_match = None |
120 | 125 | for ref_name in matches: |
121 | 126 | for hit in matches[ref_name]: |
122 | 127 | if best_match is None: |
123 | 128 | best_match = hit |
124 | 129 | else: |
125 | best_match = RefSeqChooser._best_of_two_hits(best_match, hit, use_qry_length=use_qry_length) | |
130 | best_match = RefSeqChooser._best_of_two_hits(best_match, hit, use_qry_length=use_qry_length, check_flanking=check_flanking) | |
126 | 131 | |
127 | 132 | return best_match |
128 | 133 | |
129 | 134 | |
130 | 135 | @classmethod |
131 | def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen, use_qry_length): | |
136 | def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen, use_qry_length, check_flanking): | |
132 | 137 | tmpdir = tempfile.mkdtemp(prefix='tmp.closest_nucmer_match.', dir=os.getcwd()) |
133 | 138 | coords_file = os.path.join(tmpdir, 'nucmer_vs_cluster_refs.coords') |
134 | 139 | pymummer.nucmer.Runner( |
146 | 151 | if len(nucmer_matches) == 0: |
147 | 152 | return None, {} |
148 | 153 | else: |
149 | best_hit = RefSeqChooser._choose_best_nucmer_match(nucmer_matches, use_qry_length=use_qry_length) | |
154 | best_hit = RefSeqChooser._choose_best_nucmer_match(nucmer_matches, use_qry_length=use_qry_length, check_flanking=check_flanking) | |
150 | 155 | return best_hit, nucmer_matches |
151 | 156 | |
152 | 157 | |
153 | 158 | def run(self): |
154 | 159 | print('Looking for closest match from sequences within cluster', file=self.log_fh) |
155 | best_hit_from_cluster, nucmer_matches = RefSeqChooser._closest_nucmer_match_between_fastas(self.cluster_fasta, self.assembly_fasta_in, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, False) | |
160 | best_hit_from_cluster, nucmer_matches = RefSeqChooser._closest_nucmer_match_between_fastas(self.cluster_fasta, self.assembly_fasta_in, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, False, True) | |
156 | 161 | if best_hit_from_cluster is None: |
157 | 162 | return |
158 | 163 | |
165 | 170 | RefSeqChooser._make_matching_contig_pieces_fasta(self.assembly_fasta_in, pieces_coords, pieces_fasta_file) |
166 | 171 | |
167 | 172 | print('Checking for a better match to a ref sequence outside the cluster', file=self.log_fh) |
168 | best_hit_from_all_seqs, not_needed = RefSeqChooser._closest_nucmer_match_between_fastas(self.all_refs_fasta, pieces_fasta_file, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, True) | |
173 | best_hit_from_all_seqs, not_needed = RefSeqChooser._closest_nucmer_match_between_fastas(self.all_refs_fasta, pieces_fasta_file, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, True, False) | |
169 | 174 | shutil.rmtree(tmpdir) |
170 | 175 | self.closest_ref_from_all_refs = best_hit_from_all_seqs.ref_name |
171 | 176 | if self.closest_ref_from_all_refs is None: |
276 | 276 | c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=112, total_reads_bases=1080) |
277 | 277 | c.run() |
278 | 278 | expected = [ |
279 | 'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t364\t27.0\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene' | |
279 | 'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene' | |
280 | 280 | ] |
281 | 281 | self.assertEqual(expected, c.report_lines) |
282 | 282 | shutil.rmtree(tmpdir) |
489 | 489 | c.run() |
490 | 490 | |
491 | 491 | expected = [ |
492 | 'presence_absence1\tpresence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.l15.c30.ctg.1\t807\t22.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1' | |
492 | 'presence_absence1\tpresence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.l15.c17.ctg.1\t949\t20.5\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1' | |
493 | 493 | ] |
494 | 494 | self.assertEqual(expected, c.report_lines) |
495 | 495 | shutil.rmtree(tmpdir) |
0 | >ref1 | |
1 | GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG | |
2 | AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG | |
3 | ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA | |
4 | TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG | |
5 | AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC | |
6 | ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA | |
7 | ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA | |
8 | TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC | |
9 | CAGATCTCAATCTGCAGCTA | |
10 | >ref2 | |
11 | CACGCGTCGTGGCCAACCACGCGTTCGTTGGCAGATGCCTTTACGATCACTACCCAAAAT | |
12 | AAAGAGCAGTGTGTGTATGTTACCTAACTACGTAGTAAGCGCTAGAGTAGGCAGTGGCCT | |
13 | AAGTGACACCTGTTCCGTGTTGCCCTGGCAGCAGCACACCGCATTCTAAGGACCGTCGCG | |
14 | TCGTATTCTTCCAGCTAAATCACCCTAAGTGCTATAATTTGGAGGAGTGAAGAGTTTGAT | |
15 | GCCAAGCTGACGTCAGGCGGGGATTGCCATTGATCTTGGCTCTCAGCCAGAGAAAGTACA | |
16 | TAACAGGAAAATTCAGCCCTTGGGTCTGTGCTCAACGATGGTTTGGAGACTCCTAGAATA | |
17 | ATAGCACCTCAGGGACCTTTTCCTAGGAACTGTCCACGGTCGCCACGACTGGAGCTGAAA | |
18 | TTTAGTACACAGAGCACCGCCTGTAGATTGCTCCTCGGTCCGGCTGTCTATAGACCGTCA | |
19 | CAGAATTCTAGAGCAACCGT |
0 | >ref1 | |
1 | GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG | |
2 | AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG | |
3 | ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA | |
4 | TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG | |
5 | AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC | |
6 | ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA | |
7 | ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA | |
8 | TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC | |
9 | CAGATCTCAATCTGCAGCTA |
0 | >cluster.l15.c17.ctg.1 | |
1 | ATCATCATCTGACTGATCGTACGTACGTGTCGTCAGTCAGCTAGCTGTCAGTAAGAAAAC | |
2 | GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG | |
3 | AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG | |
4 | ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA | |
5 | TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG | |
6 | AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC | |
7 | ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA | |
8 | ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA | |
9 | TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC | |
10 | CAGATCTCAATCTGTACCTA | |
11 | >cluster.l6.c4.ctg.1 | |
12 | ATCATCATCTGACTGATCGTACGTACGTGTCGTCAGTCAGCTAGCTGTCAGTAAGAAAAC | |
13 | GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG | |
14 | AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG | |
15 | ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA | |
16 | TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG | |
17 | AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC | |
18 | ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA | |
19 | ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA | |
20 | TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC | |
21 | CAGATCTCAATCTGTACCTACTGACGTATCATCTGCGTACTGCGTCGTATGCATGAAAAC |
0 | >cluster.l6.c4.ctg.1 | |
1 | ATCATCATCTGACTGATCGTACGTACGTGTCGTCAGTCAGCTAGCTGTCAGTAAGAAAAC | |
2 | GTCGCTCCTATGCGCTGGCACGTTCACACCTTTACGACAACCAGTAAGGATGCTTGGGCG | |
3 | AATCCCCTTCCCCCTTCTGGTAGTTTTCATTATGCTCAGCGTAACTGAGTCTACCAGGAG | |
4 | ACCTTGGACGGACGGTGAATCCGCATAGCGCACCCATAAGTAGGAGATAAGGTTACTGGA | |
5 | TTGTTCGCTGAAGAAGACAATCAAGGGGAGGTCTATTTGTTTATAGTGACACTACAAGGG | |
6 | AGGTGATGTTGGCCTGCTGGAAGGTTTTGAAAGAAGCGGGTGCTAGCCTGGCGACTCTTC | |
7 | ATCCATTTCAATGATTTCGGGGCTCCACTTATTTCCGAATCGGCTCCTGGGGTAGCCCTA | |
8 | ACTCCATGATCCACCTCGAATCGAACCGCCAGCAATTTCAGAGTATAGAACTAGACAGGA | |
9 | TTTACTGCCAGAGCTATCGAATATCATCGGAACGGGGACTTCGCGCACCATTGGACAAGC | |
10 | CAGATCTCAATCTGTACCTACTGACGTATCATCTGCGTACTGCGTCGTATGCATGAAAAC |
95 | 95 | self.assertTrue(os.path.exists(tmp_out)) |
96 | 96 | os.unlink(tmp_out) |
97 | 97 | |
98 | ||
99 | def test_run_flanking_different(self): | |
100 | '''Test full run where amount of flanking seq varies''' | |
101 | all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.all_refs.fa') | |
102 | cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.cluster_refs.fa') | |
103 | contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.contigs.fa') | |
104 | expected_fa = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.expected_contigs.fa') | |
105 | tmp_out = 'tmp.ref_seq_chooser_test_flanking.fa' | |
106 | refchooser = ref_seq_chooser.RefSeqChooser(cluster_fasta, all_ref_fasta, contig_fasta, tmp_out, sys.stdout) | |
107 | refchooser.run() | |
108 | self.assertEqual('ref1', refchooser.closest_ref_from_all_refs) | |
109 | self.assertTrue(refchooser.closest_ref_is_in_cluster) | |
110 | self.assertTrue(filecmp.cmp(expected_fa, tmp_out, shallow=False)) | |
111 | os.unlink(tmp_out) |