Codebase list ariba / 5d007b8
Updated version 2.5.0+ds from 'upstream/2.5.0+ds' with Debian dir b40ed438f8e615c28caf63373cf778554f382ead — Sascha Steinbiss, 7 years ago
12 changed file(s) with 105 addition(s) and 14 deletion(s). [Raw diff] [Collapse all] [Expand all]
5151 min_length=self.nucmer_min_len,
5252 breaklen=self.nucmer_breaklen,
5353 maxmatch=True,
54 show_snps=True
54 show_snps=True,
55 show_snps_C=False,
5556 ).run()
5657
5758
1212 infile,
1313 seq_identity_threshold=0.9,
1414 threads=1,
15 length_diff_cutoff=0.9,
15 length_diff_cutoff=0.0,
1616 verbose=False,
1717 min_cluster_number=0
1818 ):
114114 std::cerr << "[ariba_minimap] Error indexing" << std::endl;
115115 return 1;
116116 }
117
118 // This sets the -f option of minimap:
119 // -f FLOAT filter out top FLOAT fraction of repetitive minimizers [0.001]
120 // Needed so that reads map to sequences from large clusters.
121 mm_idx_set_max_occ(mi, 0.000001);
117122
118123 // mapping
119124 mm_mapopt_t opt;
1818 max_gene_length=10000,
1919 genetic_code=11,
2020 cdhit_min_id=0.9,
21 cdhit_min_length=0.9,
21 cdhit_min_length=0.0,
2222 run_cdhit=True,
2323 clusters_file=None,
2424 threads=1,
9292
9393
9494 @classmethod
95 def _best_of_two_hits(cls, hit1, hit2):
95 def _best_of_two_hits(cls, hit1, hit2, use_qry_length=False):
96 if use_qry_length:
97 qry_length_percent1 = hit1.hit_length_qry / hit1.qry_length
98 qry_length_percent2 = hit2.hit_length_qry / hit2.qry_length
99 if qry_length_percent1 != qry_length_percent2:
100 return hit1 if qry_length_percent1 > qry_length_percent2 else hit2
101
96102 ref_length_percent1 = hit1.hit_length_ref / hit1.ref_length
97103 ref_length_percent2 = hit2.hit_length_ref / hit2.ref_length
98104 if ref_length_percent1 != ref_length_percent2:
109115
110116
111117 @classmethod
112 def _choose_best_nucmer_match(cls, matches):
118 def _choose_best_nucmer_match(cls, matches, use_qry_length=False):
113119 best_match = None
114120 for ref_name in matches:
115121 for hit in matches[ref_name]:
116122 if best_match is None:
117123 best_match = hit
118124 else:
119 best_match = RefSeqChooser._best_of_two_hits(best_match, hit)
125 best_match = RefSeqChooser._best_of_two_hits(best_match, hit, use_qry_length=use_qry_length)
120126
121127 return best_match
122128
123129
124130 @classmethod
125 def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen):
131 def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen, use_qry_length):
126132 tmpdir = tempfile.mkdtemp(prefix='tmp.closest_nucmer_match.', dir=os.getcwd())
127133 coords_file = os.path.join(tmpdir, 'nucmer_vs_cluster_refs.coords')
128134 pymummer.nucmer.Runner(
140146 if len(nucmer_matches) == 0:
141147 return None, {}
142148 else:
143 best_hit = RefSeqChooser._choose_best_nucmer_match(nucmer_matches)
149 best_hit = RefSeqChooser._choose_best_nucmer_match(nucmer_matches, use_qry_length=use_qry_length)
144150 return best_hit, nucmer_matches
145151
146152
147153 def run(self):
148154 print('Looking for closest match from sequences within cluster', file=self.log_fh)
149 best_hit_from_cluster, nucmer_matches = RefSeqChooser._closest_nucmer_match_between_fastas(self.cluster_fasta, self.assembly_fasta_in, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen)
155 best_hit_from_cluster, nucmer_matches = RefSeqChooser._closest_nucmer_match_between_fastas(self.cluster_fasta, self.assembly_fasta_in, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, False)
150156 if best_hit_from_cluster is None:
151157 return
152158
159165 RefSeqChooser._make_matching_contig_pieces_fasta(self.assembly_fasta_in, pieces_coords, pieces_fasta_file)
160166
161167 print('Checking for a better match to a ref sequence outside the cluster', file=self.log_fh)
162 best_hit_from_all_seqs, not_needed = RefSeqChooser._closest_nucmer_match_between_fastas(self.all_refs_fasta, pieces_fasta_file, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen)
168 best_hit_from_all_seqs, not_needed = RefSeqChooser._closest_nucmer_match_between_fastas(self.all_refs_fasta, pieces_fasta_file, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, True)
163169 shutil.rmtree(tmpdir)
164170 self.closest_ref_from_all_refs = best_hit_from_all_seqs.ref_name
165171 if self.closest_ref_from_all_refs is None:
418418 pyfastaq.utils.close(f_out)
419419
420420
421 def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, nocluster=False, verbose=False, clusters_file=None):
421 def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, nocluster=False, verbose=False, clusters_file=None):
422422 clusters = {}
423423 ReferenceData._write_sequences_to_files(self.sequences, self.metadata, outprefix)
424424 ref_types = ('noncoding', 'noncoding.varonly', 'gene', 'gene.varonly')
0 >ref1
1 TCTCATTCTGGCGTCGATACGGTATGACTATAAGCGGACAGTCTTGCAGAGCGGAAGTAA
2 TGCGAGGACTAGCCGCATTCTAGGGTCAGAGCACCTCGCAAAGCAATCGAGAGTGCATGA
3 GTATTGTTCCGCAAGTGTCTTGCAAATACCTTGCTGCTATAAGAGGCGAGGGTACGGAGG
4 CGCGAGAGCTTCGTGAGGCGGGGGCATCATCACGTTATCACTACTACTGCTAGTGTCCGG
5 >ref2
6 ATTTTCAGACGCCCACATCAGGAGCTATTACATGCTGCCAGGGACCTTTACCCTTACCTA
7 GAAAAGCCCCCATGAAATATGGATTGCACCTGATCAAAGATCATCGCCTTCAGGGAACTG
8 TCTCATTCTGGCGTCGATACGGTATGACTATAAGCGGACAGTCTTGCAGAGCGGAAGTAA
9 TGCGAGGACTAGCCGCATTCTAGGGTCAGAGCACCTCGCAAAGCAATCGAGAGTGCATGA
10 ATATTGTTCCGCAAGTGTCTTGCAAATACCTTGCTGCTATAAGAGGCGAGGGTACGGAGG
11 CGCGAGAGCTTCGTGAGGCGGGGGCATCATCACGTTATCACTACTACTGCTAGTGTCCGG
12 TGTCTAGCCACCGACAAGCACGCATTCTGTACTAGTACGGTTTGAGTGTATAACGCAAAA
13 CTTAGCGTAGCACTGGCATGTTCTCCAAGTTTTAGATCCGTCATAGACCCAACCCAAGCG
14 CCATTAGCTTTTTATATTAG
15 >ref3
16 GGTCGTCACACGACAAGGACGCACACGCTGAGGGAACGAACTTCCTTAAGGTGGGACTTA
17 TTCTACACTCACTATCTGTAAACGGCAGCTGGAAATTTTGTGCCGCCGATTCAGCTCTCG
18 ATTGCACAGGGCCAAGCGATGGAGGGCTTAGATAAATCGACCATGTCAGTATCTGAGTTG
19 GCGGTTCCGTAACCCGACAGTCCCAGTCAAAACTGTAAGTGGCCCCATTCTAGAGAAGTT
20 TCATGACTAGTCACAGCAAACTGTGTTTCCAAACGAATAGCTTCATCGTAGCAACTCGCT
21 TCAAGGAGTAATGTTAACTTACGTACATTCAAGCGTACGAGGGTGAGGTTGGGTAGGAAA
22 GTGGGGTACTCGGGTAGATCGCGCGCCCCCGGGTTTACCGCAAGTGGGAAAATTGTAAAG
23 AAAATGATCGCATCGCTATATTGGACAACGGTGAGACAGGATACCTGTGGACGAGCAGGC
24 AAGTAGGTGAATCGAAGACG
25 >ref4
26 CAAGGAGTATCATGTAGGCCAGCGGCGGACCCTGACTTGTGATTACTTATGTTGTGCAGT
27 AACTTCGAGAACATTGTATCTATCAACTTCGTTATTTCACCCTCATTAGGTTATTGTCAC
28 ACCGAAGTCCCTGGGACTTCACCGGCATTCGTTAGAGCGGCGTGCGGTCCCGCTGCCCAC
29 CCCCCCCCGCTTCAGGGGAGCATGCCTATACTCTCAGACATGCCCAATTCCAGTGGCTCC
30 ATCGATGCTCGTGCTGATACTGTTGTGTGGCGCGATATTTCCCCGGCGTGAAAGAGCGGA
31 GGCTCAACCGCACCAAATGTTTTAATCCGTCGGGGCGCAGTTGCAGCTCGAGATGGATCC
32 ACTACTCAACTCAAAAGATGCCCCGCTACCAGGCACTATGGAGGTATTCGTGAACGCTTG
33 CGTATTGGTAGATCAATACACACTTTACGCGGGTTAGTGTAGAAGACACGATCGCAGTAA
34 GGCACCGAAGGCAGTGTTCC
0 >ref2
1 ATTTTCAGACGCCCACATCAGGAGCTATTACATGCTGCCAGGGACCTTTACCCTTACCTA
2 GAAAAGCCCCCATGAAATATGGATTGCACCTGATCAAAGATCATCGCCTTCAGGGAACTG
3 TCTCATTCTGGCGTCGATACGGTATGACTATAAGCGGACAGTCTTGCAGAGCGGAAGTAA
4 TGCGAGGACTAGCCGCATTCTAGGGTCAGAGCACCTCGCAAAGCAATCGAGAGTGCATGA
5 ATATTGTTCCGCAAGTGTCTTGCAAATACCTTGCTGCTATAAGAGGCGAGGGTACGGAGG
6 CGCGAGAGCTTCGTGAGGCGGGGGCATCATCACGTTATCACTACTACTGCTAGTGTCCGG
7 TGTCTAGCCACCGACAAGCACGCATTCTGTACTAGTACGGTTTGAGTGTATAACGCAAAA
8 CTTAGCGTAGCACTGGCATGTTCTCCAAGTTTTAGATCCGTCATAGACCCAACCCAAGCG
9 CCATTAGCTTTTTATATTAG
10 >ref3
11 GGTCGTCACACGACAAGGACGCACACGCTGAGGGAACGAACTTCCTTAAGGTGGGACTTA
12 TTCTACACTCACTATCTGTAAACGGCAGCTGGAAATTTTGTGCCGCCGATTCAGCTCTCG
13 ATTGCACAGGGCCAAGCGATGGAGGGCTTAGATAAATCGACCATGTCAGTATCTGAGTTG
14 GCGGTTCCGTAACCCGACAGTCCCAGTCAAAACTGTAAGTGGCCCCATTCTAGAGAAGTT
15 TCATGACTAGTCACAGCAAACTGTGTTTCCAAACGAATAGCTTCATCGTAGCAACTCGCT
16 TCAAGGAGTAATGTTAACTTACGTACATTCAAGCGTACGAGGGTGAGGTTGGGTAGGAAA
17 GTGGGGTACTCGGGTAGATCGCGCGCCCCCGGGTTTACCGCAAGTGGGAAAATTGTAAAG
18 AAAATGATCGCATCGCTATATTGGACAACGGTGAGACAGGATACCTGTGGACGAGCAGGC
19 AAGTAGGTGAATCGAAGACG
0 >ref2.l30.c4.ctg.1
1 ATTTTCAGACGCCCACATCAGGAGCTATTACATGCTGCCAGGGACCTTTACCCTTACCTA
2 GAAAAGCCCCCATGAAATATGGATTGCACCTGATCAAAGATCATCGCCTTCAGGGAACTG
3 TCTCATTCTGGCGTCGATACGGTATGACTATAAGCGGACAGTCTTGCAGAGCGGAAGTAA
4 TGCGAGGACTAGCCGCATTCTAGGGTCAGAGCACCTCGCAAAGCAATCGAGAGTGCATGA
5 GTATTGTTCCGCAAGTGTCTTGCAAATACCTTGCTGCTATAAGAGGCGAGGGTACGGAGG
6 CGCGAGAGCTTCGTGAGGCGGGGGCATCATCACGTTATCACTACTACTGCTAGTGTCCGG
7 TGTCTAGCCACCGACAAGCACGCATTCTGTACTAGTACGGTTTGAGTGTATAACGCAAAA
8 CTTAGCGTAGCACTGGCATGTTCTCCAAGTTTTAGATCCGTCATAGACCCAACCCAAGCG
9 CCATTAGCTTTTTATATTAG
8181 self.assertTrue(os.path.exists(tmp_out))
8282 os.unlink(tmp_out)
8383
84
85 def test_run_contained_ref_seq(self):
86 '''Test full run where ref seq completely contains another seq outside cluster'''
87 all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.all_refs.fa')
88 cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.cluster_refs.fa')
89 contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.contigs.fa')
90 tmp_out = 'tmp.ref_seq_chooser_full_run_contained_ref_seq.fa'
91 refchooser = ref_seq_chooser.RefSeqChooser(cluster_fasta, all_ref_fasta, contig_fasta, tmp_out, sys.stdout)
92 refchooser.run()
93 self.assertEqual('ref2', refchooser.closest_ref_from_all_refs)
94 self.assertTrue(refchooser.closest_ref_is_in_cluster)
95 self.assertTrue(os.path.exists(tmp_out))
96 os.unlink(tmp_out)
97
7373 cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
7474 cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. Incompatible with --no_cdhit', metavar='FILENAME')
7575 cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
76 cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')
76 cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.0, metavar='FLOAT')
7777
7878 other_prep_group = subparser_prepareref.add_argument_group('other options')
7979 other_prep_group.add_argument('--min_gene_length', type=int, help='Minimum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=6)
5454 setup(
5555 ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
5656 name='ariba',
57 version='2.4.0',
57 version='2.5.0',
5858 description='ARIBA: Antibiotic Resistance Identification By Assembly',
5959 packages = find_packages(),
6060 package_data={'ariba': ['test_run_data/*']},
6868 'dendropy >= 4.1.0',
6969 'pyfastaq >= 3.12.0',
7070 'pysam >= 0.9.1',
71 'pymummer>=0.8.1',
71 'pymummer>=0.10.1',
7272 ],
7373 license='GPLv3',
7474 classifiers=[