Updated version 2.5.0+ds from 'upstream/2.5.0+ds'
with Debian dir b40ed438f8e615c28caf63373cf778554f382ead
Sascha Steinbiss
7 years ago
51 | 51 | min_length=self.nucmer_min_len, |
52 | 52 | breaklen=self.nucmer_breaklen, |
53 | 53 | maxmatch=True, |
54 | show_snps=True | |
54 | show_snps=True, | |
55 | show_snps_C=False, | |
55 | 56 | ).run() |
56 | 57 | |
57 | 58 |
12 | 12 | infile, |
13 | 13 | seq_identity_threshold=0.9, |
14 | 14 | threads=1, |
15 | length_diff_cutoff=0.9, | |
15 | length_diff_cutoff=0.0, | |
16 | 16 | verbose=False, |
17 | 17 | min_cluster_number=0 |
18 | 18 | ): |
114 | 114 | std::cerr << "[ariba_minimap] Error indexing" << std::endl; |
115 | 115 | return 1; |
116 | 116 | } |
117 | ||
118 | // This sets the -f option of minimap: | |
119 | // -f FLOAT filter out top FLOAT fraction of repetitive minimizers [0.001] | |
120 | // Needed so that reads map to sequences from large clusters. | |
121 | mm_idx_set_max_occ(mi, 0.000001); | |
117 | 122 | |
118 | 123 | // mapping |
119 | 124 | mm_mapopt_t opt; |
18 | 18 | max_gene_length=10000, |
19 | 19 | genetic_code=11, |
20 | 20 | cdhit_min_id=0.9, |
21 | cdhit_min_length=0.9, | |
21 | cdhit_min_length=0.0, | |
22 | 22 | run_cdhit=True, |
23 | 23 | clusters_file=None, |
24 | 24 | threads=1, |
92 | 92 | |
93 | 93 | |
94 | 94 | @classmethod |
95 | def _best_of_two_hits(cls, hit1, hit2): | |
95 | def _best_of_two_hits(cls, hit1, hit2, use_qry_length=False): | |
96 | if use_qry_length: | |
97 | qry_length_percent1 = hit1.hit_length_qry / hit1.qry_length | |
98 | qry_length_percent2 = hit2.hit_length_qry / hit2.qry_length | |
99 | if qry_length_percent1 != qry_length_percent2: | |
100 | return hit1 if qry_length_percent1 > qry_length_percent2 else hit2 | |
101 | ||
96 | 102 | ref_length_percent1 = hit1.hit_length_ref / hit1.ref_length |
97 | 103 | ref_length_percent2 = hit2.hit_length_ref / hit2.ref_length |
98 | 104 | if ref_length_percent1 != ref_length_percent2: |
109 | 115 | |
110 | 116 | |
111 | 117 | @classmethod |
112 | def _choose_best_nucmer_match(cls, matches): | |
118 | def _choose_best_nucmer_match(cls, matches, use_qry_length=False): | |
113 | 119 | best_match = None |
114 | 120 | for ref_name in matches: |
115 | 121 | for hit in matches[ref_name]: |
116 | 122 | if best_match is None: |
117 | 123 | best_match = hit |
118 | 124 | else: |
119 | best_match = RefSeqChooser._best_of_two_hits(best_match, hit) | |
125 | best_match = RefSeqChooser._best_of_two_hits(best_match, hit, use_qry_length=use_qry_length) | |
120 | 126 | |
121 | 127 | return best_match |
122 | 128 | |
123 | 129 | |
124 | 130 | @classmethod |
125 | def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen): | |
131 | def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen, use_qry_length): | |
126 | 132 | tmpdir = tempfile.mkdtemp(prefix='tmp.closest_nucmer_match.', dir=os.getcwd()) |
127 | 133 | coords_file = os.path.join(tmpdir, 'nucmer_vs_cluster_refs.coords') |
128 | 134 | pymummer.nucmer.Runner( |
140 | 146 | if len(nucmer_matches) == 0: |
141 | 147 | return None, {} |
142 | 148 | else: |
143 | best_hit = RefSeqChooser._choose_best_nucmer_match(nucmer_matches) | |
149 | best_hit = RefSeqChooser._choose_best_nucmer_match(nucmer_matches, use_qry_length=use_qry_length) | |
144 | 150 | return best_hit, nucmer_matches |
145 | 151 | |
146 | 152 | |
147 | 153 | def run(self): |
148 | 154 | print('Looking for closest match from sequences within cluster', file=self.log_fh) |
149 | best_hit_from_cluster, nucmer_matches = RefSeqChooser._closest_nucmer_match_between_fastas(self.cluster_fasta, self.assembly_fasta_in, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen) | |
155 | best_hit_from_cluster, nucmer_matches = RefSeqChooser._closest_nucmer_match_between_fastas(self.cluster_fasta, self.assembly_fasta_in, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, False) | |
150 | 156 | if best_hit_from_cluster is None: |
151 | 157 | return |
152 | 158 | |
159 | 165 | RefSeqChooser._make_matching_contig_pieces_fasta(self.assembly_fasta_in, pieces_coords, pieces_fasta_file) |
160 | 166 | |
161 | 167 | print('Checking for a better match to a ref sequence outside the cluster', file=self.log_fh) |
162 | best_hit_from_all_seqs, not_needed = RefSeqChooser._closest_nucmer_match_between_fastas(self.all_refs_fasta, pieces_fasta_file, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen) | |
168 | best_hit_from_all_seqs, not_needed = RefSeqChooser._closest_nucmer_match_between_fastas(self.all_refs_fasta, pieces_fasta_file, self.log_fh, self.nucmer_min_id, self.nucmer_min_len, self.nucmer_breaklen, True) | |
163 | 169 | shutil.rmtree(tmpdir) |
164 | 170 | self.closest_ref_from_all_refs = best_hit_from_all_seqs.ref_name |
165 | 171 | if self.closest_ref_from_all_refs is None: |
418 | 418 | pyfastaq.utils.close(f_out) |
419 | 419 | |
420 | 420 | |
421 | def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, nocluster=False, verbose=False, clusters_file=None): | |
421 | def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, nocluster=False, verbose=False, clusters_file=None): | |
422 | 422 | clusters = {} |
423 | 423 | ReferenceData._write_sequences_to_files(self.sequences, self.metadata, outprefix) |
424 | 424 | ref_types = ('noncoding', 'noncoding.varonly', 'gene', 'gene.varonly') |
0 | >ref1 | |
1 | TCTCATTCTGGCGTCGATACGGTATGACTATAAGCGGACAGTCTTGCAGAGCGGAAGTAA | |
2 | TGCGAGGACTAGCCGCATTCTAGGGTCAGAGCACCTCGCAAAGCAATCGAGAGTGCATGA | |
3 | GTATTGTTCCGCAAGTGTCTTGCAAATACCTTGCTGCTATAAGAGGCGAGGGTACGGAGG | |
4 | CGCGAGAGCTTCGTGAGGCGGGGGCATCATCACGTTATCACTACTACTGCTAGTGTCCGG | |
5 | >ref2 | |
6 | ATTTTCAGACGCCCACATCAGGAGCTATTACATGCTGCCAGGGACCTTTACCCTTACCTA | |
7 | GAAAAGCCCCCATGAAATATGGATTGCACCTGATCAAAGATCATCGCCTTCAGGGAACTG | |
8 | TCTCATTCTGGCGTCGATACGGTATGACTATAAGCGGACAGTCTTGCAGAGCGGAAGTAA | |
9 | TGCGAGGACTAGCCGCATTCTAGGGTCAGAGCACCTCGCAAAGCAATCGAGAGTGCATGA | |
10 | ATATTGTTCCGCAAGTGTCTTGCAAATACCTTGCTGCTATAAGAGGCGAGGGTACGGAGG | |
11 | CGCGAGAGCTTCGTGAGGCGGGGGCATCATCACGTTATCACTACTACTGCTAGTGTCCGG | |
12 | TGTCTAGCCACCGACAAGCACGCATTCTGTACTAGTACGGTTTGAGTGTATAACGCAAAA | |
13 | CTTAGCGTAGCACTGGCATGTTCTCCAAGTTTTAGATCCGTCATAGACCCAACCCAAGCG | |
14 | CCATTAGCTTTTTATATTAG | |
15 | >ref3 | |
16 | GGTCGTCACACGACAAGGACGCACACGCTGAGGGAACGAACTTCCTTAAGGTGGGACTTA | |
17 | TTCTACACTCACTATCTGTAAACGGCAGCTGGAAATTTTGTGCCGCCGATTCAGCTCTCG | |
18 | ATTGCACAGGGCCAAGCGATGGAGGGCTTAGATAAATCGACCATGTCAGTATCTGAGTTG | |
19 | GCGGTTCCGTAACCCGACAGTCCCAGTCAAAACTGTAAGTGGCCCCATTCTAGAGAAGTT | |
20 | TCATGACTAGTCACAGCAAACTGTGTTTCCAAACGAATAGCTTCATCGTAGCAACTCGCT | |
21 | TCAAGGAGTAATGTTAACTTACGTACATTCAAGCGTACGAGGGTGAGGTTGGGTAGGAAA | |
22 | GTGGGGTACTCGGGTAGATCGCGCGCCCCCGGGTTTACCGCAAGTGGGAAAATTGTAAAG | |
23 | AAAATGATCGCATCGCTATATTGGACAACGGTGAGACAGGATACCTGTGGACGAGCAGGC | |
24 | AAGTAGGTGAATCGAAGACG | |
25 | >ref4 | |
26 | CAAGGAGTATCATGTAGGCCAGCGGCGGACCCTGACTTGTGATTACTTATGTTGTGCAGT | |
27 | AACTTCGAGAACATTGTATCTATCAACTTCGTTATTTCACCCTCATTAGGTTATTGTCAC | |
28 | ACCGAAGTCCCTGGGACTTCACCGGCATTCGTTAGAGCGGCGTGCGGTCCCGCTGCCCAC | |
29 | CCCCCCCCGCTTCAGGGGAGCATGCCTATACTCTCAGACATGCCCAATTCCAGTGGCTCC | |
30 | ATCGATGCTCGTGCTGATACTGTTGTGTGGCGCGATATTTCCCCGGCGTGAAAGAGCGGA | |
31 | GGCTCAACCGCACCAAATGTTTTAATCCGTCGGGGCGCAGTTGCAGCTCGAGATGGATCC | |
32 | ACTACTCAACTCAAAAGATGCCCCGCTACCAGGCACTATGGAGGTATTCGTGAACGCTTG | |
33 | CGTATTGGTAGATCAATACACACTTTACGCGGGTTAGTGTAGAAGACACGATCGCAGTAA | |
34 | GGCACCGAAGGCAGTGTTCC |
0 | >ref2 | |
1 | ATTTTCAGACGCCCACATCAGGAGCTATTACATGCTGCCAGGGACCTTTACCCTTACCTA | |
2 | GAAAAGCCCCCATGAAATATGGATTGCACCTGATCAAAGATCATCGCCTTCAGGGAACTG | |
3 | TCTCATTCTGGCGTCGATACGGTATGACTATAAGCGGACAGTCTTGCAGAGCGGAAGTAA | |
4 | TGCGAGGACTAGCCGCATTCTAGGGTCAGAGCACCTCGCAAAGCAATCGAGAGTGCATGA | |
5 | ATATTGTTCCGCAAGTGTCTTGCAAATACCTTGCTGCTATAAGAGGCGAGGGTACGGAGG | |
6 | CGCGAGAGCTTCGTGAGGCGGGGGCATCATCACGTTATCACTACTACTGCTAGTGTCCGG | |
7 | TGTCTAGCCACCGACAAGCACGCATTCTGTACTAGTACGGTTTGAGTGTATAACGCAAAA | |
8 | CTTAGCGTAGCACTGGCATGTTCTCCAAGTTTTAGATCCGTCATAGACCCAACCCAAGCG | |
9 | CCATTAGCTTTTTATATTAG | |
10 | >ref3 | |
11 | GGTCGTCACACGACAAGGACGCACACGCTGAGGGAACGAACTTCCTTAAGGTGGGACTTA | |
12 | TTCTACACTCACTATCTGTAAACGGCAGCTGGAAATTTTGTGCCGCCGATTCAGCTCTCG | |
13 | ATTGCACAGGGCCAAGCGATGGAGGGCTTAGATAAATCGACCATGTCAGTATCTGAGTTG | |
14 | GCGGTTCCGTAACCCGACAGTCCCAGTCAAAACTGTAAGTGGCCCCATTCTAGAGAAGTT | |
15 | TCATGACTAGTCACAGCAAACTGTGTTTCCAAACGAATAGCTTCATCGTAGCAACTCGCT | |
16 | TCAAGGAGTAATGTTAACTTACGTACATTCAAGCGTACGAGGGTGAGGTTGGGTAGGAAA | |
17 | GTGGGGTACTCGGGTAGATCGCGCGCCCCCGGGTTTACCGCAAGTGGGAAAATTGTAAAG | |
18 | AAAATGATCGCATCGCTATATTGGACAACGGTGAGACAGGATACCTGTGGACGAGCAGGC | |
19 | AAGTAGGTGAATCGAAGACG |
0 | >ref2.l30.c4.ctg.1 | |
1 | ATTTTCAGACGCCCACATCAGGAGCTATTACATGCTGCCAGGGACCTTTACCCTTACCTA | |
2 | GAAAAGCCCCCATGAAATATGGATTGCACCTGATCAAAGATCATCGCCTTCAGGGAACTG | |
3 | TCTCATTCTGGCGTCGATACGGTATGACTATAAGCGGACAGTCTTGCAGAGCGGAAGTAA | |
4 | TGCGAGGACTAGCCGCATTCTAGGGTCAGAGCACCTCGCAAAGCAATCGAGAGTGCATGA | |
5 | GTATTGTTCCGCAAGTGTCTTGCAAATACCTTGCTGCTATAAGAGGCGAGGGTACGGAGG | |
6 | CGCGAGAGCTTCGTGAGGCGGGGGCATCATCACGTTATCACTACTACTGCTAGTGTCCGG | |
7 | TGTCTAGCCACCGACAAGCACGCATTCTGTACTAGTACGGTTTGAGTGTATAACGCAAAA | |
8 | CTTAGCGTAGCACTGGCATGTTCTCCAAGTTTTAGATCCGTCATAGACCCAACCCAAGCG | |
9 | CCATTAGCTTTTTATATTAG |
81 | 81 | self.assertTrue(os.path.exists(tmp_out)) |
82 | 82 | os.unlink(tmp_out) |
83 | 83 | |
84 | ||
85 | def test_run_contained_ref_seq(self): | |
86 | '''Test full run where ref seq completely contains another seq outside cluster''' | |
87 | all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.all_refs.fa') | |
88 | cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.cluster_refs.fa') | |
89 | contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.contigs.fa') | |
90 | tmp_out = 'tmp.ref_seq_chooser_full_run_contained_ref_seq.fa' | |
91 | refchooser = ref_seq_chooser.RefSeqChooser(cluster_fasta, all_ref_fasta, contig_fasta, tmp_out, sys.stdout) | |
92 | refchooser.run() | |
93 | self.assertEqual('ref2', refchooser.closest_ref_from_all_refs) | |
94 | self.assertTrue(refchooser.closest_ref_is_in_cluster) | |
95 | self.assertTrue(os.path.exists(tmp_out)) | |
96 | os.unlink(tmp_out) | |
97 |
73 | 73 | cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.') |
74 | 74 | cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. Incompatible with --no_cdhit', metavar='FILENAME') |
75 | 75 | cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT') |
76 | cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT') | |
76 | cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.0, metavar='FLOAT') | |
77 | 77 | |
78 | 78 | other_prep_group = subparser_prepareref.add_argument_group('other options') |
79 | 79 | other_prep_group.add_argument('--min_gene_length', type=int, help='Minimum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=6) |
54 | 54 | setup( |
55 | 55 | ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod], |
56 | 56 | name='ariba', |
57 | version='2.4.0', | |
57 | version='2.5.0', | |
58 | 58 | description='ARIBA: Antibiotic Resistance Identification By Assembly', |
59 | 59 | packages = find_packages(), |
60 | 60 | package_data={'ariba': ['test_run_data/*']}, |
68 | 68 | 'dendropy >= 4.1.0', |
69 | 69 | 'pyfastaq >= 3.12.0', |
70 | 70 | 'pysam >= 0.9.1', |
71 | 'pymummer>=0.8.1', | |
71 | 'pymummer>=0.10.1', | |
72 | 72 | ], |
73 | 73 | license='GPLv3', |
74 | 74 | classifiers=[ |