Added ability to set CD-HIT memory limit on the command line - Issue #255
“kpepper”
5 years ago
12 | 12 | seq_identity_threshold=0.9, |
13 | 13 | threads=1, |
14 | 14 | length_diff_cutoff=0.0, |
15 | memory_limit=None, | |
15 | 16 | verbose=False, |
16 | 17 | min_cluster_number=0 |
17 | 18 | ): |
19 | 20 | if not os.path.exists(infile): |
20 | 21 | raise Error('File not found: "' + infile + '". Cannot continue') |
21 | 22 | |
23 | if (memory_limit is not None) and (memory_limit < 0): | |
24 | raise Error('Input parameter cdhit_max_memory is set to an invalid value. Cannot continue') | |
25 | ||
22 | 26 | self.infile = os.path.abspath(infile) |
23 | 27 | self.seq_identity_threshold = seq_identity_threshold |
24 | 28 | self.threads = threads |
25 | 29 | self.length_diff_cutoff = length_diff_cutoff |
30 | self.memory_limit = memory_limit | |
26 | 31 | self.verbose = verbose |
27 | 32 | self.min_cluster_number = min_cluster_number |
28 | 33 | extern_progs = external_progs.ExternalProgs(fail_on_error=True, using_spades=False) |
132 | 137 | return clusters |
133 | 138 | |
134 | 139 | |
135 | def run(self): | |
136 | tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd()) | |
137 | cdhit_fasta = os.path.join(tmpdir, 'cdhit') | |
138 | cluster_info_outfile = cdhit_fasta + '.bak.clstr' | |
139 | ||
140 | def get_run_cmd(self, output_file): | |
140 | 141 | cmd = ' '.join([ |
141 | 142 | self.cd_hit_est, |
142 | 143 | '-i', self.infile, |
143 | '-o', cdhit_fasta, | |
144 | '-o', output_file, | |
144 | 145 | '-c', str(self.seq_identity_threshold), |
145 | 146 | '-T', str(self.threads), |
146 | 147 | '-s', str(self.length_diff_cutoff), |
148 | 149 | '-bak 1', |
149 | 150 | ]) |
150 | 151 | |
152 | # Add in cdhit memory allocation if one has been specified | |
153 | if self.memory_limit is not None: | |
154 | cmd = ' '.join([cmd, '-M', str(self.memory_limit)]) | |
155 | ||
156 | return cmd | |
157 | ||
158 | ||
159 | def run(self): | |
160 | tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd()) | |
161 | cdhit_fasta = os.path.join(tmpdir, 'cdhit') | |
162 | cluster_info_outfile = cdhit_fasta + '.bak.clstr' | |
163 | cmd = self.get_run_cmd(cdhit_fasta) | |
151 | 164 | common.syscall(cmd, verbose=self.verbose) |
152 | 165 | clusters = self._get_clusters_from_bak_file(cluster_info_outfile, self.min_cluster_number) |
153 | 166 | common.rmtree(tmpdir) |
154 | 167 | return clusters |
155 | 168 | |
169 |
18 | 18 | genetic_code=11, |
19 | 19 | cdhit_min_id=0.9, |
20 | 20 | cdhit_min_length=0.0, |
21 | cdhit_max_memory=None, | |
21 | 22 | run_cdhit=True, |
22 | 23 | clusters_file=None, |
23 | 24 | threads=1, |
39 | 40 | self.genetic_code = genetic_code |
40 | 41 | self.cdhit_min_id = cdhit_min_id |
41 | 42 | self.cdhit_min_length = cdhit_min_length |
43 | self.cdhit_max_memory = cdhit_max_memory | |
42 | 44 | self.run_cdhit = run_cdhit |
43 | 45 | self.clusters_file = clusters_file |
44 | 46 | self.threads = threads |
192 | 194 | seq_identity_threshold=self.cdhit_min_id, |
193 | 195 | threads=self.threads, |
194 | 196 | length_diff_cutoff=self.cdhit_min_length, |
197 | memory_limit=self.cdhit_max_memory, | |
195 | 198 | nocluster=not self.run_cdhit, |
196 | 199 | verbose=self.verbose, |
197 | 200 | clusters_file=self.clusters_file, |
213 | 216 | print(' grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr) |
214 | 217 | |
215 | 218 | if number_of_bad_variants_logged > 0: |
216 | print('WARNING. Problem with at least one variant. Problem variants are rmoved. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr) | |
219 | print('WARNING. Problem with at least one variant. Problem variants are removed. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr) |
433 | 433 | pyfastaq.utils.close(f_out) |
434 | 434 | |
435 | 435 | |
436 | def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, nocluster=False, verbose=False, clusters_file=None): | |
436 | def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, memory_limit=None, nocluster=False, verbose=False, clusters_file=None): | |
437 | 437 | clusters = {} |
438 | 438 | ReferenceData._write_sequences_to_files(self.sequences, self.metadata, outprefix) |
439 | 439 | ref_types = ('noncoding', 'noncoding.varonly', 'gene', 'gene.varonly') |
453 | 453 | seq_identity_threshold=seq_identity_threshold, |
454 | 454 | threads=threads, |
455 | 455 | length_diff_cutoff=length_diff_cutoff, |
456 | memory_limit=memory_limit, | |
456 | 457 | verbose=verbose, |
457 | 458 | min_cluster_number = min_cluster_number, |
458 | 459 | ) |
20 | 20 | genetic_code=options.genetic_code, |
21 | 21 | cdhit_min_id=options.cdhit_min_id, |
22 | 22 | cdhit_min_length=options.cdhit_min_length, |
23 | cdhit_max_memory=options.cdhit_max_memory, | |
23 | 24 | run_cdhit=not options.no_cdhit, |
24 | 25 | clusters_file=options.cdhit_clusters, |
25 | 26 | threads=options.threads, |
0 | 0 | import unittest |
1 | 1 | import os |
2 | import re | |
2 | 3 | from ariba import cdhit, external_progs |
4 | ||
3 | 5 | |
4 | 6 | modules_dir = os.path.dirname(os.path.abspath(cdhit.__file__)) |
5 | 7 | data_dir = os.path.join(modules_dir, 'tests', 'data') |
10 | 12 | '''test init_fail_infile_missing''' |
11 | 13 | with self.assertRaises(cdhit.Error): |
12 | 14 | cdhit.Runner('oopsnotafile', 'out') |
15 | ||
16 | ||
17 | def test_init_fail_invalid_memory(self): | |
18 | '''test_init_fail_invalid_memory''' | |
19 | infile = os.path.join(data_dir, 'cdhit_test_run.in.fa') | |
20 | with self.assertRaises(cdhit.Error): | |
21 | cdhit.Runner(infile, memory_limit=-10) | |
13 | 22 | |
14 | 23 | |
15 | 24 | def test_get_clusters_from_bak_file(self): |
161 | 170 | '1': {'seq3'}, |
162 | 171 | } |
163 | 172 | self.assertEqual(clusters, expected_clusters) |
173 | ||
174 | ||
175 | def test_get_run_cmd_with_default_memory(self): | |
176 | '''test_get_run_cmd_with_default_memory''' | |
177 | fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa') | |
178 | r = cdhit.Runner(fa_infile) | |
179 | run_cmd = r.get_run_cmd('foo/bar/file.out') | |
180 | match = re.search('^.+cd-hit-est -i .+ -o foo/bar/file.out -c 0.9 -T 1 -s 0.0 -d 0 -bak 1$', run_cmd) | |
181 | self.assertTrue(match) | |
182 | ||
183 | ||
184 | def test_get_run_cmd_with_non_default_memory(self): | |
185 | '''test_get_run_cmd_with_non_default_memory''' | |
186 | fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa') | |
187 | r = cdhit.Runner(fa_infile, memory_limit=900) | |
188 | run_cmd = r.get_run_cmd('foo/bar/file.out') | |
189 | match = re.search('^.+cd-hit-est -i .+ -c 0.9 -T 1 -s 0.0 -d 0 -bak 1 -M 900$', run_cmd) | |
190 | self.assertTrue(match) | |
191 | ||
192 | ||
193 | def test_get_run_cmd_with_unlimited_memory(self): | |
194 | '''test_get_run_cmd_with_unlimited_memory''' | |
195 | fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa') | |
196 | r = cdhit.Runner(fa_infile, memory_limit=0) | |
197 | run_cmd = r.get_run_cmd('foo/bar/file.out') | |
198 | match = re.search('^.+cd-hit-est -i .+ -c 0.9 -T 1 -s 0.0 -d 0 -bak 1 -M 0$', run_cmd) | |
199 | self.assertTrue(match) |
134 | 134 | cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.') |
135 | 135 | cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. Incompatible with --no_cdhit', metavar='FILENAME') |
136 | 136 | cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT') |
137 | cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.0, metavar='FLOAT') | |
137 | cdhit_group.add_argument('--cdhit_min_length', type=float, help='Length difference cutoff (cd-hit option -s) [%(default)s]', default=0.0, metavar='FLOAT') | |
138 | cdhit_group.add_argument('--cdhit_max_memory', type=int, help='Memory limit in MB (cd-hit option -M) [%(default)s]. Use 0 for unlimited.', metavar='INT') | |
138 | 139 | |
139 | 140 | other_prep_group = subparser_prepareref.add_argument_group('other options') |
140 | 141 | other_prep_group.add_argument('--min_gene_length', type=int, help='Minimum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=6) |