Codebase list ariba / 70c82f5
Added ability to set CD-HIT memory limit on the command line - Issue #255 “kpepper” 5 years ago
6 changed file(s) with 65 addition(s) and 9 deletion(s). Raw diff Collapse all Expand all
1212 seq_identity_threshold=0.9,
1313 threads=1,
1414 length_diff_cutoff=0.0,
15 memory_limit=None,
1516 verbose=False,
1617 min_cluster_number=0
1718 ):
1920 if not os.path.exists(infile):
2021 raise Error('File not found: "' + infile + '". Cannot continue')
2122
23 if (memory_limit is not None) and (memory_limit < 0):
24 raise Error('Input parameter cdhit_max_memory is set to an invalid value. Cannot continue')
25
2226 self.infile = os.path.abspath(infile)
2327 self.seq_identity_threshold = seq_identity_threshold
2428 self.threads = threads
2529 self.length_diff_cutoff = length_diff_cutoff
30 self.memory_limit = memory_limit
2631 self.verbose = verbose
2732 self.min_cluster_number = min_cluster_number
2833 extern_progs = external_progs.ExternalProgs(fail_on_error=True, using_spades=False)
132137 return clusters
133138
134139
135 def run(self):
136 tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
137 cdhit_fasta = os.path.join(tmpdir, 'cdhit')
138 cluster_info_outfile = cdhit_fasta + '.bak.clstr'
139
def get_run_cmd(self, output_file):
    '''Build the cd-hit-est command line as a single string.

    Args:
        output_file: path handed to cd-hit-est with ``-o``.

    Returns:
        The command string: the executable followed by the -i, -o, -c, -T,
        -s, -d and -bak options, with ``-M <memory_limit>`` appended only
        when ``self.memory_limit`` is not None.
    '''
    # Local import: only this method needs it.
    import shlex

    args = [
        self.cd_hit_est,
        # Quote the paths so that spaces or shell metacharacters in them do
        # not split or alter the command. Plain paths are left unchanged.
        # NOTE(review): assumes the command string is run through a shell
        # (or shlex-split) by the caller - confirm against common.syscall.
        '-i', shlex.quote(str(self.infile)),
        '-o', shlex.quote(str(output_file)),
        '-c', str(self.seq_identity_threshold),
        '-T', str(self.threads),
        '-s', str(self.length_diff_cutoff),
        # '-d 0' is part of the expected command (the unit tests assert it).
        '-d', '0',
        # '-bak 1' makes cd-hit write the '<output>.bak.clstr' file.
        '-bak', '1',
    ]

    # Add in cdhit memory allocation if one has been specified. A value of 0
    # is passed through as '-M 0'; only None leaves the option out.
    if self.memory_limit is not None:
        args.extend(['-M', str(self.memory_limit)])

    return ' '.join(args)
157
158
def run(self):
    '''Run cd-hit-est on ``self.infile`` and return the resulting clusters.

    A temporary directory is created under the current working directory,
    cd-hit-est writes its output there, and the clusters are read from the
    '.bak.clstr' file it produces. The temporary directory is removed
    before returning, including when the command or the parsing fails.

    Returns:
        Whatever ``self._get_clusters_from_bak_file`` returns for the
        cluster file, filtered/numbered using ``self.min_cluster_number``.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
    try:
        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
        cluster_info_outfile = cdhit_fasta + '.bak.clstr'
        cmd = self.get_run_cmd(cdhit_fasta)
        common.syscall(cmd, verbose=self.verbose)
        return self._get_clusters_from_bak_file(cluster_info_outfile, self.min_cluster_number)
    finally:
        # Always clean up, so a failed cd-hit run does not leave
        # 'tmp.run_cd-hit.*' directories behind in the working directory.
        common.rmtree(tmpdir)
155168
169
1818 genetic_code=11,
1919 cdhit_min_id=0.9,
2020 cdhit_min_length=0.0,
21 cdhit_max_memory=None,
2122 run_cdhit=True,
2223 clusters_file=None,
2324 threads=1,
3940 self.genetic_code = genetic_code
4041 self.cdhit_min_id = cdhit_min_id
4142 self.cdhit_min_length = cdhit_min_length
43 self.cdhit_max_memory = cdhit_max_memory
4244 self.run_cdhit = run_cdhit
4345 self.clusters_file = clusters_file
4446 self.threads = threads
192194 seq_identity_threshold=self.cdhit_min_id,
193195 threads=self.threads,
194196 length_diff_cutoff=self.cdhit_min_length,
197 memory_limit=self.cdhit_max_memory,
195198 nocluster=not self.run_cdhit,
196199 verbose=self.verbose,
197200 clusters_file=self.clusters_file,
213216 print(' grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr)
214217
215218 if number_of_bad_variants_logged > 0:
216 print('WARNING. Problem with at least one variant. Problem variants are rmoved. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
219 print('WARNING. Problem with at least one variant. Problem variants are removed. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
433433 pyfastaq.utils.close(f_out)
434434
435435
436 def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, nocluster=False, verbose=False, clusters_file=None):
436 def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, memory_limit=None, nocluster=False, verbose=False, clusters_file=None):
437437 clusters = {}
438438 ReferenceData._write_sequences_to_files(self.sequences, self.metadata, outprefix)
439439 ref_types = ('noncoding', 'noncoding.varonly', 'gene', 'gene.varonly')
453453 seq_identity_threshold=seq_identity_threshold,
454454 threads=threads,
455455 length_diff_cutoff=length_diff_cutoff,
456 memory_limit=memory_limit,
456457 verbose=verbose,
457458 min_cluster_number = min_cluster_number,
458459 )
2020 genetic_code=options.genetic_code,
2121 cdhit_min_id=options.cdhit_min_id,
2222 cdhit_min_length=options.cdhit_min_length,
23 cdhit_max_memory=options.cdhit_max_memory,
2324 run_cdhit=not options.no_cdhit,
2425 clusters_file=options.cdhit_clusters,
2526 threads=options.threads,
00 import unittest
11 import os
2 import re
23 from ariba import cdhit, external_progs
4
35
46 modules_dir = os.path.dirname(os.path.abspath(cdhit.__file__))
57 data_dir = os.path.join(modules_dir, 'tests', 'data')
1012 '''test init_fail_infile_missing'''
1113 with self.assertRaises(cdhit.Error):
1214 cdhit.Runner('oopsnotafile', 'out')
15
16
def test_init_fail_invalid_memory(self):
    '''Constructing a Runner with a negative memory_limit raises cdhit.Error'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_run.in.fa')
    # Callable form of assertRaises: Runner(fasta_path, memory_limit=-10) must raise.
    self.assertRaises(cdhit.Error, cdhit.Runner, fasta_path, memory_limit=-10)
1322
1423
1524 def test_get_clusters_from_bak_file(self):
161170 '1': {'seq3'},
162171 }
163172 self.assertEqual(clusters, expected_clusters)
173
174
def test_get_run_cmd_with_default_memory(self):
    '''get_run_cmd with no memory limit set must not add a -M option'''
    fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
    r = cdhit.Runner(fa_infile)
    run_cmd = r.get_run_cmd('foo/bar/file.out')
    # Raw string with escaped dots so '0.9' cannot match e.g. '0x9'. The
    # trailing '$' ensures nothing (such as -M) follows '-bak 1'.
    # assertRegex reports the command and pattern on failure.
    self.assertRegex(
        run_cmd,
        r'^.+cd-hit-est -i .+ -o foo/bar/file\.out -c 0\.9 -T 1 -s 0\.0 -d 0 -bak 1$',
    )
182
183
def test_get_run_cmd_with_non_default_memory(self):
    '''get_run_cmd with memory_limit=900 must end the command with -M 900'''
    fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
    r = cdhit.Runner(fa_infile, memory_limit=900)
    run_cmd = r.get_run_cmd('foo/bar/file.out')
    # Check the -o argument explicitly, as the default-memory test does,
    # and escape the dots so they only match a literal '.'.
    self.assertRegex(
        run_cmd,
        r'^.+cd-hit-est -i .+ -o foo/bar/file\.out -c 0\.9 -T 1 -s 0\.0 -d 0 -bak 1 -M 900$',
    )
191
192
def test_get_run_cmd_with_unlimited_memory(self):
    '''get_run_cmd with memory_limit=0 must still pass -M 0 to cd-hit'''
    fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
    r = cdhit.Runner(fa_infile, memory_limit=0)
    run_cmd = r.get_run_cmd('foo/bar/file.out')
    # 0 is a real value here, not "unset": the option must be emitted.
    # Also check the -o argument and escape the dots in the pattern.
    self.assertRegex(
        run_cmd,
        r'^.+cd-hit-est -i .+ -o foo/bar/file\.out -c 0\.9 -T 1 -s 0\.0 -d 0 -bak 1 -M 0$',
    )
134134 cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
135135 cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. Incompatible with --no_cdhit', metavar='FILENAME')
136136 cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
137 cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.0, metavar='FLOAT')
137 cdhit_group.add_argument('--cdhit_min_length', type=float, help='Length difference cutoff (cd-hit option -s) [%(default)s]', default=0.0, metavar='FLOAT')
138 cdhit_group.add_argument('--cdhit_max_memory', type=int, help='Memory limit in MB (cd-hit option -M) [%(default)s]. Use 0 for unlimited.', metavar='INT')
138139
139140 other_prep_group = subparser_prepareref.add_argument_group('other options')
140141 other_prep_group.add_argument('--min_gene_length', type=int, help='Minimum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=6)