Codebase list ariba / 0c1eaaa
New upstream version 2.12.0+ds Sascha Steinbiss 5 years ago
16 changed file(s) with 165 addition(s) and 49 deletion(s). Raw diff Collapse all Expand all
44 For how to use ARIBA, please see the [ARIBA wiki page][ARIBA wiki].
55
66 [![Build Status](https://travis-ci.org/sanger-pathogens/ariba.svg?branch=master)](https://travis-ci.org/sanger-pathogens/ariba)
7 [![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-brightgreen.svg)](https://github.com/ssjunnebo/ariba/blob/master/LICENSE)
7 [![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-brightgreen.svg)](https://github.com/sanger-pathogens/ariba/blob/master/LICENSE)
88 [![status](https://img.shields.io/badge/MGEN-10.1099%2Fmgen.0.000131-brightgreen.svg)](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131)
99
1010 ## Contents
5858 self.threads = threads
5959
6060 if extern_progs is None:
61 self.extern_progs = external_progs.ExternalProgs()
61 self.extern_progs = external_progs.ExternalProgs(using_spades=self.assembler == 'spades')
6262 else:
6363 self.extern_progs = extern_progs
6464
2626 self.length_diff_cutoff = length_diff_cutoff
2727 self.verbose = verbose
2828 self.min_cluster_number = min_cluster_number
29 extern_progs = external_progs.ExternalProgs(fail_on_error=True)
29 extern_progs = external_progs.ExternalProgs(fail_on_error=True, using_spades=False)
3030 self.cd_hit_est = extern_progs.exe('cdhit')
3131
3232
129129 self.log_fh = None
130130
131131 if extern_progs is None:
132 self.extern_progs = external_progs.ExternalProgs()
132 self.extern_progs = external_progs.ExternalProgs(using_spades=self.assembler == 'spades')
133133 else:
134134 self.extern_progs = extern_progs
135135
1919 prog_to_env_var = {x: 'ARIBA_' + x.upper() for x in prog_to_default if x not in {'nucmer'}}
2020
2121
22 # Nucmer 3.1 'nucmer --version' outputs this:
23 # nucmer
24 # NUCmer (NUCleotide MUMmer) version 3.1
25 #
26 # Nucmer 4 'nucmer --version' outputs this:
27 # 4.0.0beta2
28 #
29 # ... make the regex permissive and hope things
30 # still work for later versions
2231 prog_to_version_cmd = {
2332 'bowtie2': ('--version', re.compile('.*bowtie2.*version (.*)$')),
2433 'cdhit': ('', re.compile('CD-HIT version ([0-9\.]+) \(')),
25 'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')),
34 'nucmer': ('--version', re.compile('([0-9]+\.[0-9\.]+.*$)')),
2635 'spades': ('--version', re.compile('SPAdes\s+v([0-9\.]+)'))
2736 }
2837
3948 ])
4049
4150 class ExternalProgs:
42 def __init__(self, verbose=False, fail_on_error=True):
51 def __init__(self, verbose=False, fail_on_error=True, using_spades=False):
4352 self.progs = {}
4453 self.version_report = []
4554 self.all_deps_ok = True
4655 self.versions = {}
56 self.using_spades = using_spades
4757
4858 if verbose:
4959 print('{:_^79}'.format(' Checking dependencies and their versions '))
5262 warnings = []
5363
5464 for prog in sorted(prog_to_default):
65 if prog == 'spades' and not self.using_spades:
66 continue
67
5568 msg_sink = errors
5669 if prog in prog_optional:
5770 msg_sink = warnings
1919 self.log_fh = log_fh
2020
2121 if extern_progs is None:
22 self.extern_progs = external_progs.ExternalProgs()
22 self.extern_progs = external_progs.ExternalProgs(using_spades=False)
2323 else:
2424 self.extern_progs = extern_progs
2525
66 import pyfastaq
77 import time
88 import json
9 import subprocess
10 import sys
911 from ariba import common, card_record, vfdb_parser, megares_data_finder, megares_zip_parser
1012
1113
185187 print('and in your methods say that version', self.version, 'of the database was used')
186188
187189
@classmethod
def _get_genetic_epi_database_from_bitbucket(cls, db_name, outdir, git_commit=None):
    '''Clone a genomicepidemiology database repository from bitbucket.

    Clones https://bitbucket.org/genomicepidemiology/<db_name>_db.git
    into outdir, optionally checks out git_commit, then prints the
    commit that is in use (output of "git log -n 1").

    db_name: one of plasmidfinder, resfinder, virulence_finder.
    outdir: directory that git will clone into.
    git_commit: commit to check out after cloning. None means stay
        on whatever the clone gives by default.

    Raises AssertionError if db_name is not one of the allowed names,
    and subprocess.CalledProcessError if "git log" fails.'''
    # Local import so this method does not rely on a file-level import
    import shlex

    assert db_name in {'plasmidfinder', 'resfinder', 'virulence_finder'}

    # outdir and git_commit come from the user, and the commands are
    # built as shell strings (the "cd ... &&" form below needs a shell),
    # so quote them to cope with spaces and shell metacharacters.
    # NOTE(review): assumes common.syscall runs its argument via a shell - confirm
    quoted_outdir = shlex.quote(outdir)
    cmd = 'git clone ' + 'https://bitbucket.org/genomicepidemiology/' + db_name + '_db.git ' + quoted_outdir
    common.syscall(cmd)

    if git_commit is not None:
        common.syscall('cd ' + quoted_outdir + ' && git checkout ' + shlex.quote(git_commit))

    # Flush so that this line appears before the output of git log,
    # even when stdout is not a terminal
    print('Using this git commit for ' + db_name + ' database:', flush=True)
    subprocess.check_call(['git', 'log', '-n', '1'], cwd=outdir)
201
202
188203 def _get_from_resfinder(self, outprefix):
189204 outprefix = os.path.abspath(outprefix)
190205 final_fasta = outprefix + '.fa'
192207 tmpdir = outprefix + '.tmp.download'
193208 current_dir = os.getcwd()
194209
195 try:
196 os.mkdir(tmpdir)
210 if self.version =='old':
211 try:
212 os.mkdir(tmpdir)
213 os.chdir(tmpdir)
214 except:
215 raise Error('Error mkdir/chdir ' + tmpdir)
216
217 zipfile = 'resfinder.zip'
218 cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
219 print('Downloading data with:', cmd, sep='\n')
220 common.syscall(cmd)
221 common.syscall('unzip ' + zipfile)
222 else:
223 RefGenesGetter._get_genetic_epi_database_from_bitbucket('resfinder', tmpdir, git_commit=self.version)
197224 os.chdir(tmpdir)
198 except:
199 raise Error('Error mkdir/chdir ' + tmpdir)
200
201 zipfile = 'resfinder.zip'
202 cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
203 print('Downloading data with:', cmd, sep='\n')
204 common.syscall(cmd)
205 common.syscall('unzip ' + zipfile)
225
206226
207227 print('Combining downloaded fasta files...')
208228 fout_fa = pyfastaq.utils.open_file_write(final_fasta)
221241 except:
222242 description = '.'
223243
224 # names are not unique across the files
244 # names are not unique across the files
225245 if seq.id in used_names:
226246 used_names[seq.id] += 1
227247 seq.id += '_' + str(used_names[seq.id])
309329 tmpdir = outprefix + '.tmp.download'
310330 current_dir = os.getcwd()
311331
312 try:
313 os.mkdir(tmpdir)
332 if self.version == 'old':
333 try:
334 os.mkdir(tmpdir)
335 os.chdir(tmpdir)
336 except:
337 raise Error('Error mkdir/chdir ' + tmpdir)
338
339 zipfile = 'plasmidfinder.zip'
340 cmd = 'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
341 print('Downloading data with:', cmd, sep='\n')
342 common.syscall(cmd)
343 common.syscall('unzip ' + zipfile)
344 else:
345 RefGenesGetter._get_genetic_epi_database_from_bitbucket('plasmidfinder', tmpdir, git_commit=self.version)
314346 os.chdir(tmpdir)
315 except:
316 raise Error('Error mkdir/chdir ' + tmpdir)
317
318 zipfile = 'plasmidfinder.zip'
319 cmd = 'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
320 print('Downloading data with:', cmd, sep='\n')
321 common.syscall(cmd)
322 common.syscall('unzip ' + zipfile)
323347
324348 print('Combining downloaded fasta files...')
325349 fout_fa = pyfastaq.utils.open_file_write(final_fasta)
356380
357381
358382 def _get_from_srst2_argannot(self, outprefix):
359 srst2_version = '0.2.0'
360 srst2_url = 'https://github.com/katholt/srst2/raw/v' + srst2_version + '/data/ARGannot.r1.fasta'
383 if self.version is None:
384 self.version = 'r2'
385 if self.version not in {'r1', 'r2'}:
386 raise Error('srst2_argannot version must be r1 or r2. Got this: ' + self.version)
387
388 version_string = '.r1' if self.version == 'r1' else '_r2'
389 srst2_url = 'https://raw.githubusercontent.com/katholt/srst2/master/data/ARGannot' + version_string + '.fasta'
361390 srst2_fa = outprefix + '.original.fa'
362391 command = 'wget -O ' + srst2_fa + ' ' + srst2_url
363392 common.syscall(command, verbose=True)
388417 print('If you use this downloaded data, please cite:')
389418 print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
390419 print(argannot_ref)
391 print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
420 # Used to also output the version of SRST2 here, but the r2 version of their
421 # fasta file was made after SRST2 release 0.2.0. At the time of writing this,
422 # 0.2.0 is the latest release, ie r2 isn't in an SRST2 release.
392423
393424
394425 def _get_from_vfdb_core(self, outprefix):
426457 print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
427458
428459
@classmethod
def _fix_virulencefinder_fasta_file(cls, infile, outfile):
    '''Writes a copy of infile to outfile, with missing line breaks added.

    Some line breaks are missing in the FASTA files from
    virulence finder. Which means there are lines like this:
    AAGATCCAATAACTGAAGATGTTGAACAAACAATTCATAATATTTATGGTCAATATGCTATTTTCGTTGA
    AGGTGTTGCGCATTTACCTGGACATCTCTCTCCATTATTAAAAAAATTACTACTTAAATCTTTATAA>coa:1:BA000018.3
    ATGAAAAAGCAAATAATTTCGCTAGGCGCATTAGCAGTTGCATCTAGCTTATTTACATGGGATAACAAAG
    and therefore the sequences are messed up when we parse them. Also
    one has a > at the end, then the seq name on the next line.
    This function fixes the file by adding line breaks.

    Lines that start with > or that have no > are copied unchanged.
    A warning is written to stderr for every line that gets fixed.'''
    with open(infile) as f_in, open(outfile, 'w') as f_out:
        for line in f_in:
            if line.startswith('>') or '>' not in line:
                print(line, end='', file=f_out)
            elif line.endswith('>\n'):
                print('WARNING: found line with ">" at the end! Fixing. Line:' + line.rstrip() + ' in file ' + infile, file=sys.stderr)
                # Finish the sequence line here, and start the header line.
                # The rest of the header is the next line of the input
                print(line.rstrip('>\n'), file=f_out)
                print('>', end='', file=f_out)
            else:
                print('WARNING: found line with ">" not at the start! Fixing. Line:' + line.rstrip() + ' in file ' + infile, file=sys.stderr)
                # Only split at the first ">". Everything after it is the
                # header line, which could itself contain a ">". Splitting
                # at every ">" would raise ValueError when unpacking
                line1, line2 = line.split('>', 1)
                print(line1, file=f_out)
                print('>', line2, sep='', end='', file=f_out)
483
484
429485 def _get_from_virulencefinder(self, outprefix):
430486 outprefix = os.path.abspath(outprefix)
431487 final_fasta = outprefix + '.fa'
433489 tmpdir = outprefix + '.tmp.download'
434490 current_dir = os.getcwd()
435491
436 try:
437 os.mkdir(tmpdir)
492 if self.version == 'old':
493 try:
494 os.mkdir(tmpdir)
495 os.chdir(tmpdir)
496 except:
497 raise Error('Error mkdir/chdir ' + tmpdir)
498
499 zipfile = 'plasmidfinder.zip'
500 cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
501 print('Downloading data with:', cmd, sep='\n')
502 common.syscall(cmd)
503 common.syscall('unzip ' + zipfile)
504 else:
505 RefGenesGetter._get_genetic_epi_database_from_bitbucket('plasmidfinder', tmpdir, git_commit=self.version)
438506 os.chdir(tmpdir)
439 except:
440 raise Error('Error mkdir/chdir ' + tmpdir)
441
442 zipfile = 'plasmidfinder.zip'
443 cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
444 print('Downloading data with:', cmd, sep='\n')
445 common.syscall(cmd)
446 common.syscall('unzip ' + zipfile)
447507
448508 print('Combining downloaded fasta files...')
449509 fout_fa = pyfastaq.utils.open_file_write(final_fasta)
453513 for filename in os.listdir(tmpdir):
454514 if filename.endswith('.fsa'):
455515 print(' ', filename)
456 file_reader = pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename))
516 fix_file = os.path.join(tmpdir, filename + '.fix.fsa')
517 RefGenesGetter._fix_virulencefinder_fasta_file(os.path.join(tmpdir, filename), fix_file)
518 file_reader = pyfastaq.sequences.file_reader(fix_file)
457519 for seq in file_reader:
458520 original_id = seq.id
459521 seq.id = seq.id.replace('_', '.', 1)
55 if options.no_cdhit and options.cdhit_clusters is not None:
66 sys.exit('Cannot use both --no_cdhit and --cdhit_clusters. Neither or exactly one of those options must be used')
77
8 extern_progs, version_report_lines = versions.get_all_versions()
8 extern_progs, version_report_lines = versions.get_all_versions(using_spades=False)
99 if options.verbose:
1010 print(*version_report_lines, sep='\n')
1111
3434 print('Output directory already exists. ARIBA makes the output directory. Cannot continue.', file=sys.stderr)
3535 sys.exit(1)
3636
37 extern_progs, version_report_lines = ariba.versions.get_all_versions()
37 extern_progs, version_report_lines = ariba.versions.get_all_versions(using_spades=options.assembler == 'spades')
3838 if options.verbose:
3939 print(*version_report_lines, sep='\n')
4040
88
99 modules_dir = os.path.dirname(os.path.abspath(assembly.__file__))
1010 data_dir = os.path.join(modules_dir, 'tests', 'data')
11 extern_progs = external_progs.ExternalProgs()
11 extern_progs = external_progs.ExternalProgs(using_spades=True)
1212
1313 class TestAssembly(unittest.TestCase):
1414 def test_run_fermilite(self):
0 >seq1
1 ACGT
2 A
3 >seq2
4 AGT
5 AC>seq3
6 ACGT
7 >seq4
8 AACGT>
9 seq5
10 AAC
0 >seq1
1 ACGT
2 A
3 >seq2
4 AGT
5 AC
6 >seq3
7 ACGT
8 >seq4
9 AACGT
10 >seq5
11 AAC
0 import unittest
1 import os
2 import filecmp
3 from ariba import ref_genes_getter
4
5 modules_dir = os.path.dirname(os.path.abspath(ref_genes_getter.__file__))
6 data_dir = os.path.join(modules_dir, 'tests', 'data')
7
8
class TestRefGenesGetter(unittest.TestCase):
    def test_fix_virulencefinder_fasta_file(self):
        '''test _fix_virulencefinder_fasta_file'''
        infile = os.path.join(data_dir, 'ref_genes_getter.fix_virulencefinder_fasta_file.in.fa')
        tmp_file = 'tmp.test.ref_genes_getter.fix_virulencefinder_fasta_file.out.fa'
        expected_file = os.path.join(data_dir, 'ref_genes_getter.fix_virulencefinder_fasta_file.out.fa')

        # Clean up in "finally", so that a failure does not leave the
        # temporary file behind in the working directory
        try:
            ref_genes_getter.RefGenesGetter._fix_virulencefinder_fasta_file(infile, tmp_file)
            self.assertTrue(filecmp.cmp(expected_file, tmp_file, shallow=False))
        finally:
            # The file is not there if the call failed before making it
            if os.path.exists(tmp_file):
                os.unlink(tmp_file)
1616 }
1717
1818
19 def get_all_versions(raise_error=True):
20 extern_progs = external_progs.ExternalProgs(fail_on_error=False)
19 def get_all_versions(raise_error=True, using_spades=True):
20 extern_progs = external_progs.ExternalProgs(fail_on_error=False, using_spades=using_spades)
2121
2222 report_lines = [
2323 'ARIBA version: ' + ariba_version,
6161 description='Download reference data from one of a few supported public resources',
6262 )
6363 subparser_getref.add_argument('--debug', action='store_true', help='Do not delete temporary downloaded files')
64 subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Only applies to card and megares')
64 subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Applies to: card, megares, plasmidfinder, resfinder, srst2_argannot, virulencefinder. For plasmid/res/virulencefinder: default is to get latest from bitbucket - supply git commit hash to get a specific version from bitbucket, or use "old " to get from old website. For srst2_argannot: default is latest version r2, use r1 to get the older version')
6565 subparser_getref.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs, metavar="DB name")
6666 subparser_getref.add_argument('outprefix', help='Prefix of output filenames')
6767 subparser_getref.set_defaults(func=ariba.tasks.getref.run)
5454 setup(
5555 ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
5656 name='ariba',
57 version='2.11.1',
57 version='2.12.0',
5858 description='ARIBA: Antibiotic Resistance Identification By Assembly',
5959 packages = find_packages(),
6060 package_data={'ariba': ['test_run_data/*']},