Commit 0c1eaaa8e446b066e05d707a211cba03ab8474c9 - ariba

+1

-1

README.md less more

4	4	For how to use ARIBA, please see the [ARIBA wiki page][ARIBA wiki].
5	5
6	6	[![Build Status](https://travis-ci.org/sanger-pathogens/ariba.svg?branch=master)](https://travis-ci.org/sanger-pathogens/ariba)
7		[![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-brightgreen.svg)](https://github.com/ssjunnebo/ariba/blob/master/LICENSE)
	7	[![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-brightgreen.svg)](https://github.com/sanger-pathogens/ariba/blob/master/LICENSE)
8	8	[![status](https://img.shields.io/badge/MGEN-10.1099%2Fmgen.0.000131-brightgreen.svg)](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131)
9	9
10	10	## Contents

+1

-1

ariba/assembly.py less more

58	58	self.threads = threads
59	59
60	60	if extern_progs is None:
61		self.extern_progs = external_progs.ExternalProgs()
	61	self.extern_progs = external_progs.ExternalProgs(using_spades=self.assembler == 'spades')
62	62	else:
63	63	self.extern_progs = extern_progs
64	64

+1

-1

ariba/cdhit.py less more

26	26	self.length_diff_cutoff = length_diff_cutoff
27	27	self.verbose = verbose
28	28	self.min_cluster_number = min_cluster_number
29		extern_progs = external_progs.ExternalProgs(fail_on_error=True)
	29	extern_progs = external_progs.ExternalProgs(fail_on_error=True, using_spades=False)
30	30	self.cd_hit_est = extern_progs.exe('cdhit')
31	31
32	32

+1

-1

ariba/cluster.py less more

129	129	self.log_fh = None
130	130
131	131	if extern_progs is None:
132		self.extern_progs = external_progs.ExternalProgs()
	132	self.extern_progs = external_progs.ExternalProgs(using_spades=self.assembler == 'spades')
133	133	else:
134	134	self.extern_progs = extern_progs
135	135

+15

-2

ariba/external_progs.py less more

19	19	prog_to_env_var = {x: 'ARIBA_' + x.upper() for x in prog_to_default if x not in {'nucmer'}}
20	20
21	21
	22	# Nucmer 3.1 'nucmer --version' outputs this:
	23	# nucmer
	24	# NUCmer (NUCleotide MUMmer) version 3.1
	25	#
	26	# Nucmer 4 'nucmer --version' outputs this:
	27	# 4.0.0beta2
	28	#
	29	# ... make the regex permissive and hope things
	30	# still work for later versions
# Maps each program name to a pair: (command-line option used to ask for the
# version, compiled regex whose group 1 captures the version string).
# The patterns are raw strings so that regex escapes such as \. \( and \s are
# not interpreted as (invalid) Python string escapes.
prog_to_version_cmd = {
    # NOTE(review): bowtie2 reports something like
    # "/path/to/bowtie2-align-s version 2.3.4.1", so arbitrary text must be
    # allowed around the program name - confirm against the bowtie2 in use.
    'bowtie2': ('--version', re.compile(r'.*bowtie2.*version (.*)$')),
    'cdhit': ('', re.compile(r'CD-HIT version ([0-9\.]+) \(')),
    # Deliberately permissive: has to cope with both the nucmer 3 output
    # ("NUCmer (NUCleotide MUMmer) version 3.1") and the bare nucmer 4 output
    # ("4.0.0beta2").
    'nucmer': ('--version', re.compile(r'([0-9]+\.[0-9\.]+.*$)')),
    'spades': ('--version', re.compile(r'SPAdes\s+v([0-9\.]+)')),
}
28	37

39	48	])
40	49
41	50	class ExternalProgs:
42		def __init__(self, verbose=False, fail_on_error=True):
	51	def __init__(self, verbose=False, fail_on_error=True, using_spades=False):
43	52	self.progs = {}
44	53	self.version_report = []
45	54	self.all_deps_ok = True
46	55	self.versions = {}
	56	self.using_spades = using_spades
47	57
48	58	if verbose:
49	59	print('{:_^79}'.format(' Checking dependencies and their versions '))

52	62	warnings = []
53	63
54	64	for prog in sorted(prog_to_default):
	65	if prog == 'spades' and not self.using_spades:
	66	continue
	67
55	68	msg_sink = errors
56	69	if prog in prog_optional:
57	70	msg_sink = warnings

+1

-1

ariba/read_filter.py less more

19	19	self.log_fh = log_fh
20	20
21	21	if extern_progs is None:
22		self.extern_progs = external_progs.ExternalProgs()
	22	self.extern_progs = external_progs.ExternalProgs(using_spades=False)
23	23	else:
24	24	self.extern_progs = extern_progs
25	25

+97

-35

ariba/ref_genes_getter.py less more

6	6	import pyfastaq
7	7	import time
8	8	import json
	9	import subprocess
	10	import sys
9	11	from ariba import common, card_record, vfdb_parser, megares_data_finder, megares_zip_parser
10	12
11	13

185	187	print('and in your methods say that version', self.version, 'of the database was used')
186	188
187	189
	190	@classmethod
	191	def _get_genetic_epi_database_from_bitbucket(cls, db_name, outdir, git_commit=None):
	192	assert db_name in {'plasmidfinder', 'resfinder', 'virulence_finder'}
	193	cmd = 'git clone ' + 'https://bitbucket.org/genomicepidemiology/' + db_name + '_db.git ' + outdir
	194	common.syscall(cmd)
	195
	196	if git_commit is not None:
	197	common.syscall('cd ' + outdir + ' && git checkout ' + git_commit)
	198
	199	print('Using this git commit for ' + db_name + ' database:')
	200	subprocess.check_call('cd ' + outdir + ' && git log -n 1', shell=True)
	201
	202
188	203	def _get_from_resfinder(self, outprefix):
189	204	outprefix = os.path.abspath(outprefix)
190	205	final_fasta = outprefix + '.fa'

192	207	tmpdir = outprefix + '.tmp.download'
193	208	current_dir = os.getcwd()
194	209
195		try:
196		os.mkdir(tmpdir)
	210	if self.version =='old':
	211	try:
	212	os.mkdir(tmpdir)
	213	os.chdir(tmpdir)
	214	except:
	215	raise Error('Error mkdir/chdir ' + tmpdir)
	216
	217	zipfile = 'resfinder.zip'
	218	cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
	219	print('Downloading data with:', cmd, sep='\n')
	220	common.syscall(cmd)
	221	common.syscall('unzip ' + zipfile)
	222	else:
	223	RefGenesGetter._get_genetic_epi_database_from_bitbucket('resfinder', tmpdir, git_commit=self.version)
197	224	os.chdir(tmpdir)
198		except:
199		raise Error('Error mkdir/chdir ' + tmpdir)
200
201		zipfile = 'resfinder.zip'
202		cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
203		print('Downloading data with:', cmd, sep='\n')
204		common.syscall(cmd)
205		common.syscall('unzip ' + zipfile)
	225
206	226
207	227	print('Combining downloaded fasta files...')
208	228	fout_fa = pyfastaq.utils.open_file_write(final_fasta)

221	241	except:
222	242	description = '.'
223	243
224		# names are not unique across the files
	244	# names are not unique across the files
225	245	if seq.id in used_names:
226	246	used_names[seq.id] += 1
227	247	seq.id += '_' + str(used_names[seq.id])

309	329	tmpdir = outprefix + '.tmp.download'
310	330	current_dir = os.getcwd()
311	331
312		try:
313		os.mkdir(tmpdir)
	332	if self.version == 'old':
	333	try:
	334	os.mkdir(tmpdir)
	335	os.chdir(tmpdir)
	336	except:
	337	raise Error('Error mkdir/chdir ' + tmpdir)
	338
	339	zipfile = 'plasmidfinder.zip'
	340	cmd = 'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
	341	print('Downloading data with:', cmd, sep='\n')
	342	common.syscall(cmd)
	343	common.syscall('unzip ' + zipfile)
	344	else:
	345	RefGenesGetter._get_genetic_epi_database_from_bitbucket('plasmidfinder', tmpdir, git_commit=self.version)
314	346	os.chdir(tmpdir)
315		except:
316		raise Error('Error mkdir/chdir ' + tmpdir)
317
318		zipfile = 'plasmidfinder.zip'
319		cmd = 'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
320		print('Downloading data with:', cmd, sep='\n')
321		common.syscall(cmd)
322		common.syscall('unzip ' + zipfile)
323	347
324	348	print('Combining downloaded fasta files...')
325	349	fout_fa = pyfastaq.utils.open_file_write(final_fasta)

356	380
357	381
358	382	def _get_from_srst2_argannot(self, outprefix):
359		srst2_version = '0.2.0'
360		srst2_url = 'https://github.com/katholt/srst2/raw/v' + srst2_version + '/data/ARGannot.r1.fasta'
	383	if self.version is None:
	384	self.version = 'r2'
	385	if self.version not in {'r1', 'r2'}:
	386	raise Error('srst2_argannot version must be r1 or r2. Got this: ' + self.version)
	387
	388	version_string = '.r1' if self.version == 'r1' else '_r2'
	389	srst2_url = 'https://raw.githubusercontent.com/katholt/srst2/master/data/ARGannot' + version_string + '.fasta'
361	390	srst2_fa = outprefix + '.original.fa'
362	391	command = 'wget -O ' + srst2_fa + ' ' + srst2_url
363	392	common.syscall(command, verbose=True)

388	417	print('If you use this downloaded data, please cite:')
389	418	print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
390	419	print(argannot_ref)
391		print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
	420	# Used to also output the version of SRST2 here, but the r2 version of their
	421	# fasta file was made after SRST2 release 0.2.0. At the time of writing this,
	422	# 0.2.0 is the latest release, ie r2 isn't in an SRST2 release.
392	423
393	424
394	425	def _get_from_vfdb_core(self, outprefix):

426	457	print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
427	458
428	459
	460	@classmethod
	461	def _fix_virulencefinder_fasta_file(cls, infile, outfile):
	462	'''Some line breaks are missing in the FASTA files from
	463	viruslence finder. Which means there are lines like this:
	464	AAGATCCAATAACTGAAGATGTTGAACAAACAATTCATAATATTTATGGTCAATATGCTATTTTCGTTGA
	465	AGGTGTTGCGCATTTACCTGGACATCTCTCTCCATTATTAAAAAAATTACTACTTAAATCTTTATAA>coa:1:BA000018.3
	466	ATGAAAAAGCAAATAATTTCGCTAGGCGCATTAGCAGTTGCATCTAGCTTATTTACATGGGATAACAAAG
	467	and therefore the sequences are messed up when we parse them. Also
	468	one has a > at the end, then the seq name on the next line.
	469	This function fixes the file by adding line breaks'''
	470	with open(infile) as f_in, open(outfile, 'w') as f_out:
	471	for line in f_in:
	472	if line.startswith('>') or '>' not in line:
	473	print(line, end='', file=f_out)
	474	elif line.endswith('>\n'):
	475	print('WARNING: found line with ">" at the end! Fixing. Line:' + line.rstrip() + ' in file ' + infile, file=sys.stderr)
	476	print(line.rstrip('>\n'), file=f_out)
	477	print('>', end='', file=f_out)
	478	else:
	479	print('WARNING: found line with ">" not at the start! Fixing. Line:' + line.rstrip() + ' in file ' + infile, file=sys.stderr)
	480	line1, line2 = line.split('>')
	481	print(line1, file=f_out)
	482	print('>', line2, sep='', end='', file=f_out)
	483
	484
429	485	def _get_from_virulencefinder(self, outprefix):
430	486	outprefix = os.path.abspath(outprefix)
431	487	final_fasta = outprefix + '.fa'

433	489	tmpdir = outprefix + '.tmp.download'
434	490	current_dir = os.getcwd()
435	491
436		try:
437		os.mkdir(tmpdir)
	492	if self.version == 'old':
	493	try:
	494	os.mkdir(tmpdir)
	495	os.chdir(tmpdir)
	496	except:
	497	raise Error('Error mkdir/chdir ' + tmpdir)
	498
	499	zipfile = 'plasmidfinder.zip'
	500	cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
	501	print('Downloading data with:', cmd, sep='\n')
	502	common.syscall(cmd)
	503	common.syscall('unzip ' + zipfile)
	504	else:
	505	RefGenesGetter._get_genetic_epi_database_from_bitbucket('plasmidfinder', tmpdir, git_commit=self.version)
438	506	os.chdir(tmpdir)
439		except:
440		raise Error('Error mkdir/chdir ' + tmpdir)
441
442		zipfile = 'plasmidfinder.zip'
443		cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
444		print('Downloading data with:', cmd, sep='\n')
445		common.syscall(cmd)
446		common.syscall('unzip ' + zipfile)
447	507
448	508	print('Combining downloaded fasta files...')
449	509	fout_fa = pyfastaq.utils.open_file_write(final_fasta)

453	513	for filename in os.listdir(tmpdir):
454	514	if filename.endswith('.fsa'):
455	515	print(' ', filename)
456		file_reader = pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename))
	516	fix_file = os.path.join(tmpdir, filename + '.fix.fsa')
	517	RefGenesGetter._fix_virulencefinder_fasta_file(os.path.join(tmpdir, filename), fix_file)
	518	file_reader = pyfastaq.sequences.file_reader(fix_file)
457	519	for seq in file_reader:
458	520	original_id = seq.id
459	521	seq.id = seq.id.replace('_', '.', 1)

+1

-1

ariba/tasks/prepareref.py less more

5	5	if options.no_cdhit and options.cdhit_clusters is not None:
6	6	sys.exit('Cannot use both --no_cdhit and --cdhit_clusters. Neither or exactly one of those options must be used')
7	7
8		extern_progs, version_report_lines = versions.get_all_versions()
	8	extern_progs, version_report_lines = versions.get_all_versions(using_spades=False)
9	9	if options.verbose:
10	10	print(*version_report_lines, sep='\n')
11	11

+1

-1

ariba/tasks/run.py less more

34	34	print('Output directory already exists. ARIBA makes the output directory. Cannot continue.', file=sys.stderr)
35	35	sys.exit(1)
36	36
37		extern_progs, version_report_lines = ariba.versions.get_all_versions()
	37	extern_progs, version_report_lines = ariba.versions.get_all_versions(using_spades=options.assembler == 'spades')
38	38	if options.verbose:
39	39	print(*version_report_lines, sep='\n')
40	40

+1

-1

ariba/tests/assembly_test.py less more

8	8
9	9	modules_dir = os.path.dirname(os.path.abspath(assembly.__file__))
10	10	data_dir = os.path.join(modules_dir, 'tests', 'data')
11		extern_progs = external_progs.ExternalProgs()
	11	extern_progs = external_progs.ExternalProgs(using_spades=True)
12	12
13	13	class TestAssembly(unittest.TestCase):
14	14	def test_run_fermilite(self):

+11

-0

ariba/tests/data/ref_genes_getter.fix_virulencefinder_fasta_file.in.fa less more

	0	>seq1
	1	ACGT
	2	A
	3	>seq2
	4	AGT
	5	AC>seq3
	6	ACGT
	7	>seq4
	8	AACGT>
	9	seq5
	10	AAC

+12

-0

ariba/tests/data/ref_genes_getter.fix_virulencefinder_fasta_file.out.fa less more

	0	>seq1
	1	ACGT
	2	A
	3	>seq2
	4	AGT
	5	AC
	6	>seq3
	7	ACGT
	8	>seq4
	9	AACGT
	10	>seq5
	11	AAC

+18

-0

ariba/tests/ref_genes_getter_test.py less more

	0	import unittest
	1	import os
	2	import filecmp
	3	from ariba import ref_genes_getter
	4
	5	modules_dir = os.path.dirname(os.path.abspath(ref_genes_getter.__file__))
	6	data_dir = os.path.join(modules_dir, 'tests', 'data')
	7
	8
	9	class TestRefGenesGetter(unittest.TestCase):
	10	def test_fix_virulencefinder_fasta_file(self):
	11	'''test _fix_virulencefinder_fasta_file'''
	12	infile = os.path.join(data_dir, 'ref_genes_getter.fix_virulencefinder_fasta_file.in.fa')
	13	tmp_file = 'tmp.test.ref_genes_getter.fix_virulencefinder_fasta_file.out.fa'
	14	expected_file = os.path.join(data_dir, 'ref_genes_getter.fix_virulencefinder_fasta_file.out.fa')
	15	ref_genes_getter.RefGenesGetter._fix_virulencefinder_fasta_file(infile, tmp_file)
	16	self.assertTrue(filecmp.cmp(expected_file, tmp_file, shallow=False))
	17	os.unlink(tmp_file)

+2

-2

ariba/versions.py less more

16	16	}
17	17
18	18
19		def get_all_versions(raise_error=True):
20		extern_progs = external_progs.ExternalProgs(fail_on_error=False)
	19	def get_all_versions(raise_error=True, using_spades=True):
	20	extern_progs = external_progs.ExternalProgs(fail_on_error=False, using_spades=using_spades)
21	21
22	22	report_lines = [
23	23	'ARIBA version: ' + ariba_version,

+1

-1

scripts/ariba less more

61	61	description='Download reference data from one of a few supported public resources',
62	62	)
subparser_getref.add_argument('--debug', action='store_true', help='Do not delete temporary downloaded files')
# The literal value to pass for the old website is "old" with no trailing space,
# so the help text must show it exactly like that.
subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Applies to: card, megares, plasmidfinder, resfinder, srst2_argannot, virulencefinder. For plasmid/res/virulencefinder: default is to get latest from bitbucket - supply git commit hash to get a specific version from bitbucket, or use "old" to get from old website. For srst2_argannot: default is latest version r2, use r1 to get the older version')
subparser_getref.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs, metavar="DB name")
subparser_getref.add_argument('outprefix', help='Prefix of output filenames')
subparser_getref.set_defaults(func=ariba.tasks.getref.run)

+1

-1

setup.py less more

54	54	setup(
55	55	ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
56	56	name='ariba',
57		version='2.11.1',
	57	version='2.12.0',
58	58	description='ARIBA: Antibiotic Resistance Identification By Assembly',
59	59	packages = find_packages(),
60	60	package_data={'ariba': ['test_run_data/*']},