6 | 6 |
import pyfastaq
|
7 | 7 |
import time
|
8 | 8 |
import json
|
|
9 |
import subprocess
|
|
10 |
import sys
|
9 | 11 |
from ariba import common, card_record, vfdb_parser, megares_data_finder, megares_zip_parser
|
10 | 12 |
|
11 | 13 |
|
|
185 | 187 |
print('and in your methods say that version', self.version, 'of the database was used')
|
186 | 188 |
|
187 | 189 |
|
|
190 |
@classmethod
|
|
191 |
def _get_genetic_epi_database_from_bitbucket(cls, db_name, outdir, git_commit=None):
    '''Clones one of the genomicepidemiology database repositories from
    bitbucket, optionally checks out a given commit, and prints the
    latest commit of the resulting checkout.

    db_name: name of the database. Must be one of plasmidfinder,
        resfinder, virulence_finder. The repository that gets cloned is
        https://bitbucket.org/genomicepidemiology/<db_name>_db.git
    outdir: name of directory that git clones the repository into
    git_commit: if not None, this is given to "git checkout" after the
        clone. If None, the checkout is left as made by "git clone"

    Raises subprocess.CalledProcessError if the final "git log" fails.
    The clone and checkout are run with common.syscall, so failures of
    those are reported however common.syscall reports them.'''
    # Imported here so that this method does not rely on the
    # imports at the top of the file being changed
    import shlex

    # NOTE(review): every caller should pass the name of its own database.
    # _get_from_virulencefinder passes 'plasmidfinder' - looks like
    # a copy-paste error for 'virulence_finder'. To be confirmed.
    assert db_name in {'plasmidfinder', 'resfinder', 'virulence_finder'}

    # The commands are run through a shell (they use "cd ... && ..."), so
    # quote the user-supplied parts. Otherwise a directory name with spaces
    # or shell metacharacters breaks (or changes the meaning of) the command.
    # shlex.quote leaves ordinary names unchanged.
    quoted_outdir = shlex.quote(outdir)
    cmd = 'git clone ' + 'https://bitbucket.org/genomicepidemiology/' + db_name + '_db.git ' + quoted_outdir
    common.syscall(cmd)

    if git_commit is not None:
        common.syscall('cd ' + quoted_outdir + ' && git checkout ' + shlex.quote(git_commit))

    # Show the user exactly which commit is being used
    print('Using this git commit for ' + db_name + ' database:')
    subprocess.check_call('cd ' + quoted_outdir + ' && git log -n 1', shell=True)
|
|
201 |
|
|
202 |
|
188 | 203 |
def _get_from_resfinder(self, outprefix):
|
189 | 204 |
outprefix = os.path.abspath(outprefix)
|
190 | 205 |
final_fasta = outprefix + '.fa'
|
|
192 | 207 |
tmpdir = outprefix + '.tmp.download'
|
193 | 208 |
current_dir = os.getcwd()
|
194 | 209 |
|
195 | |
try:
|
196 | |
os.mkdir(tmpdir)
|
|
210 |
if self.version =='old':
|
|
211 |
try:
|
|
212 |
os.mkdir(tmpdir)
|
|
213 |
os.chdir(tmpdir)
|
|
214 |
except:
|
|
215 |
raise Error('Error mkdir/chdir ' + tmpdir)
|
|
216 |
|
|
217 |
zipfile = 'resfinder.zip'
|
|
218 |
cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
|
|
219 |
print('Downloading data with:', cmd, sep='\n')
|
|
220 |
common.syscall(cmd)
|
|
221 |
common.syscall('unzip ' + zipfile)
|
|
222 |
else:
|
|
223 |
RefGenesGetter._get_genetic_epi_database_from_bitbucket('resfinder', tmpdir, git_commit=self.version)
|
197 | 224 |
os.chdir(tmpdir)
|
198 | |
except:
|
199 | |
raise Error('Error mkdir/chdir ' + tmpdir)
|
200 | |
|
201 | |
zipfile = 'resfinder.zip'
|
202 | |
cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
|
203 | |
print('Downloading data with:', cmd, sep='\n')
|
204 | |
common.syscall(cmd)
|
205 | |
common.syscall('unzip ' + zipfile)
|
|
225 |
|
206 | 226 |
|
207 | 227 |
print('Combining downloaded fasta files...')
|
208 | 228 |
fout_fa = pyfastaq.utils.open_file_write(final_fasta)
|
|
221 | 241 |
except:
|
222 | 242 |
description = '.'
|
223 | 243 |
|
224 | |
# names are not unique across the files
|
|
244 |
# names are not unique across the files
|
225 | 245 |
if seq.id in used_names:
|
226 | 246 |
used_names[seq.id] += 1
|
227 | 247 |
seq.id += '_' + str(used_names[seq.id])
|
|
309 | 329 |
tmpdir = outprefix + '.tmp.download'
|
310 | 330 |
current_dir = os.getcwd()
|
311 | 331 |
|
312 | |
try:
|
313 | |
os.mkdir(tmpdir)
|
|
332 |
if self.version == 'old':
|
|
333 |
try:
|
|
334 |
os.mkdir(tmpdir)
|
|
335 |
os.chdir(tmpdir)
|
|
336 |
except:
|
|
337 |
raise Error('Error mkdir/chdir ' + tmpdir)
|
|
338 |
|
|
339 |
zipfile = 'plasmidfinder.zip'
|
|
340 |
cmd = 'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
|
|
341 |
print('Downloading data with:', cmd, sep='\n')
|
|
342 |
common.syscall(cmd)
|
|
343 |
common.syscall('unzip ' + zipfile)
|
|
344 |
else:
|
|
345 |
RefGenesGetter._get_genetic_epi_database_from_bitbucket('plasmidfinder', tmpdir, git_commit=self.version)
|
314 | 346 |
os.chdir(tmpdir)
|
315 | |
except:
|
316 | |
raise Error('Error mkdir/chdir ' + tmpdir)
|
317 | |
|
318 | |
zipfile = 'plasmidfinder.zip'
|
319 | |
cmd = 'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
|
320 | |
print('Downloading data with:', cmd, sep='\n')
|
321 | |
common.syscall(cmd)
|
322 | |
common.syscall('unzip ' + zipfile)
|
323 | 347 |
|
324 | 348 |
print('Combining downloaded fasta files...')
|
325 | 349 |
fout_fa = pyfastaq.utils.open_file_write(final_fasta)
|
|
356 | 380 |
|
357 | 381 |
|
358 | 382 |
def _get_from_srst2_argannot(self, outprefix):
|
359 | |
srst2_version = '0.2.0'
|
360 | |
srst2_url = 'https://github.com/katholt/srst2/raw/v' + srst2_version + '/data/ARGannot.r1.fasta'
|
|
383 |
if self.version is None:
|
|
384 |
self.version = 'r2'
|
|
385 |
if self.version not in {'r1', 'r2'}:
|
|
386 |
raise Error('srst2_argannot version must be r1 or r2. Got this: ' + self.version)
|
|
387 |
|
|
388 |
version_string = '.r1' if self.version == 'r1' else '_r2'
|
|
389 |
srst2_url = 'https://raw.githubusercontent.com/katholt/srst2/master/data/ARGannot' + version_string + '.fasta'
|
361 | 390 |
srst2_fa = outprefix + '.original.fa'
|
362 | 391 |
command = 'wget -O ' + srst2_fa + ' ' + srst2_url
|
363 | 392 |
common.syscall(command, verbose=True)
|
|
388 | 417 |
print('If you use this downloaded data, please cite:')
|
389 | 418 |
print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
|
390 | 419 |
print(argannot_ref)
|
391 | |
print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
|
|
420 |
# Used to also output the version of SRST2 here, but the r2 version of their
|
|
421 |
# fasta file was made after SRST2 release 0.2.0. At the time of writing this,
|
|
422 |
# 0.2.0 is the latest release, ie r2 isn't in an SRST2 release.
|
392 | 423 |
|
393 | 424 |
|
394 | 425 |
def _get_from_vfdb_core(self, outprefix):
|
|
426 | 457 |
print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
|
427 | 458 |
|
428 | 459 |
|
|
460 |
@classmethod
|
|
461 |
def _fix_virulencefinder_fasta_file(cls, infile, outfile):
|
|
462 |
'''Some line breaks are missing in the FASTA files from
|
|
463 |
viruslence finder. Which means there are lines like this:
|
|
464 |
AAGATCCAATAACTGAAGATGTTGAACAAACAATTCATAATATTTATGGTCAATATGCTATTTTCGTTGA
|
|
465 |
AGGTGTTGCGCATTTACCTGGACATCTCTCTCCATTATTAAAAAAATTACTACTTAAATCTTTATAA>coa:1:BA000018.3
|
|
466 |
ATGAAAAAGCAAATAATTTCGCTAGGCGCATTAGCAGTTGCATCTAGCTTATTTACATGGGATAACAAAG
|
|
467 |
and therefore the sequences are messed up when we parse them. Also
|
|
468 |
one has a > at the end, then the seq name on the next line.
|
|
469 |
This function fixes the file by adding line breaks'''
|
|
470 |
with open(infile) as f_in, open(outfile, 'w') as f_out:
|
|
471 |
for line in f_in:
|
|
472 |
if line.startswith('>') or '>' not in line:
|
|
473 |
print(line, end='', file=f_out)
|
|
474 |
elif line.endswith('>\n'):
|
|
475 |
print('WARNING: found line with ">" at the end! Fixing. Line:' + line.rstrip() + ' in file ' + infile, file=sys.stderr)
|
|
476 |
print(line.rstrip('>\n'), file=f_out)
|
|
477 |
print('>', end='', file=f_out)
|
|
478 |
else:
|
|
479 |
print('WARNING: found line with ">" not at the start! Fixing. Line:' + line.rstrip() + ' in file ' + infile, file=sys.stderr)
|
|
480 |
line1, line2 = line.split('>')
|
|
481 |
print(line1, file=f_out)
|
|
482 |
print('>', line2, sep='', end='', file=f_out)
|
|
483 |
|
|
484 |
|
429 | 485 |
def _get_from_virulencefinder(self, outprefix):
|
430 | 486 |
outprefix = os.path.abspath(outprefix)
|
431 | 487 |
final_fasta = outprefix + '.fa'
|
|
433 | 489 |
tmpdir = outprefix + '.tmp.download'
|
434 | 490 |
current_dir = os.getcwd()
|
435 | 491 |
|
436 | |
try:
|
437 | |
os.mkdir(tmpdir)
|
|
492 |
if self.version == 'old':
|
|
493 |
try:
|
|
494 |
os.mkdir(tmpdir)
|
|
495 |
os.chdir(tmpdir)
|
|
496 |
except:
|
|
497 |
raise Error('Error mkdir/chdir ' + tmpdir)
|
|
498 |
|
|
499 |
zipfile = 'plasmidfinder.zip'
|
|
500 |
cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
|
|
501 |
print('Downloading data with:', cmd, sep='\n')
|
|
502 |
common.syscall(cmd)
|
|
503 |
common.syscall('unzip ' + zipfile)
|
|
504 |
else:
|
|
505 |
RefGenesGetter._get_genetic_epi_database_from_bitbucket('plasmidfinder', tmpdir, git_commit=self.version)
|
438 | 506 |
os.chdir(tmpdir)
|
439 | |
except:
|
440 | |
raise Error('Error mkdir/chdir ' + tmpdir)
|
441 | |
|
442 | |
zipfile = 'plasmidfinder.zip'
|
443 | |
cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
|
444 | |
print('Downloading data with:', cmd, sep='\n')
|
445 | |
common.syscall(cmd)
|
446 | |
common.syscall('unzip ' + zipfile)
|
447 | 507 |
|
448 | 508 |
print('Combining downloaded fasta files...')
|
449 | 509 |
fout_fa = pyfastaq.utils.open_file_write(final_fasta)
|
|
453 | 513 |
for filename in os.listdir(tmpdir):
|
454 | 514 |
if filename.endswith('.fsa'):
|
455 | 515 |
print(' ', filename)
|
456 | |
file_reader = pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename))
|
|
516 |
fix_file = os.path.join(tmpdir, filename + '.fix.fsa')
|
|
517 |
RefGenesGetter._fix_virulencefinder_fasta_file(os.path.join(tmpdir, filename), fix_file)
|
|
518 |
file_reader = pyfastaq.sequences.file_reader(fix_file)
|
457 | 519 |
for seq in file_reader:
|
458 | 520 |
original_id = seq.id
|
459 | 521 |
seq.id = seq.id.replace('_', '.', 1)
|