Commit 502269258c7b9c93b6f4f948a84adf611720a4fb - fastaq

+1

-0

README.md less more

57	57
58	58	\| Command \| Description \|
59	59	\|-----------------------\|----------------------------------------------------------------------\|
	60	\| acgtn_only \| Replace every non acgtnACGTN with an N \|
60	61	\| add_indels \| Deletes or inserts bases at given position(s) \|
61	62	\| caf_to_fastq \| Converts a CAF file to FASTQ format \|
62	63	\| capillary_to_pairs \| Converts file of capillary reads to paired and unpaired files \|

+12

-0

pyfastaq/runners/acgtn_only.py less more

	0	import argparse
	1	from pyfastaq import tasks
	2
	3	def run(description):
	4	parser = argparse.ArgumentParser(
	5	description = 'Replaces any character that is not one of acgtACGTnN with an N',
	6	usage = 'fastaq acgtn_only [options] <infile> <outfile>')
	7	parser.add_argument('infile', help='Name of input file')
	8	parser.add_argument('outfile', help='Name of output file')
	9	options = parser.parse_args()
	10	tasks.acgtn_only(options.infile, options.outfile)
	11

+1

-1

pyfastaq/runners/to_random_subset.py less more

5	5	def run(description):
6	6	parser = argparse.ArgumentParser(
7	7	description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
8		'from a mates file. Ouptut is interleaved if mates file given',
	8	'from a mates file. Output is interleaved if mates file given',
9	9	usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
10	10	parser.add_argument('--mate_file', help='Name of mates file')
11	11	parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')

+6

-0

pyfastaq/sequences.py less more

251	251	'''Replaces all occurrences of 'old' with 'new' '''
252	252	self.seq = self.seq.replace(old, new)
253	253
	254
	255	def replace_non_acgt(self):
	256	'''Replace all non acgt characters with an N (case insensitive)'''
	257	self.seq = re.sub(r'''[^acgtACGTnN]''', 'N', self.seq)
	258
	259
254	260	def replace_interval(self, start, end, new):
255	261	'''Replaces the sequence from start to end with the sequence "new"'''
256	262	if start > end or start > len(self) - 1 or end > len(self) - 1:

+14

-5

pyfastaq/tasks.py less more

4	4	from pyfastaq import sequences, utils, caf
5	5
6	6	class Error (Exception): pass
	7
	8	def acgtn_only(infile, outfile):
	9	'''Replace every non-acgtn (case insensitve) character with an N'''
	10	f = utils.open_file_write(outfile)
	11	for seq in sequences.file_reader(infile):
	12	seq.replace_non_acgt()
	13	print(seq, file=f)
	14	utils.close(f)
	15
7	16
8	17	def caf_to_fastq(infile, outfile, min_length=0, trim=False):
9	18	'''Convert a CAF file to fastq. Reads shorter than min_length are not output. If clipping information is in the CAF file (with a line Clipping QUAL ...) and trim=True, then trim the reads'''

354	363
355	364
356	365	def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None):
357		'''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
	366	'''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
358	367	of every sequence name in infile_1, unless it already ends with suffix1. Similar for suffix2.'''
359	368	seq_reader_1 = sequences.file_reader(infile_1)
360	369	seq_reader_2 = sequences.file_reader(infile_2)

809	818	stats['total_length'] = sum(lengths)
810	819	stats['mean'] = stats['total_length'] / len(lengths)
811	820	stats['number'] = len(lengths)
812
	821
813	822	cumulative_length = 0
814	823	for length in lengths:
815	824	cumulative_length += length

818	827	break
819	828	else:
820	829	stats = {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}
821
	830
822	831	return stats
823	832
824	833

841	850	original_line_length = sequences.Fasta.line_length
842	851	sequences.Fasta.line_length = line_length
843	852	if check_unique:
844		used_names = {}
	853	used_names = {}
845	854
846	855	for seq in seq_reader:
847	856	if strip_after_first_whitespace:

868	877
869	878	if not all_unique:
870	879	raise Error('Not all sequence names unique. Cannot continue')
871
	880
872	881
873	882
874	883	def to_fasta_union(infile, outfile, seqname='union'):

+4

-0

pyfastaq/tests/data/test_acgtn_only.expected.fa less more

	0	>seq1
	1	acgtACGTnN
	2	>seq2
	3	aNcNgNNT

+4

-0

pyfastaq/tests/data/test_acgtn_only.in.fa less more

	0	>seq1
	1	acgtACGTnN
	2	>seq2
	3	aXcRg.?T

+14

-0

pyfastaq/tests/sequences_test.py less more

395	395	fa.replace_bases('U', 'T')
396	396	self.assertEqual(fa, sequences.Fasta('X', 'ATCGTTTACT'))
397	397
	398
	399	def test_replace_non_acgt(self):
	400	'''test replace_non_acgt'''
	401	tests = [
	402	('acgtACGTnN', 'acgtACGTnN'),
	403	('abc.g-T?aRC1T', 'aNcNgNTNaNCNT')
	404	]
	405
	406	for seq, expected in tests:
	407	fa = sequences.Fasta('id', seq)
	408	fa.replace_non_acgt()
	409	self.assertEqual(expected, fa.seq)
	410
	411
398	412	def test_replace_interval(self):
399	413	'''Test replace_interval()'''
400	414	fa = sequences.Fasta('ID', 'ACGTA')

+12

-0

pyfastaq/tests/tasks_test.py less more

9	9	data_dir = os.path.join(modules_dir, 'tests', 'data')
10	10
11	11	class Error (Exception): pass
	12
	13
	14	class TestACGTN_only(unittest.TestCase):
	15	def test_acgtn_only(self):
	16	'''Test acgtn_only'''
	17	tmpfile = 'tmp.test_acgtn_only.fa'
	18	infile = os.path.join(data_dir, 'test_acgtn_only.in.fa')
	19	expected = os.path.join(data_dir, 'test_acgtn_only.expected.fa')
	20	tasks.acgtn_only(infile, tmpfile)
	21	self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
	22	os.unlink(tmpfile)
	23
12	24
13	25	class TestCafToFastq(unittest.TestCase):
14	26	def test_caf_to_fastq_default(self):

+2

-1

scripts/fastaq less more

3	3	import sys
4	4
5	5	tasks = {
	6	'acgtn_only': 'Replace every non acgtnACGTN with an N',
6	7	'add_indels': 'Deletes or inserts bases at given position(s)',
7	8	'caf_to_fastq': 'Converts a CAF file to FASTQ format',
8	9	'capillary_to_pairs': 'Converts file of capillary reads to paired and unpaired files',

19	20	'make_random_contigs': 'Make contigs of random sequence',
20	21	'merge': 'Converts multi sequence file to a single sequence',
21	22	'replace_bases': 'Replaces all occurrences of one letter with another',
22		'reverse_complement': 'Reverse complement all sequences',
	23	'reverse_complement': 'Reverse complement all sequences',
23	24	'scaffolds_to_contigs': 'Creates a file of contigs from a file of scaffolds',
24	25	'search_for_seq': 'Find all exact matches to a string (and its reverse complement)',
25	26	'sequence_trim': 'Trim exact matches to a given string off the start of every sequence',

+1

-1

setup.py less more

3	3
4	4	setup(
5	5	name='pyfastaq',
6		version='3.13.0',
	6	version='3.14.0',
7	7	description='Script to manipulate FASTA and FASTQ files, plus API for developers',
8	8	packages = find_packages(),
9	9	author='Martin Hunt',