Codebase list fastaq / 5022692
Merge tag 'upstream/3.14.0' Upstream version 3.14.0 Sascha Steinbiss 7 years ago
11 changed file(s) with 71 addition(s) and 8 deletion(s). Raw diff Collapse all Expand all
5757
5858 | Command | Description |
5959 |-----------------------|----------------------------------------------------------------------|
60 | acgtn_only | Replace every character that is not one of acgtnACGTN with an N |
6061 | add_indels | Deletes or inserts bases at given position(s) |
6162 | caf_to_fastq | Converts a CAF file to FASTQ format |
6263 | capillary_to_pairs | Converts file of capillary reads to paired and unpaired files |
0 import argparse
1 from pyfastaq import tasks
2
def run(description):
    '''Command-line entry point for "fastaq acgtn_only".

    Reads the positional arguments <infile> and <outfile> from the command
    line and hands them to tasks.acgtn_only. The description argument is
    accepted but not used here: the help text is set in this function.
    '''
    arg_parser = argparse.ArgumentParser(
        usage='fastaq acgtn_only [options] <infile> <outfile>',
        description='Replaces any character that is not one of acgtACGTnN with an N',
    )
    positionals = (
        ('infile', 'Name of input file'),
        ('outfile', 'Name of output file'),
    )
    for arg_name, arg_help in positionals:
        arg_parser.add_argument(arg_name, help=arg_help)

    args = arg_parser.parse_args()
    tasks.acgtn_only(args.infile, args.outfile)
11
55 def run(description):
66 parser = argparse.ArgumentParser(
77 description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
8 'from a mates file. Ouptut is interleaved if mates file given',
8 'from a mates file. Output is interleaved if mates file given',
99 usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
1010 parser.add_argument('--mate_file', help='Name of mates file')
1111 parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')
251251 '''Replaces all occurrences of 'old' with 'new' '''
252252 self.seq = self.seq.replace(old, new)
253253
254
def replace_non_acgt(self):
    '''Replaces, in place, every character of self.seq that is not one of
    a, c, g, t or n (upper or lower case) with an N'''
    not_acgtn = re.compile(r'''[^acgtACGTnN]''')
    self.seq = not_acgtn.sub('N', self.seq)
258
259
254260 def replace_interval(self, start, end, new):
255261 '''Replaces the sequence from start to end with the sequence "new"'''
256262 if start > end or start > len(self) - 1 or end > len(self) - 1:
44 from pyfastaq import sequences, utils, caf
55
class Error(Exception):
    '''Module-specific exception type'''
7
def acgtn_only(infile, outfile):
    '''Replace every non-acgtn (case insensitive) character with an N.

    Each sequence read from infile has its replace_non_acgt() method called
    and is then printed to outfile.
    '''
    f = utils.open_file_write(outfile)
    try:
        for seq in sequences.file_reader(infile):
            seq.replace_non_acgt()
            print(seq, file=f)
    finally:
        # Close the output even when reading or writing fails part-way
        # through, so the handle is never leaked.
        utils.close(f)
15
716
817 def caf_to_fastq(infile, outfile, min_length=0, trim=False):
918 '''Convert a CAF file to fastq. Reads shorter than min_length are not output. If clipping information is in the CAF file (with a line Clipping QUAL ...) and trim=True, then trim the reads'''
354363
355364
356365 def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None):
357 '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
366 '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
358367 of every sequence name in infile_1, unless it already ends with suffix1. Similar for sufffix2.'''
359368 seq_reader_1 = sequences.file_reader(infile_1)
360369 seq_reader_2 = sequences.file_reader(infile_2)
809818 stats['total_length'] = sum(lengths)
810819 stats['mean'] = stats['total_length'] / len(lengths)
811820 stats['number'] = len(lengths)
812
821
813822 cumulative_length = 0
814823 for length in lengths:
815824 cumulative_length += length
818827 break
819828 else:
820829 stats = {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}
821
830
822831 return stats
823832
824833
841850 original_line_length = sequences.Fasta.line_length
842851 sequences.Fasta.line_length = line_length
843852 if check_unique:
844 used_names = {}
853 used_names = {}
845854
846855 for seq in seq_reader:
847856 if strip_after_first_whitespace:
868877
869878 if not all_unique:
870879 raise Error('Not all sequence names unique. Cannot continue')
871
880
872881
873882
874883 def to_fasta_union(infile, outfile, seqname='union'):
0 >seq1
1 acgtACGTnN
2 >seq2
3 aNcNgNNT
0 >seq1
1 acgtACGTnN
2 >seq2
3 aXcRg.?T
395395 fa.replace_bases('U', 'T')
396396 self.assertEqual(fa, sequences.Fasta('X', 'ATCGTTTACT'))
397397
398
def test_replace_non_acgt(self):
    '''replace_non_acgt keeps acgtn (either case) and turns anything else into N'''
    cases = (
        ('acgtACGTnN', 'acgtACGTnN'),
        ('abc.g-T?aRC1T', 'aNcNgNTNaNCNT'),
    )

    for original, wanted in cases:
        record = sequences.Fasta('id', original)
        record.replace_non_acgt()
        self.assertEqual(wanted, record.seq)
410
411
398412 def test_replace_interval(self):
399413 '''Test replace_interval()'''
400414 fa = sequences.Fasta('ID', 'ACGTA')
99 data_dir = os.path.join(modules_dir, 'tests', 'data')
1010
class Error(Exception):
    '''Module-specific exception type'''
12
13
class TestACGTN_only(unittest.TestCase):
    '''Tests for tasks.acgtn_only'''

    def test_acgtn_only(self):
        '''Test acgtn_only'''
        tmpfile = 'tmp.test_acgtn_only.fa'
        infile = os.path.join(data_dir, 'test_acgtn_only.in.fa')
        expected = os.path.join(data_dir, 'test_acgtn_only.expected.fa')
        try:
            tasks.acgtn_only(infile, tmpfile)
            self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
        finally:
            # Remove the temporary output even when the task or the
            # comparison fails, so a failed run leaves nothing behind.
            # The existence check avoids masking the real failure when
            # the file was never created.
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
23
1224
1325 class TestCafToFastq(unittest.TestCase):
1426 def test_caf_to_fastq_default(self):
33 import sys
44
55 tasks = {
6 'acgtn_only': 'Replace every non acgtnACGTN with an N',
67 'add_indels': 'Deletes or inserts bases at given position(s)',
78 'caf_to_fastq': 'Converts a CAF file to FASTQ format',
89 'capillary_to_pairs': 'Converts file of capillary reads to paired and unpaired files',
1920 'make_random_contigs': 'Make contigs of random sequence',
2021 'merge': 'Converts multi sequence file to a single sequence',
2122 'replace_bases': 'Replaces all occurrences of one letter with another',
22 'reverse_complement': 'Reverse complement all sequences',
23 'reverse_complement': 'Reverse complement all sequences',
2324 'scaffolds_to_contigs': 'Creates a file of contigs from a file of scaffolds',
2425 'search_for_seq': 'Find all exact matches to a string (and its reverse complement)',
2526 'sequence_trim': 'Trim exact matches to a given string off the start of every sequence',
33
44 setup(
55 name='pyfastaq',
6 version='3.13.0',
6 version='3.14.0',
77 description='Script to manipulate FASTA and FASTQ files, plus API for developers',
88 packages = find_packages(),
99 author='Martin Hunt',