Merge tag 'upstream/3.14.0'
Upstream version 3.14.0
Sascha Steinbiss
7 years ago
57 | 57 | |
58 | 58 | | Command | Description | |
59 | 59 | |-----------------------|----------------------------------------------------------------------| |
60 | | acgtn_only | Replace every non acgtnACGTN with an N | | |
60 | 61 | | add_indels | Deletes or inserts bases at given position(s) | |
61 | 62 | | caf_to_fastq | Converts a CAF file to FASTQ format | |
62 | 63 | | capillary_to_pairs | Converts file of capillary reads to paired and unpaired files | |
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Command-line entry point for "fastaq acgtn_only".

    Reads the input and output file names from the command line and passes
    them to tasks.acgtn_only. The description argument is accepted but not
    used: the help text shown to the user is the fixed string below.
    '''
    arg_parser = argparse.ArgumentParser(
        usage='fastaq acgtn_only [options] <infile> <outfile>',
        description='Replaces any character that is not one of acgtACGTnN with an N',
    )
    for name, help_text in (
        ('infile', 'Name of input file'),
        ('outfile', 'Name of output file'),
    ):
        arg_parser.add_argument(name, help=help_text)

    args = arg_parser.parse_args()
    tasks.acgtn_only(args.infile, args.outfile)
11 |
5 | 5 | def run(description): |
6 | 6 | parser = argparse.ArgumentParser( |
7 | 7 | description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' + |
8 | 'from a mates file. Ouptut is interleaved if mates file given', | |
8 | 'from a mates file. Output is interleaved if mates file given', | |
9 | 9 | usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>') |
10 | 10 | parser.add_argument('--mate_file', help='Name of mates file') |
11 | 11 | parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT') |
251 | 251 | '''Replaces all occurrences of 'old' with 'new' ''' |
252 | 252 | self.seq = self.seq.replace(old, new) |
253 | 253 | |
254 | ||
def replace_non_acgt(self):
    '''Turn every character of self.seq that is not one of acgtACGTnN into an N.

    Existing n/N characters are kept as they are. self.seq is reassigned
    to the cleaned string.'''
    non_acgtn = re.compile(r'''[^acgtACGTnN]''')
    self.seq = non_acgtn.sub('N', self.seq)
258 | ||
259 | ||
254 | 260 | def replace_interval(self, start, end, new): |
255 | 261 | '''Replaces the sequence from start to end with the sequence "new"''' |
256 | 262 | if start > end or start > len(self) - 1 or end > len(self) - 1: |
4 | 4 | from pyfastaq import sequences, utils, caf |
5 | 5 | |
6 | 6 | class Error (Exception): pass |
7 | ||
def acgtn_only(infile, outfile):
    '''Replace every non-acgtn (case insensitive) character with an N.

    Each sequence read from infile has replace_non_acgt() applied and is
    then printed to outfile.

    infile  -- name of the sequence file to read
    outfile -- name of the file to write (opened with utils.open_file_write)
    '''
    f = utils.open_file_write(outfile)
    try:
        for seq in sequences.file_reader(infile):
            seq.replace_non_acgt()
            print(seq, file=f)
    finally:
        # Close the output handle even when reading or converting raises,
        # so a bad input file does not leak it.
        utils.close(f)
15 | ||
7 | 16 | |
8 | 17 | def caf_to_fastq(infile, outfile, min_length=0, trim=False): |
9 | 18 | '''Convert a CAF file to fastq. Reads shorter than min_length are not output. If clipping information is in the CAF file (with a line Clipping QUAL ...) and trim=True, then trim the reads''' |
354 | 363 | |
355 | 364 | |
356 | 365 | def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None): |
357 | '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end | |
366 | '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end | |
358 | 367 | of every sequence name in infile_1, unless it already ends with suffix1. Similar for sufffix2.''' |
359 | 368 | seq_reader_1 = sequences.file_reader(infile_1) |
360 | 369 | seq_reader_2 = sequences.file_reader(infile_2) |
809 | 818 | stats['total_length'] = sum(lengths) |
810 | 819 | stats['mean'] = stats['total_length'] / len(lengths) |
811 | 820 | stats['number'] = len(lengths) |
812 | ||
821 | ||
813 | 822 | cumulative_length = 0 |
814 | 823 | for length in lengths: |
815 | 824 | cumulative_length += length |
818 | 827 | break |
819 | 828 | else: |
820 | 829 | stats = {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')} |
821 | ||
830 | ||
822 | 831 | return stats |
823 | 832 | |
824 | 833 | |
841 | 850 | original_line_length = sequences.Fasta.line_length |
842 | 851 | sequences.Fasta.line_length = line_length |
843 | 852 | if check_unique: |
844 | used_names = {} | |
853 | used_names = {} | |
845 | 854 | |
846 | 855 | for seq in seq_reader: |
847 | 856 | if strip_after_first_whitespace: |
868 | 877 | |
869 | 878 | if not all_unique: |
870 | 879 | raise Error('Not all sequence names unique. Cannot continue') |
871 | ||
880 | ||
872 | 881 | |
873 | 882 | |
874 | 883 | def to_fasta_union(infile, outfile, seqname='union'): |
395 | 395 | fa.replace_bases('U', 'T') |
396 | 396 | self.assertEqual(fa, sequences.Fasta('X', 'ATCGTTTACT')) |
397 | 397 | |
398 | ||
def test_replace_non_acgt(self):
    '''replace_non_acgt should keep acgtnACGTN and turn anything else into N'''
    expected_by_input = (
        ('acgtACGTnN', 'acgtACGTnN'),
        ('abc.g-T?aRC1T', 'aNcNgNTNaNCNT'),
    )

    for original, wanted in expected_by_input:
        record = sequences.Fasta('id', original)
        record.replace_non_acgt()
        self.assertEqual(wanted, record.seq)
410 | ||
411 | ||
398 | 412 | def test_replace_interval(self): |
399 | 413 | '''Test replace_interval()''' |
400 | 414 | fa = sequences.Fasta('ID', 'ACGTA') |
9 | 9 | data_dir = os.path.join(modules_dir, 'tests', 'data') |
10 | 10 | |
11 | 11 | class Error (Exception): pass |
12 | ||
13 | ||
class TestACGTN_only(unittest.TestCase):
    def test_acgtn_only(self):
        '''acgtn_only output should have the same contents as the expected file'''
        source_fa = os.path.join(data_dir, 'test_acgtn_only.in.fa')
        wanted_fa = os.path.join(data_dir, 'test_acgtn_only.expected.fa')
        produced_fa = 'tmp.test_acgtn_only.fa'

        tasks.acgtn_only(source_fa, produced_fa)

        # shallow=False forces a comparison of file contents, not just os.stat data
        files_match = filecmp.cmp(wanted_fa, produced_fa, shallow=False)
        self.assertTrue(files_match)
        # Only reached when the comparison passed; a failing run leaves the file behind
        os.unlink(produced_fa)
23 | ||
12 | 24 | |
13 | 25 | class TestCafToFastq(unittest.TestCase): |
14 | 26 | def test_caf_to_fastq_default(self): |
3 | 3 | import sys |
4 | 4 | |
5 | 5 | tasks = { |
6 | 'acgtn_only': 'Replace every non acgtnACGTN with an N', | |
6 | 7 | 'add_indels': 'Deletes or inserts bases at given position(s)', |
7 | 8 | 'caf_to_fastq': 'Converts a CAF file to FASTQ format', |
8 | 9 | 'capillary_to_pairs': 'Converts file of capillary reads to paired and unpaired files', |
19 | 20 | 'make_random_contigs': 'Make contigs of random sequence', |
20 | 21 | 'merge': 'Converts multi sequence file to a single sequence', |
21 | 22 | 'replace_bases': 'Replaces all occurrences of one letter with another', |
22 | 'reverse_complement': 'Reverse complement all sequences', | |
23 | 'reverse_complement': 'Reverse complement all sequences', | |
23 | 24 | 'scaffolds_to_contigs': 'Creates a file of contigs from a file of scaffolds', |
24 | 25 | 'search_for_seq': 'Find all exact matches to a string (and its reverse complement)', |
25 | 26 | 'sequence_trim': 'Trim exact matches to a given string off the start of every sequence', |