Imported Upstream version 3.2.0
Andreas Tille
9 years ago
0 | recursive-include fastaq/tests *.txt *.sh *.py *_test_* | |
0 | recursive-include pyfastaq/tests *.txt *.py *_test_* |
0 | 0 | Fastaq |
1 | 1 | ====== |
2 | 2 | |
3 | Python3 scripts to manipulate FASTA and FASTQ files, plus API for developers | |
3 | Python3 script to manipulate FASTA and FASTQ (and other format) files, plus API for developers | |
4 | 4 | |
5 | 5 | Installation |
6 | 6 | ------------ |
7 | 7 | |
8 | Run the tests: | |
8 | Dependencies: | |
9 | * numpy (install with `apt-get install python3-numpy`) | |
10 | ||
11 | Install with pip: | |
12 | ||
13 | pip3 install pyfastaq | |
14 | ||
15 | ||
16 | Alternatively, you can download the latest release from this github repository, | |
17 | or clone the repository. Then run the tests: | |
9 | 18 | |
10 | 19 | python3 setup.py test |
11 | 20 | |
12 | Install: | |
21 | If the tests all pass, install: | |
13 | 22 | |
14 | 23 | python3 setup.py install |
15 | 24 | |
16 | Notes: | |
17 | * A few scripts assume that samtools is installed and in your path. This is NOT tested in the tests, because most scripts don't need it. | |
18 | * The installation will put all scripts in your path and are named fastaq_*. | |
19 | 25 | |
20 | Scripts | |
21 | ------- | |
26 | Usage | |
27 | ----- | |
28 | ||
29 | The installation will put a single script called `fastaq` in your path. | |
30 | The usage is: | |
31 | ||
32 | fastaq <command> [options] | |
33 | ||
22 | 34 | |
23 | 35 | Key points: |
24 | * Use -h or --help with a script to get its usage. | |
25 | * All scripts automatically detect whether the input is a FASTA or FASTQ file. | |
36 | * To list the available commands and brief descriptions, just run `fastaq` | |
37 | * Use `fastaq command -h` or `fastaq command --help` to get a longer description and the usage of that command. | |
38 | * The type of input file is automatically detected. Currently supported: | |
39 | `FASTA`, `FASTQ`, `GFF3`, `EMBL`, `GBK`, `Phylip`. | |
40 | * `fastaq` only manipulates sequences (and | |
41 | quality scores if present), so annotation is ignored where present in the input. | |
26 | 42 | * Input and output files can be gzipped. An input file is assumed to be gzipped if its name ends with .gz. To gzip an output file, just name it with .gz at the end. |
27 | * You can use a minus sign for a filename to use stdin or stdout, so scripts can be piped together. See the following examples. | |
43 | * You can use a minus sign for a filename to use stdin or stdout, so commands can be piped together. See the example below. | |
44 | ||
45 | ||
46 | Examples | |
47 | -------- | |
28 | 48 | |
29 | 49 | Reverse complement all sequences in a file: |
30 | 50 | |
31 | fastaq_reverse_complement in.fastq out.fastq | |
51 | fastaq reverse_complement in.fastq out.fastq | |
32 | 52 | |
33 | Reverse complement all sequences in a gzipped file, then translate each sequence | |
53 | Reverse complement all sequences in a gzipped file, then translate each sequence: | |
34 | 54 | |
35 | fastaq_reverse_complement in.fastq.gz - | fastaq_translate - out.fasta | |
55 | fastaq reverse_complement in.fastq.gz - | fastaq translate - out.fasta | |
56 | ||
57 | ||
58 | Available commands | |
59 | ------------------ | |
60 | ||
61 | | Command | Description | | |
62 | |-----------------------|----------------------------------------------------------------------| | |
63 | | add_indels | Deletes or inserts bases at given position(s) | | |
64 | | caf_to_fastq | Converts a CAF file to FASTQ format | | |
65 | | capillary_to_pairs | Converts file of capillary reads to paired and unpaired files | | |
66 | | chunker | Splits sequences into equal sized chunks | | |
67 | | count_sequences | Counts the sequences in input file | | |
68 | | deinterleave | Splits interleaved paired file into two separate files | | |
69 | | enumerate_names | Renames sequences in a file, calling them 1,2,3... etc | | |
70 | | expand_nucleotides | Makes every combination of degenerate nucleotides | | |
71 | | fasta_to_fastq | Convert FASTA and .qual to FASTQ | | |
72 | | filter | Filter sequences to get a subset of them | | |
73 | | get_ids | Get the ID of each sequence | | |
74 | | get_seq_flanking_gaps | Gets the sequences flanking gaps | | |
75 | | interleave | Interleaves two files, output is alternating between fwd/rev reads | | |
76 | | long_read_simulate | Simulates long reads from reference | | |
77 | | make_random_contigs | Make contigs of random sequence | | |
78 | | merge | Converts multi sequence file to a single sequence | | |
79 | | replace_bases         | Replaces all occurrences of one letter with another                  | |
80 | | reverse_complement | Reverse complement all sequences | | |
81 | | scaffolds_to_contigs | Creates a file of contigs from a file of scaffolds | | |
82 | | search_for_seq | Find all exact matches to a string (and its reverse complement) | | |
83 | | sequence_trim | Trim exact matches to a given string off the start of every sequence | | |
84 | | sort_by_size | Sorts sequences in length order | | |
85 | | split_by_base_count | Split multi sequence file into separate files | | |
86 | | strip_illumina_suffix | Strips /1 or /2 off the end of every read name | | |
87 | | to_fake_qual | Make fake quality scores file | | |
88 | | to_fasta | Converts a variety of input formats to nicely formatted FASTA format | | |
89 | | to_mira_xml | Create an xml file from a file of reads, for use with Mira assembler | | |
90 | | to_orfs_gff | Writes a GFF file of open reading frames | | |
91 | | to_perfect_reads | Make perfect paired reads from reference | | |
92 | | to_random_subset | Make a random sample of sequences (and optionally mates as well) | | |
93 | | to_tiling_bam | Make a BAM file of reads uniformly spread across the input reference | | |
94 | | to_unique_by_id | Remove duplicate sequences, based on their names. Keep longest seqs | | |
95 | | translate | Translate all sequences in input nucleotide sequences | | |
96 | | trim_Ns_at_end | Trims all Ns at the start/end of all sequences | | |
97 | | trim_contigs | Trims a set number of bases off the end of every contig | | |
98 | | trim_ends | Trim fixed number of bases of start and/or end of every sequence | | |
99 | | version | Print version number and exit | | |
100 | ||
36 | 101 | |
37 | 102 | For developers |
38 | 103 | -------------- |
39 | 104 | |
40 | 105 | Here is a template for counting the sequences in a FASTA or FASTQ file: |
41 | 106 | |
42 | from fastaq import sequences | |
107 | from pyfastaq import sequences | |
43 | 108 | seq_reader = sequences.file_reader(infile) |
44 | 109 | count = 0 |
45 | 110 | for seq in seq_reader: |
46 | 111 | count += 1 |
47 | 112 | print(count) |
48 | 113 | |
49 | Hopefully you get the idea and there are plenty of examples in tasks.py. Detection of FASTA or FASTQ and gzipped or not input file 'infile' is automatic. See help(sequences) for the various methods already defined in the classes Fasta and Fastq. | |
114 | Hopefully you get the idea and there are plenty of examples in tasks.py. Detection of the input file type and whether gzipped or not is automatic. See help(sequences) for the various methods already defined in the classes Fasta and Fastq. |
0 | class Error (Exception): pass | |
1 | ||
2 | ||
3 | class Interval: | |
4 | '''A class to deal with intervals in a genome. Can do things like intersections, unions etc''' | |
5 | def __init__(self, start, end): | |
6 | try: | |
7 | self.start = int(start) | |
8 | self.end = int(end) | |
9 | except ValueError: | |
10 | raise Error('Error making interval from :"' + str(start) + '" and "' + str(end) + '"') | |
11 | ||
12 | if self.end < self.start: | |
13 | raise Error('Error making interval ' + str(self) + '. end < start.') | |
14 | ||
15 | def __len__(self): | |
16 | return self.end - self.start + 1 | |
17 | ||
18 | def __eq__(self, other): | |
19 | return type(other) is type(self) and self.__dict__ == other.__dict__ | |
20 | ||
21 | def __ne__(self, other): | |
22 | return not self.__eq__(other) | |
23 | ||
24 | def __str__(self): | |
25 | return '(' + str(self.start) + ',' + str(self.end) + ')' | |
26 | ||
27 | def __lt__(self, i): | |
28 | return self.start < i.start or (self.start == i.start and self.end < i.end) | |
29 | ||
30 | def __le__(self, i): | |
31 | return self.start < i.start or (self.start == i.start and self.end <= i.end) | |
32 | ||
33 | def intersects(self, i): | |
34 | '''Returns true iff this interval intersects the interval i''' | |
35 | return self.start <= i.end and i.start <= self.end | |
36 | ||
37 | def contains(self, i): | |
38 | '''Returns true iff this interval contains the interval i''' | |
39 | return self.start <= i.start and i.end <= self.end | |
40 | ||
41 | def union(self, i): | |
42 | '''If intervals intersect, returns their union, otherwise returns None''' | |
43 | if self.intersects(i) or self.end + 1 == i.start or i.end + 1 == self.start: | |
44 | return Interval(min(self.start, i.start), max(self.end, i.end)) | |
45 | else: | |
46 | return None | |
47 | ||
48 | def union_fill_gap(self, i): | |
49 | '''Like union, but ignores whether the two intervals intersect or not''' | |
50 | return Interval(min(self.start, i.start), max(self.end, i.end)) | |
51 | ||
52 | def intersection(self, i): | |
53 | '''If intervals intersect, returns their intersection, otherwise returns None''' | |
54 | if self.intersects(i): | |
55 | return Interval(max(self.start, i.start), min(self.end, i.end)) | |
56 | else: | |
57 | return None | |
58 | ||
59 | ||
60 | def intersection(l1, l2): | |
61 | '''Returns intersection of two lists. Assumes the lists are sorted by start positions''' | |
62 | if len(l1) == 0 or len(l2) == 0: | |
63 | return [] | |
64 | ||
65 | out = [] | |
66 | l2_pos = 0 | |
67 | ||
68 | for l in l1: | |
69 | while l2_pos < len(l2) and l2[l2_pos].end < l.start: | |
70 | l2_pos += 1 | |
71 | ||
72 | if l2_pos == len(l2): | |
73 | break | |
74 | ||
75 | while l2_pos < len(l2) and l.intersects(l2[l2_pos]): | |
76 | out.append(l.intersection(l2[l2_pos])) | |
77 | l2_pos += 1 | |
78 | ||
79 | l2_pos = max(0, l2_pos - 1) | |
80 | ||
81 | return out | |
82 | ||
83 | ||
84 | def merge_overlapping_in_list(l): | |
85 | '''Sorts list, merges any overlapping intervals, and also adjacent intervals. e.g. | |
86 | [0,1], [1,2] would be merge to [0,.2].''' | |
87 | i = 0 | |
88 | l.sort() | |
89 | ||
90 | while i < len(l) - 1: | |
91 | u = l[i].union(l[i+1]) | |
92 | if u is not None: | |
93 | l[i] = u | |
94 | l.pop(i+1) | |
95 | else: | |
96 | i += 1 | |
97 | ||
98 | ||
99 | def remove_contained_in_list(l): | |
100 | '''Sorts list in place, then removes any intervals that are completely | |
101 | contained inside another interval''' | |
102 | i = 0 | |
103 | l.sort() | |
104 | ||
105 | while i < len(l) - 1: | |
106 | if l[i+1].contains(l[i]): | |
107 | l.pop(i) | |
108 | elif l[i].contains(l[i+1]): | |
109 | l.pop(i+1) | |
110 | else: | |
111 | i += 1 | |
112 | ||
113 | ||
114 | def length_sum_from_list(l): | |
115 | '''Returns total length of intervals from a list''' | |
116 | return sum([len(x) for x in l]) |
0 | import re | |
1 | import string | |
2 | import random | |
3 | import itertools | |
4 | ||
5 | from fastaq import utils, intervals | |
6 | ||
7 | class Error (Exception): pass | |
8 | ||
9 | ||
10 | # python 3's seek is glacially slow. When we read a fasta file, we know | |
11 | # we've reached the end of a sequence when we get a new line starting with | |
12 | # '>'. Instead of using seek and tell, we just remember the previous line | |
13 | # of the file, for any given filehandle | |
14 | previous_lines = {} | |
15 | ||
16 | ||
17 | codon2aa = { | |
18 | 'GCA': 'A', | |
19 | 'GCC': 'A', | |
20 | 'GCG': 'A', | |
21 | 'GCT': 'A', | |
22 | 'AGA': 'R', | |
23 | 'AGG': 'R', | |
24 | 'CGA': 'R', | |
25 | 'CGC': 'R', | |
26 | 'CGG': 'R', | |
27 | 'CGT': 'R', | |
28 | 'AAC': 'N', | |
29 | 'AAT': 'N', | |
30 | 'GAC': 'D', | |
31 | 'GAT': 'D', | |
32 | 'TGC': 'C', | |
33 | 'TGT': 'C', | |
34 | 'GAA': 'E', | |
35 | 'GAG': 'E', | |
36 | 'CAA': 'Q', | |
37 | 'CAG': 'Q', | |
38 | 'GGA': 'G', | |
39 | 'GGC': 'G', | |
40 | 'GGG': 'G', | |
41 | 'GGT': 'G', | |
42 | 'CAC': 'H', | |
43 | 'CAT': 'H', | |
44 | 'ATA': 'I', | |
45 | 'ATC': 'I', | |
46 | 'ATT': 'I', | |
47 | 'TTA': 'L', | |
48 | 'TTG': 'L', | |
49 | 'CTA': 'L', | |
50 | 'CTC': 'L', | |
51 | 'CTG': 'L', | |
52 | 'CTT': 'L', | |
53 | 'AAA': 'K', | |
54 | 'AAG': 'K', | |
55 | 'ATG': 'M', | |
56 | 'TTC': 'F', | |
57 | 'TTT': 'F', | |
58 | 'CCA': 'P', | |
59 | 'CCC': 'P', | |
60 | 'CCG': 'P', | |
61 | 'CCT': 'P', | |
62 | 'AGC': 'S', | |
63 | 'AGT': 'S', | |
64 | 'TCA': 'S', | |
65 | 'TCC': 'S', | |
66 | 'TCG': 'S', | |
67 | 'TCT': 'S', | |
68 | 'ACA': 'T', | |
69 | 'ACC': 'T', | |
70 | 'ACG': 'T', | |
71 | 'ACT': 'T', | |
72 | 'TGG': 'W', | |
73 | 'TAC': 'Y', | |
74 | 'TAT': 'Y', | |
75 | 'GTA': 'V', | |
76 | 'GTC': 'V', | |
77 | 'GTG': 'V', | |
78 | 'GTT': 'V', | |
79 | 'TAA': '*', | |
80 | 'TAG': '*', | |
81 | 'TGA': '*'} | |
82 | ||
83 | ||
84 | redundant_nts = { | |
85 | 'R': ('A', 'G'), | |
86 | 'Y': ('C', 'T'), | |
87 | 'S': ('C', 'G'), | |
88 | 'W': ('A', 'T'), | |
89 | 'K': ('G', 'T'), | |
90 | 'M': ('A', 'C'), | |
91 | 'B': ('C', 'G', 'T'), | |
92 | 'D': ('A', 'G', 'T'), | |
93 | 'H': ('A', 'C', 'T'), | |
94 | 'V': ('A', 'C', 'G'), | |
95 | 'N': ('A', 'C', 'G', 'T') | |
96 | } | |
97 | ||
98 | def file_reader(fname, read_quals=False): | |
99 | '''Iterates over a FASTA or FASTQ file, yielding the next sequence in the file until there are no more sequences''' | |
100 | f = utils.open_file_read(fname) | |
101 | line = f.readline() | |
102 | phylip_regex = re.compile('^\s*[0-9]+\s+[0-9]+$') | |
103 | gbk_regex = re.compile('^LOCUS\s+\S') | |
104 | ||
105 | if line.startswith('>'): | |
106 | seq = Fasta() | |
107 | previous_lines[f] = line | |
108 | elif line.startswith('##gff-version 3'): | |
109 | seq = Fasta() | |
110 | # if a GFF file, need to skip past all the annotation | |
111 | # and get to the fasta sequences at the end of the file | |
112 | while not line.startswith('>'): | |
113 | line = f.readline() | |
114 | if not line: | |
115 | utils.close(f) | |
116 | raise Error('No sequences found in GFF file "' + fname + '"') | |
117 | ||
118 | seq = Fasta() | |
119 | previous_lines[f] = line | |
120 | elif line.startswith('ID ') and line[5] != ' ': | |
121 | seq = Embl() | |
122 | previous_lines[f] = line | |
123 | elif gbk_regex.search(line): | |
124 | seq = Embl() | |
125 | previous_lines[f] = line | |
126 | elif line.startswith('@'): | |
127 | seq = Fastq() | |
128 | previous_lines[f] = line | |
129 | elif phylip_regex.search(line): | |
130 | # phylip format could be interleaved or not, need to look at next | |
131 | # couple of lines to figure that out. Don't expect these files to | |
132 | # be too huge, so just store all the sequences in memory | |
133 | number_of_seqs, bases_per_seq = line.strip().split() | |
134 | number_of_seqs = int(number_of_seqs) | |
135 | bases_per_seq = int(bases_per_seq) | |
136 | got_blank_line = False | |
137 | ||
138 | first_line = line | |
139 | seq_lines = [] | |
140 | while 1: | |
141 | line = f.readline() | |
142 | if line == '': | |
143 | break | |
144 | elif line == '\n': | |
145 | got_blank_line = True | |
146 | else: | |
147 | seq_lines.append(line.rstrip()) | |
148 | utils.close(f) | |
149 | ||
150 | if len(seq_lines) == 1 or len(seq_lines) == number_of_seqs: | |
151 | sequential = True | |
152 | elif seq_lines[0][10] != ' ' and seq_lines[1][10] == ' ': | |
153 | sequential = True | |
154 | else: | |
155 | sequential = False | |
156 | ||
157 | # if the 11th char of second sequence line is a space, then the file is sequential, e.g.: | |
158 | # GAGCCCGGGC AATACAGGGT AT | |
159 | # as opposed to: | |
160 | # Salmo gairAAGCCTTGGC AGTGCAGGGT | |
161 | if sequential: | |
162 | current_id = None | |
163 | current_seq = '' | |
164 | for line in seq_lines: | |
165 | if len(current_seq) == bases_per_seq or len(current_seq) == 0: | |
166 | if current_id is not None: | |
167 | yield Fasta(current_id, current_seq.replace('-', '')) | |
168 | current_seq = '' | |
169 | current_id, new_bases = line[0:10].rstrip(), line.rstrip()[10:] | |
170 | else: | |
171 | new_bases = line.rstrip() | |
172 | ||
173 | current_seq += new_bases.replace(' ','') | |
174 | ||
175 | yield Fasta(current_id, current_seq.replace('-', '')) | |
176 | else: | |
177 | # seaview files start all seqs at pos >=12. Other files start | |
178 | # their sequence at the start of the line | |
179 | if seq_lines[number_of_seqs + 1][0] == ' ': | |
180 | first_gap_pos = seq_lines[0].find(' ') | |
181 | end_of_gap = first_gap_pos | |
182 | while seq_lines[0][end_of_gap] == ' ': | |
183 | end_of_gap += 1 | |
184 | first_seq_base = end_of_gap | |
185 | else: | |
186 | first_seq_base = 10 | |
187 | ||
188 | seqs = [] | |
189 | for i in range(number_of_seqs): | |
190 | name, bases = seq_lines[i][0:first_seq_base].rstrip(), seq_lines[i][first_seq_base:] | |
191 | seqs.append(Fasta(name, bases)) | |
192 | ||
193 | for i in range(number_of_seqs, len(seq_lines)): | |
194 | seqs[i%number_of_seqs].seq += seq_lines[i] | |
195 | ||
196 | for fa in seqs: | |
197 | fa.seq = fa.seq.replace(' ','').replace('-','') | |
198 | yield fa | |
199 | ||
200 | return | |
201 | elif line == '': | |
202 | utils.close(f) | |
203 | return | |
204 | else: | |
205 | utils.close(f) | |
206 | raise Error('Error determining file type from file "' + fname + '". First line is:\n' + line.rstrip()) | |
207 | ||
208 | try: | |
209 | while seq.get_next_from_file(f, read_quals): | |
210 | yield seq | |
211 | finally: | |
212 | utils.close(f) | |
213 | ||
214 | ||
215 | class Fasta: | |
216 | '''Class to store and manipulate FASTA sequences. They have two things: a name and a sequence''' | |
217 | # this defines the line length when printing sequences | |
218 | line_length = 60 | |
219 | ||
220 | def _get_id_from_header_line(self, line): | |
221 | if line.startswith('>'): | |
222 | return line.rstrip()[1:] | |
223 | else: | |
224 | raise Error('Error! expected line starting with ">", but got this:\n', line) | |
225 | ||
226 | ||
227 | def __init__(self, id_in=None, seq_in=None): | |
228 | self.id = id_in | |
229 | self.seq = seq_in | |
230 | ||
231 | def __eq__(self, other): | |
232 | return type(other) is type(self) and self.__dict__ == other.__dict__ | |
233 | ||
234 | def __ne__(self, other): | |
235 | return not self.__eq__(other) | |
236 | ||
237 | def __len__(self): | |
238 | return len(self.seq) | |
239 | ||
240 | def split_capillary_id(self): | |
241 | '''Gets the prefix and suffix of an name of a capillary read, e.g. xxxxx.p1k or xxxx.q1k. Returns a tuple (prefix, suffx)''' | |
242 | try: | |
243 | a = self.id.rsplit('.', 1) | |
244 | if a[1].startswith('p'): | |
245 | dir = 'fwd' | |
246 | elif a[1].startswith('q'): | |
247 | dir = 'rev' | |
248 | else: | |
249 | dir = 'unk' | |
250 | ||
251 | return {'prefix': a[0], 'dir': dir, 'suffix':a[1]} | |
252 | except: | |
253 | raise Error('Error in split_capillary_id() on ID', self.id) | |
254 | ||
255 | def expand_nucleotides(self): | |
256 | '''Assumes sequence is nucleotides. Returns list of all combinations of redundant nucleotides. e.g. R is A or G, so CRT would have combinations CAT and CGT''' | |
257 | s = list(self.seq) | |
258 | for i in range(len(s)): | |
259 | if s[i] in redundant_nts: | |
260 | s[i] = ''.join(redundant_nts[s[i]]) | |
261 | ||
262 | seqs = [] | |
263 | for x in itertools.product(*s): | |
264 | seqs.append(Fasta(self.id + '.' + str(len(seqs) + 1), ''.join(x))) | |
265 | return seqs | |
266 | ||
267 | def strip_after_first_whitespace(self): | |
268 | '''Removes everything in the name after the first whitespace character''' | |
269 | self.id = self.id.split()[0] | |
270 | ||
271 | def strip_illumina_suffix(self): | |
272 | '''Removes any trailing /1 or /2 off the end of the name''' | |
273 | if self.id.endswith('/1') or self.id.endswith('/2'): | |
274 | self.id = self.id[:-2] | |
275 | ||
276 | def revcomp(self): | |
277 | '''Reverse complements the sequence''' | |
278 | self.seq = self.seq.translate(str.maketrans("ATCGatcg", "TAGCtagc"))[::-1] | |
279 | ||
280 | def is_all_Ns(self, start=0, end=None): | |
281 | '''Returns true if the sequence is all Ns (upper or lower case)''' | |
282 | if end is not None: | |
283 | if start > end: | |
284 | raise Error('Error in is_all_Ns. Start coord must be <= end coord') | |
285 | end += 1 | |
286 | else: | |
287 | end = len(self) | |
288 | ||
289 | if len(self) == 0: | |
290 | return False | |
291 | else: | |
292 | return re.search('[^Nn]', self.seq[start:end]) is None | |
293 | ||
294 | def trim_Ns(self): | |
295 | '''Removes any leading or trailing N or n characters from the sequence''' | |
296 | self.seq = self.seq.strip('Nn') | |
297 | ||
298 | def add_insertions(self, skip=10, window=1, test=False): | |
299 | '''Adds a random base within window bases around every skip bases. e.g. skip=10, window=1 means a random base added somwhere in theintervals [9,11], [19,21] ... ''' | |
300 | assert 2 * window < skip | |
301 | new_seq = list(self.seq) | |
302 | for i in range(len(self) - skip, 0, -skip): | |
303 | pos = random.randrange(i - window, i + window + 1) | |
304 | base = random.choice(['A', 'C', 'G', 'T']) | |
305 | if test: | |
306 | base = 'N' | |
307 | new_seq.insert(pos, base) | |
308 | ||
309 | self.seq = ''.join(new_seq) | |
310 | ||
311 | def replace_bases(self, old, new): | |
312 | '''Replaces all occurences of 'old' with 'new' ''' | |
313 | self.seq = self.seq.replace(old, new) | |
314 | ||
315 | def replace_interval(self, start, end, new): | |
316 | '''Replaces the sequence from start to end with the sequence "new"''' | |
317 | if start > end or start > len(self) - 1 or end > len(self) - 1: | |
318 | raise Error('Error replacing bases ' + str(start) + '-' + str(end) + ' in sequence ' + self.id) | |
319 | ||
320 | self.seq = self.seq[0:start] + new + self.seq[end + 1:] | |
321 | ||
322 | def gaps(self, min_length = 1): | |
323 | '''Finds the positions of all gaps in the sequence that are at least min_length long. Returns a list of Intervals. Coords are zero-based''' | |
324 | gaps = [] | |
325 | regex = re.compile('N+', re.IGNORECASE) | |
326 | for m in regex.finditer(self.seq): | |
327 | if m.span()[1] - m.span()[0] + 1 >= min_length: | |
328 | gaps.append(intervals.Interval(m.span()[0], m.span()[1] - 1)) | |
329 | return gaps | |
330 | ||
331 | def contig_coords(self): | |
332 | '''Finds coords of contigs, i.e. everything that's not a gap (N or n). Returns a list of Intervals. Coords are zero-based''' | |
333 | # contigs are the opposite of gaps, so work out the coords from the gap coords | |
334 | gaps = self.gaps() | |
335 | ||
336 | if len(gaps) == 0: | |
337 | return [intervals.Interval(0, len(self) - 1)] | |
338 | ||
339 | coords = [0] | |
340 | for g in gaps: | |
341 | if g.start == 0: | |
342 | coords = [g.end + 1] | |
343 | else: | |
344 | coords += [g.start - 1, g.end + 1] | |
345 | ||
346 | if coords[-1] < len(self): | |
347 | coords.append(len(self) - 1) | |
348 | ||
349 | return [intervals.Interval(coords[i], coords[i+1]) for i in range(0, len(coords)-1,2)] | |
350 | ||
351 | ||
352 | ||
353 | ||
354 | def orfs(self, frame=0, revcomp=False): | |
355 | assert frame in [0,1,2] | |
356 | if revcomp: | |
357 | self.revcomp() | |
358 | ||
359 | aa_seq = self.translate(frame=frame).seq.rstrip('X') | |
360 | if revcomp: | |
361 | self.revcomp() | |
362 | ||
363 | orfs = _orfs_from_aa_seq(aa_seq) | |
364 | for i in range(len(orfs)): | |
365 | if revcomp: | |
366 | start = len(self) - (orfs[i].end * 3 + 3) - frame | |
367 | end = len(self) - (orfs[i].start * 3) - 1 - frame | |
368 | else: | |
369 | start = orfs[i].start * 3 + frame | |
370 | end = orfs[i].end * 3 + 2 + frame | |
371 | ||
372 | orfs[i] = intervals.Interval(start, end) | |
373 | ||
374 | return orfs | |
375 | ||
376 | ||
377 | def all_orfs(self, min_length=300): | |
378 | orfs = [] | |
379 | for frame in [0,1,2]: | |
380 | for revcomp in [False, True]: | |
381 | orfs.extend([(t, revcomp) for t in self.orfs(frame=frame, revcomp=revcomp) if len(t)>=min_length]) | |
382 | ||
383 | return sorted(orfs, key=lambda t:t[0]) | |
384 | ||
385 | # Fills the object with the next sequence in the file. Returns | |
386 | # True if this was successful, False if no more sequences in the file. | |
387 | # If reading a file of quality scores, set read_quals = True | |
388 | def get_next_from_file(self, f, read_quals=False): | |
389 | if f in previous_lines: | |
390 | if previous_lines[f] == None: | |
391 | self.id = self.seq = None | |
392 | return False | |
393 | else: | |
394 | self.id = self._get_id_from_header_line(previous_lines[f]) | |
395 | else: | |
396 | line = '\n' | |
397 | while line == '\n': | |
398 | line = f.readline() | |
399 | self.id = self._get_id_from_header_line(line) | |
400 | ||
401 | self.seq = '' | |
402 | seq_lines = [] # much faster to store the seq lines in an array, | |
403 | # then join at the end | |
404 | ||
405 | while 1: | |
406 | line = f.readline() | |
407 | ||
408 | if line.startswith('>'): | |
409 | previous_lines[f] = line.rstrip() | |
410 | break | |
411 | elif line == '': | |
412 | previous_lines[f] = None | |
413 | break | |
414 | else: | |
415 | seq_lines.append(line.rstrip()) | |
416 | ||
417 | if read_quals: | |
418 | self.seq = ' '.join(seq_lines) | |
419 | else: | |
420 | self.seq = ''.join(seq_lines) | |
421 | return True | |
422 | ||
423 | def __str__(self): | |
424 | if Fasta.line_length == 0: | |
425 | return '>' + self.id + '\n' + self.seq | |
426 | else: | |
427 | return '>' + self.id + '\n' + '\n'.join(self.seq[i:i+Fasta.line_length] for i in range(0, len(self), Fasta.line_length)) | |
428 | ||
429 | def __getitem__(self, index): | |
430 | return self.seq[index] | |
431 | ||
432 | def trim(self, start, end): | |
433 | '''Removes first 'start'/'end' bases off the start/end of the sequence''' | |
434 | self.seq = self.seq[start:len(self.seq) - end] | |
435 | ||
436 | # qual_scores should be a list of quality scores | |
437 | def to_Fastq(self, qual_scores): | |
438 | '''Returns a Fastq object. qual_scores expected to be a list of numbers, like you would get in a .qual file''' | |
439 | if len(self) != len(qual_scores): | |
440 | raise Error('Error making Fastq from Fasta, lengths differ.', self.id) | |
441 | return Fastq(self.id, self.seq, ''.join([chr(max(0, min(x, 93)) + 33) for x in qual_scores])) | |
442 | ||
443 | def search(self, search_string): | |
444 | '''Finds every occurence (including overlapping ones) of the search_string, including on the reverse strand. Returns a list where each element is a tuple (position, strand) where strand is in ['-', '+']. Positions are zero-based''' | |
445 | seq = self.seq.upper() | |
446 | search_string = search_string.upper() | |
447 | pos = 0 | |
448 | found = seq.find(search_string, pos) | |
449 | hits = [] | |
450 | ||
451 | while found != -1: | |
452 | hits.append((found, '+')) | |
453 | pos = found + 1 | |
454 | found = seq.find(search_string, pos) | |
455 | ||
456 | ||
457 | pos = 0 | |
458 | search_string = Fasta('x', search_string) | |
459 | search_string.revcomp() | |
460 | search_string = search_string.seq | |
461 | found = seq.find(search_string, pos) | |
462 | ||
463 | while found != -1: | |
464 | hits.append((found, '-')) | |
465 | pos = found + 1 | |
466 | found = seq.find(search_string, pos) | |
467 | ||
468 | return hits | |
469 | ||
470 | def translate(self, frame=0): | |
471 | '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2''' | |
472 | return Fasta(self.id, ''.join([codon2aa.get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)])) | |
473 | ||
474 | ||
475 | class Embl(Fasta): | |
476 | '''Exactly the same as Fasta, but reading seqs from a file works differently''' | |
477 | def __eq__(self, other): | |
478 | return type(other) in [Fasta, Embl] and type(self) in [Fasta, Embl] and self.__dict__ == other.__dict__ | |
479 | ||
480 | def _get_id_from_header_line(self, line): | |
481 | if line.startswith('ID ') and line[5] != ' ': | |
482 | return line.split()[1].rstrip(';') | |
483 | elif line.startswith('LOCUS'): | |
484 | return line.split()[1] | |
485 | else: | |
486 | raise Error('Error! expected line starting with "ID" or "LOCUS", but got this:\n', line) | |
487 | ||
488 | def get_next_from_file(self, f, read_quals=False): | |
489 | if f in previous_lines: | |
490 | line = '' | |
491 | if previous_lines[f] == None: | |
492 | self.id = self.seq = None | |
493 | return False | |
494 | else: | |
495 | self.id = self._get_id_from_header_line(previous_lines[f]) | |
496 | else: | |
497 | line = '\n' | |
498 | while line == '\n': | |
499 | line = f.readline() | |
500 | self.id = self._get_id_from_header_line(line) | |
501 | ||
502 | self.seq = '' | |
503 | seq_lines = [] | |
504 | ||
505 | while not (line.startswith('SQ') or line.rstrip() == 'ORIGIN'): | |
506 | line = f.readline() | |
507 | if line == '': | |
508 | raise Error('Error! No SQ or ORIGIN line found for sequence ' + self.id) | |
509 | ||
510 | line = f.readline() | |
511 | ||
512 | while not line.startswith('//'): | |
513 | if line == '' or line[0] != ' ': | |
514 | raise Error('Error! Did not find end of sequence ' + self.id) | |
515 | seq_lines.append(''.join(line.rstrip().strip(' 0123456789').split())) | |
516 | line = f.readline() | |
517 | ||
518 | ||
519 | while 1: | |
520 | if line.startswith('ID') or line.startswith('LOCUS'): | |
521 | previous_lines[f] = line.rstrip() | |
522 | break | |
523 | elif line == '': | |
524 | previous_lines[f] = None | |
525 | break | |
526 | ||
527 | line = f.readline() | |
528 | ||
529 | self.seq = ''.join(seq_lines) | |
530 | return True | |
531 | ||
532 | class Fastq(Fasta): | |
533 | '''Class to store and manipulate FASTQ sequences. They have three things: a name, sequence and string of quality scores''' | |
534 | def __init__(self, id_in=None, seq_in=None, qual_in=None): | |
535 | super().__init__(id_in, seq_in) | |
536 | self.qual = qual_in | |
537 | if (not self.seq == self.qual == None) and len(self.qual) != len(self.seq): | |
538 | raise Error('Error constructing Fastq. Mismatch in sequence and quality length\n' + str(self)) | |
539 | ||
540 | def __str__(self): | |
541 | return '@' + self.id + '\n' + self.seq + '\n+\n' + self.qual | |
542 | ||
543 | def __eq__(self, other): | |
544 | return type(other) is type(self) and self.__dict__ == other.__dict__ | |
545 | ||
546 | def get_next_from_file(self, f, read_quals=False): | |
547 | if f in previous_lines: | |
548 | line = previous_lines[f] | |
549 | del previous_lines[f] | |
550 | else: | |
551 | line = f.readline() | |
552 | ||
553 | while line == '\n': | |
554 | line = f.readline() | |
555 | ||
556 | if not line: | |
557 | self = Fastq('', '', '') | |
558 | return False | |
559 | ||
560 | if not line.startswith('@'): | |
561 | raise Error('Error getting next sequence from fastq file. Got line:\n' + line) | |
562 | ||
563 | self.id = line.rstrip()[1:] | |
564 | line = f.readline() | |
565 | if not line: | |
566 | raise Error('Error getting next sequence from fastq file, sequence has ID ' + self.id) | |
567 | ||
568 | self.seq = line.strip() | |
569 | ||
570 | line = f.readline() | |
571 | if not (line and line.startswith('+')): | |
572 | raise Error('Error getting next sequence from fastq file, no line starting with +, sequence has ID ' + self.id) | |
573 | ||
574 | line = f.readline() | |
575 | if not line: | |
576 | raise Error('Error getting next sequence from fastq file, sequence has ID ' + self.id) | |
577 | ||
578 | self.qual = line.rstrip() | |
579 | return True | |
580 | ||
581 | def revcomp(self): | |
582 | '''Reverse complements the sequence''' | |
583 | super().revcomp() | |
584 | self.qual = self.qual[::-1] | |
585 | ||
586 | def trim(self, start, end): | |
587 | '''Removes first 'start'/'end' bases off the start/end of the sequence''' | |
588 | super().trim(start, end) | |
589 | self.qual = self.qual[start:len(self.qual) - end] | |
590 | ||
591 | def to_Fasta_and_qual(self): | |
592 | quals = [ord(x) - 33 for x in self.qual] | |
593 | return (Fasta(self.id, self.seq), quals) | |
594 | ||
595 | def expand_nucleotides(self): | |
596 | return [Fastq(x.id, x.seq, self.qual) for x in super().expand_nucleotides()] | |
597 | ||
598 | def trim_Ns(self): | |
599 | '''Removes any leading or trailing N or n characters from the sequence''' | |
600 | # get index of first base that is not an N | |
601 | i = 0 | |
602 | while i < len(self) and self.seq[i] in 'nN': | |
603 | i += 1 | |
604 | ||
605 | # strip off start of sequence and quality | |
606 | self.seq = self.seq[i:] | |
607 | self.qual = self.qual[i:] | |
608 | ||
609 | # strip the ends | |
610 | self.seq = self.seq.rstrip('Nn') | |
611 | self.qual = self.qual[:len(self.seq)] | |
612 | ||
613 | def replace_interval(self, start, end, new, qual_string): | |
614 | '''Replaces the sequence from start to end with the sequence "new"''' | |
615 | if len(new) != len(qual_string): | |
616 | raise Error('Length of new seq and qual string in replace_interval() must be equal. Cannot continue') | |
617 | super().replace_interval(start, end, new) | |
618 | self.qual = self.qual[0:start] + qual_string + self.qual[end + 1:] | |
619 | ||
620 | def translate(self): | |
621 | '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2''' | |
622 | fa = super().translate() | |
623 | return Fastq(fa.id, fa.seq, 'I'*len(fa.seq)) | |
624 | ||
625 | ||
def _orfs_from_aa_seq(seq):
    '''Returns a list of intervals.Interval covering the ORFs in an amino acid
    string, splitting on stop codons ('*'). The stop position (or the final
    residue, if the string does not end in a stop) is included in the interval.'''
    found = []
    position = 0
    while position < len(seq):
        stop = seq.find('*', position)
        if stop == -1:
            # no more stops: the rest of the string is one ORF
            found.append(intervals.Interval(position, len(seq) - 1))
            break
        if stop > position:
            found.append(intervals.Interval(position, stop))
        position = stop + 1
    return found
0 | import re | |
1 | import sys | |
2 | import copy | |
3 | import random | |
4 | import numpy | |
5 | from fastaq import sequences, utils | |
6 | ||
# Exception type raised by the functions in this module, so that callers can
# distinguish expected task failures from other exceptions
class Error (Exception): pass
8 | ||
def capillary_to_pairs(infile, outprefix):
    '''Pairs up capillary reads by the prefix of their names. Pairs go to
    outprefix.paired.gz (fwd renamed to prefix/1, rev to prefix/2); everything
    else goes to outprefix.unpaired.gz. Where an end was sequenced more than
    once, only the longest read is kept.'''
    fwd_reads = {}
    rev_reads = {}
    unpaired_reads = {}

    for seq in sequences.file_reader(infile):
        info = seq.split_capillary_id()
        direction = info['dir']
        if direction == 'fwd':
            seq.id = info['prefix'] + '/1'
            dest = fwd_reads
        elif direction == 'rev':
            seq.id = info['prefix'] + '/2'
            dest = rev_reads
        else:
            seq.id = info['prefix']
            dest = unpaired_reads

        key = info['prefix']
        # keep only the longest read seen for this end
        if key not in dest or len(dest[key]) < len(seq):
            dest[key] = copy.copy(seq)

    # write the output files
    f_paired = utils.open_file_write(outprefix + '.paired.gz')
    f_unpaired = utils.open_file_write(outprefix + '.unpaired.gz')

    for key in fwd_reads:
        if key in rev_reads:
            print(fwd_reads[key], file=f_paired)
            print(rev_reads[key], file=f_paired)
            del rev_reads[key]
        else:
            print(fwd_reads[key], file=f_unpaired)

    # anything left in rev_reads has no forward mate
    for seq in rev_reads.values():
        print(seq, file=f_unpaired)

    for seq in unpaired_reads.values():
        print(seq, file=f_unpaired)

    utils.close(f_paired)
    utils.close(f_unpaired)
53 | ||
54 | ||
def count_sequences(infile):
    '''Returns the number of sequences in a file'''
    return sum(1 for _ in sequences.file_reader(infile))
62 | ||
63 | ||
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    '''Splits an interleaved file into two files, one read of each pair per
    file. If fasta_out is True, output is written as FASTA regardless of the
    input format. Raises Error if a read has no mate.'''
    seq_reader = sequences.file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    f_2 = utils.open_file_write(outfile_2)

    def _write(record, fh):
        # write as-is, or stripped down to FASTA if requested
        if fasta_out:
            print(sequences.Fasta(record.id, record.seq), file=fh)
        else:
            print(record, file=fh)

    for seq in seq_reader:
        _write(seq, f_1)
        try:
            mate = next(seq_reader)
        except StopIteration:
            utils.close(f_1)
            utils.close(f_2)
            raise Error('Error getting mate for sequence. Cannot continue')
        _write(mate, f_2)

    utils.close(f_1)
    utils.close(f_2)
86 | ||
87 | ||
def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False, rename_file=None):
    '''Renames the sequences 1, 2, 3, ... (starting from start_index).
    If keep_illumina_suffix is True, a trailing /1 or /2 is preserved on the
    new name. If rename_file is given, a TSV of old -> new names is written.'''
    suffixes_to_keep = ['/1', '/2'] if keep_illumina_suffix else []
    fout_seqs = utils.open_file_write(outfile)

    fout_rename = None
    if rename_file is not None:
        fout_rename = utils.open_file_write(rename_file)
        print('#old\tnew', file=fout_rename)

    counter = start_index
    for seq in sequences.file_reader(infile):
        original_name = seq.id
        new_name = str(counter)

        for suffix in suffixes_to_keep:
            if original_name.endswith(suffix):
                new_name += suffix
                break

        seq.id = new_name
        if fout_rename is not None:
            print(original_name, new_name, sep='\t', file=fout_rename)

        print(seq, file=fout_seqs)
        counter += 1

    utils.close(fout_seqs)
    if fout_rename is not None:
        utils.close(fout_rename)
122 | ||
123 | ||
def expand_nucleotides(infile, outfile):
    '''Writes a new file with each sequence expanded into every unambiguous
    version of itself (one output sequence per combination of IUPAC codes)'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        seqs = seq.expand_nucleotides()
        if len(seqs) > 1:
            for s in seqs:
                print(s, file=fout)
        else:
            print(seq, file=fout)

    # bugfix: the output handle was never closed, which can truncate
    # gzipped or buffered output
    utils.close(fout)
135 | ||
136 | ||
def extend_gaps(infile, outfile, trim):
    '''Extends each gap in each sequence by 'trim' bases on either side, then
    trims 'trim' bases off both ends of the sequence. Sequences shorter than
    2*trim, or left with no non-N bases, are dropped from the output.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    # hoisted out of the loop: was recompiled once per sequence
    non_n_regex = re.compile('[^nN]')

    for seq in seq_reader:
        if len(seq) < 2 * trim:
            continue

        gaps = seq.gaps()
        bases = list(seq.seq)

        # extend the length of each gap
        for gap in gaps:
            left_start = max(gap.start - trim, 0)
            right_end = min(gap.end + trim + 1, len(seq))

            for i in range(left_start, gap.start):
                bases[i] = 'N'

            for i in range(gap.end, right_end):
                bases[i] = 'N'

        seq.seq = ''.join(bases)

        # trim start/end bases and tidy up any resulting Ns at either end of the trimmed seq
        seq.trim(trim, trim)
        seq.trim_Ns()

        # check that there is some non-N sequence left over
        if non_n_regex.search(seq.seq) is not None:
            print(seq, file=fout)

    utils.close(fout)
171 | ||
172 | ||
def fastaq_to_fake_qual(infile, outfile, q=40):
    '''Writes a .qual file giving every base of every sequence quality q.
    Lines are wrapped to sequences.Fasta.line_length scores per line
    (one long line per sequence if line_length is 0).'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    wrap = sequences.Fasta.line_length

    for seq in seq_reader:
        print('>' + seq.id, file=fout)
        if wrap == 0:
            print(' '.join([str(q)] * len(seq)), file=fout)
        else:
            for start in range(0, len(seq), wrap):
                scores_on_line = min(wrap, len(seq) - start)
                print(' '.join([str(q)] * scores_on_line), file=fout)

    utils.close(fout)
186 | ||
187 | ||
def fasta_to_fastq(fasta_in, qual_in, outfile):
    '''Merges a FASTA file and its matching .qual file into one FASTQ file.
    Raises Error if the sequence names do not match up.'''
    fa_reader = sequences.file_reader(fasta_in)
    qual_reader = sequences.file_reader(qual_in, read_quals=True)
    f_out = utils.open_file_write(outfile)

    for seq in fa_reader:
        qual = next(qual_reader)
        if seq.id != qual.id:
            utils.close(f_out)
            raise Error('Mismatch in names from fasta and qual file', seq.id, qual.id)

        # qual records hold whitespace-separated integer scores
        scores = [int(x) for x in qual.seq.split()]
        print(seq.to_Fastq(scores), file=f_out)

    utils.close(f_out)
203 | ||
204 | ||
def fastaq_to_mira_xml(infile, outfile):
    '''Writes a MIRA xml trace file with one trace entry per input sequence'''
    f_out = utils.open_file_write(outfile)
    print('<?xml version="1.0"?>', '<trace_volume>', sep='\n', file=f_out)

    for seq in sequences.file_reader(infile):
        # one <trace> element per sequence, clipped to its full length
        print(' <trace>',
              ' <trace_name>' + seq.id + '</trace_name>',
              ' <clip_quality_right>' + str(len(seq)) + '</clip_quality_right>',
              ' <clip_vector_left>1</clip_vector_left>',
              ' </trace>', sep='\n', file=f_out)

    print('</trace_volume>', file=f_out)
    utils.close(f_out)
220 | ||
221 | ||
def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
    '''Writes a GFF file of open reading frames of at least min_length bases,
    one CDS line per ORF, with 1-based coordinates'''
    fout = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        for coords, revcomp in seq.all_orfs(min_length=min_length):
            strand = '-' if revcomp else '+'
            print(seq.id, tool_name, 'CDS', coords.start+1, coords.end+1, '.', strand, '.', sep='\t', file=fout)

    utils.close(fout)
236 | ||
237 | ||
def file_to_dict(infile, d):
    '''Loads every sequence from infile into dictionary d, keyed by sequence id (later duplicates overwrite earlier ones)'''
    for seq in sequences.file_reader(infile):
        d[seq.id] = copy.copy(seq)
242 | ||
243 | ||
def filter(infile, outfile, minlength=0, maxlength=float('inf'), regex=None, ids_file=None, invert=False):
    '''Writes the sequences that pass all given conditions: length within
    [minlength, maxlength], id matching regex, id listed in ids_file.
    With invert=True the failing sequences are written instead.'''
    wanted_ids = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        wanted_ids.update(line.rstrip() for line in f)
        utils.close(f)

    compiled = re.compile(regex) if regex is not None else None
    f_out = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        passes = (
            minlength <= len(seq) <= maxlength
            and (compiled is None or compiled.search(seq.id) is not None)
            and (ids_file is None or seq.id in wanted_ids)
        )
        if passes != invert:
            print(seq, file=f_out)

    utils.close(f_out)
265 | ||
266 | ||
def get_ids(infile, outfile):
    '''Writes the id of every sequence in infile to outfile, one per line'''
    f_out = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        print(seq.id, file=f_out)
    utils.close(f_out)
273 | ||
274 | ||
def get_seqs_flanking_gaps(infile, outfile, left, right):
    '''Writes a TSV reporting, for every gap in every sequence, up to 'left'
    bases before the gap and up to 'right' bases after it (1-based gap coords)'''
    fout = utils.open_file_write(outfile)
    print('#id', 'gap_start', 'gap_end', 'left_bases', 'right_bases', sep='\t', file=fout)

    for seq in sequences.file_reader(infile):
        for gap in seq.gaps():
            # clamp the flanking windows to the ends of the sequence
            flank_start = max(gap.start - left, 0)
            flank_end = min(gap.end + right + 1, len(seq))
            print(seq.id,
                  gap.start + 1,
                  gap.end + 1,
                  seq.seq[flank_start:gap.start],
                  seq.seq[gap.end + 1:flank_end],
                  sep='\t', file=fout)

    utils.close(fout)
295 | ||
296 | ||
def interleave(infile_1, infile_2, outfile):
    '''Interleaves two sequence files into one, alternating a read from each.
    The files must contain the same number of sequences, in matching order;
    otherwise Error is raised.'''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    for seq_1 in seq_reader_1:
        # bugfix: was a bare except, which also swallowed KeyboardInterrupt
        # and genuine parsing errors; only end-of-iterator means "no mate"
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            utils.close(f_out)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        print(seq_1, file=f_out)
        print(seq_2, file=f_out)

    # the second file must also be exhausted now
    try:
        seq_2 = next(seq_reader_2)
    except StopIteration:
        seq_2 = None

    if seq_2 is not None:
        utils.close(f_out)
        raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')

    utils.close(f_out)
322 | ||
323 | ||
def make_random_contigs(contigs, length, outfile, name_by_letters=False, prefix='', seed=None, first_number=1):
    '''Makes a multi fasta file of random sequences, all the same length.
    Names are prefix + number (from first_number), or prefix + letter A-Z
    (cycling) when name_by_letters is True.'''
    random.seed(a=seed)
    fout = utils.open_file_write(outfile)
    letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    letters_index = 0

    for i in range(contigs):
        if name_by_letters:
            name = letters[letters_index]
            # wrap around after Z
            letters_index = (letters_index + 1) % len(letters)
        else:
            name = str(i + first_number)

        random_seq = ''.join(random.choice('ACGT') for _ in range(length))
        print(sequences.Fasta(prefix + name, random_seq), file=fout)

    utils.close(fout)
344 | ||
345 | ||
def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, tile_step=10000, gamma_shape=1.2, gamma_scale=6000, coverage=10, gamma_min_length=20000, seed=None, ins_skip=None, ins_window=None,):
    '''Simulates long reads from each sequence in infile, writing FASTA to outfile.

    method is one of:
      'tiling'  - reads of fixed_read_length starting every tile_step bases
      'gamma'   - read lengths drawn from a gamma(gamma_shape, gamma_scale)
                  distribution (rejected until >= gamma_min_length and <= the
                  sequence length), at random positions, until roughly
                  'coverage' coverage is reached
      'uniform' - reads of fixed_read_length at random positions, until
                  roughly 'coverage' coverage is reached

    ins_skip and ins_window must be given together (or both left as None) and
    add insertions to every read. Sequences too short for the chosen method
    are skipped with a warning on stderr. Read names are id_start_end with
    1-based inclusive coordinates.
    '''
    assert method in ['tiling', 'gamma', 'uniform']
    # ins_skip and ins_window are all-or-nothing
    assert ins_skip == ins_window == None or None not in [ins_skip, ins_window]
    if seed is not None:
        random.seed(a=seed)
    seq_reader = sequences.file_reader(infile)
    f = utils.open_file_write(outfile)

    for seq in seq_reader:
        if method == 'tiling':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            for i in range(0, len(seq), tile_step):
                # last read is shortened to end at the end of the sequence
                end = min(len(seq), i + fixed_read_length)
                fa = sequences.Fasta('_'.join([seq.id, str(i + 1), str(end)]), seq[i:end])
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
                if end >= len(seq):
                    break
        elif method == 'gamma':
            if len(seq) < gamma_min_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            # keep sampling reads until the target coverage is (nearly) met
            while total_read_length < coverage * len(seq) - 0.5 * gamma_min_length:
                read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
                # rejection-sample until the length fits the constraints
                while read_length < gamma_min_length or read_length > len(seq):
                    read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))

                start = random.randint(0, len(seq) - read_length)
                end = start + read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
        elif method == 'uniform':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            # fixed-length reads at uniformly random positions
            while total_read_length < coverage * len(seq) - 0.5 * fixed_read_length:
                start = random.randint(0, len(seq) - fixed_read_length)
                end = start + fixed_read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)


    utils.close(f)
400 | ||
401 | ||
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Takes a multi fasta or fastq file and writes a new file that contains just one sequence, with the original sequences catted together, preserving their order'''
    seqs = [copy.copy(seq) for seq in sequences.file_reader(infile)]
    combined_seq = ''.join(seq.seq for seq in seqs)

    if type(seqs[0]) == sequences.Fastq:
        # fastq input: concatenate the quality strings as well
        combined_qual = ''.join(seq.qual for seq in seqs)
        merged = sequences.Fastq(seqname, combined_seq, combined_qual)
    else:
        merged = sequences.Fasta(seqname, combined_seq)

    f = utils.open_file_write(outfile)
    print(merged, file=f)
    utils.close(f)
423 | ||
424 | ||
def reverse_complement(infile, outfile):
    '''Writes a new file with every sequence reverse complemented'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.revcomp()
        print(seq, file=fout)
    utils.close(fout)
434 | ||
435 | ||
def scaffolds_to_contigs(infile, outfile, number_contigs=False):
    '''Makes a file of contigs from scaffolds by splitting at every N.
    Use number_contigs=True to add .1, .2, etc onto end of each
    contig, instead of default to append coordinates.'''
    fout = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        for counter, contig in enumerate(seq.contig_coords(), start=1):
            if number_contigs:
                name = seq.id + '.' + str(counter)
            else:
                # default naming: id.start.end with 1-based coordinates
                name = '.'.join([seq.id, str(contig.start + 1), str(contig.end + 1)])
            print(sequences.Fasta(name, seq[contig.start:contig.end+1]), file=fout)

    utils.close(fout)
455 | ||
456 | ||
def search_for_seq(infile, outfile, search_string):
    '''Writes a TSV with one line per hit of search_string in each sequence:
    sequence id, 1-based hit position, and the second element of the hit
    as reported by the sequence's search() method'''
    fout = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        for hit in seq.search(search_string):
            print(seq.id, hit[0] + 1, hit[1], sep='\t', file=fout)

    utils.close(fout)
467 | ||
468 | ||
def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_length=50, check_revcomp=False):
    '''Trims any sequence from to_trim_file off the start of every read in a
    pair of files (and, if check_revcomp, its reverse complement off the end).
    Pairs where both reads remain at least min_length long are written to the
    output files; other pairs are dropped. Raises Error if a read in infile_1
    has no mate in infile_2.'''
    to_trim_seqs = {}
    file_to_dict(to_trim_file, to_trim_seqs)
    trim_seqs = [x.seq for x in to_trim_seqs.values()]
    if check_revcomp:
        for seq in to_trim_seqs.values():
            seq.revcomp()
        trim_seqs_revcomp = [x.seq for x in to_trim_seqs.values()]
    else:
        trim_seqs_revcomp = []

    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out_1 = utils.open_file_write(outfile_1)
    f_out_2 = utils.open_file_write(outfile_2)

    for seq_1 in seq_reader_1:
        # bugfix: this used a bare except that hid real errors, and the
        # handler closed an undefined name f_out (a NameError); close the
        # handles that actually exist and catch only StopIteration
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            utils.close(f_out_1)
            utils.close(f_out_2)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        for seq in seq_1, seq_2:
            # trim the first matching sequence off the start of the read
            for trim_seq in trim_seqs:
                if seq.seq.startswith(trim_seq):
                    seq.trim(len(trim_seq),0)
                    break

            # and the first matching revcomp sequence off the end
            for trim_seq in trim_seqs_revcomp:
                if seq.seq.endswith(trim_seq):
                    seq.trim(0,len(trim_seq))
                    break

        if len(seq_1) >= min_length and len(seq_2) >= min_length:
            print(seq_1, file=f_out_1)
            print(seq_2, file=f_out_2)


    utils.close(f_out_1)
    utils.close(f_out_2)
510 | ||
511 | ||
512 | ||
def translate(infile, outfile, frame=0):
    '''Writes a new file with every sequence translated into amino acids, starting at the given frame (0, 1 or 2)'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        print(seq.translate(frame=frame), file=fout)
    utils.close(fout)
521 | ||
522 | ||
def trim(infile, outfile, start, end):
    '''Writes a new file with 'start' bases removed from the start and 'end'
    bases removed from the end of every sequence. Sequences with nothing
    left are dropped.'''
    fout = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        seq.trim(start, end)
        if len(seq):
            print(seq, file=fout)

    utils.close(fout)
533 | ||
534 | ||
def trim_Ns_at_end(infile, outfile):
    '''Writes a new file with leading and trailing Ns removed from every
    sequence. Sequences with nothing left are dropped.'''
    fout = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        seq.trim_Ns()
        if len(seq):
            print(seq, file=fout)

    utils.close(fout)
545 | ||
546 | ||
def lengths_from_fai(fai_file, d):
    '''Reads a samtools .fai index file and fills dictionary d with sequence name -> length'''
    f = utils.open_file_read(fai_file)
    for line in f:
        # first two columns of a .fai line are the name and the length
        name, length = line.rstrip().split()[:2]
        d[name] = int(length)
    utils.close(f)
553 | ||
554 | ||
def split_by_base_count(infile, outfiles_prefix, max_bases, max_seqs=None):
    '''Splits a fasta/q file into separate files, file size determined by number of bases.

    Puts <= max_bases in each split file The exception is a single sequence >=max_bases
    is put in its own file. This does not split sequences.
    '''
    seq_reader = sequences.file_reader(infile)
    base_count = 0    # bases written so far to the current output file
    file_count = 1    # numeric suffix of the next output file to open
    seq_count = 0     # sequences written so far to the current output file
    fout = None
    if max_seqs is None:
        # no sequence-count limit: splitting is driven by max_bases only
        max_seqs = float('inf')

    for seq in seq_reader:
        if base_count == 0:
            # start a fresh output file
            fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
            file_count += 1

        if base_count + len(seq) > max_bases or seq_count >= max_seqs:
            if base_count == 0:
                # current file is empty and this one sequence already exceeds
                # max_bases: it gets the file to itself
                print(seq, file=fout)
                utils.close(fout)
            else:
                # current file is full: close it and start the next file
                # with this sequence
                utils.close(fout)
                fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                print(seq, file=fout)
                base_count = len(seq)
                file_count += 1
                seq_count = 1
        else:
            base_count += len(seq)
            seq_count += 1
            print(seq, file=fout)

    utils.close(fout)
591 | ||
592 | ||
def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits fasta/q file into separate files, with up to (chunk_size + tolerance)
    bases in each file. Long sequences are cut into chunk_size pieces (a final
    short piece within tolerance is merged into the previous chunk); a file
    outfiles_prefix.coords maps each chunk id to its parent sequence and offset.
    Sequences shorter than chunk_size are batched together into trailing files.
    With skip_if_all_Ns, sequences or chunks that are entirely Ns are dropped.'''
    file_count = 1
    small_sequences = []  # sequences shorter than chunk_size
    seq_reader = sequences.file_reader(infile)
    f_coords = utils.open_file_write(outfiles_prefix + '.coords')

    for seq in seq_reader:
        if skip_if_all_Ns and seq.is_all_Ns():
            continue
        if len(seq) < chunk_size:
            small_sequences.append(copy.copy(seq))
        elif len(seq) <= chunk_size + tolerance:
            # fits in one file as-is
            f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
            print(seq, file=f)
            utils.close(f)
            file_count += 1
        else:
            # make list of chunk coords
            chunks = [(x,x+chunk_size) for x in range(0, len(seq), chunk_size)]
            # clamp the last chunk to the end of the sequence. bugfix: the
            # test was "chunks[-1][1] - 1 > len(seq)", which let the reported
            # end coordinate overshoot the sequence length by one when
            # len(seq) % chunk_size == chunk_size - 1
            if chunks[-1][1] > len(seq):
                chunks[-1] = (chunks[-1][0], len(seq))
            # merge a too-short final chunk into the previous one
            if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) <= tolerance:
                chunks[-2] = (chunks[-2][0], chunks[-1][1])
                chunks.pop()

            # write one output file per chunk
            offset = 0
            for chunk in chunks:
                if not(skip_if_all_Ns and seq.is_all_Ns(start=chunk[0], end=chunk[1]-1)):
                    f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                    chunk_id = seq.id + ':' + str(chunk[0]+1) + '-' + str(chunk[1])
                    print(sequences.Fasta(chunk_id, seq[chunk[0]:chunk[1]]), file=f)
                    print(chunk_id, seq.id, offset, sep='\t', file=f_coords)
                    utils.close(f)
                    file_count += 1

                offset += chunk[1] - chunk[0]

    # write files of small sequences
    if len(small_sequences):
        f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
        file_count += 1
        base_count = 0
        for seq in small_sequences:
            if base_count > 0 and base_count + len(seq) > chunk_size + tolerance:
                utils.close(f)
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                file_count += 1
                base_count = 0

            print(seq, file=f)
            base_count += len(seq)

        utils.close(f)
649 | ||
650 | ||
def replace_bases(infile, outfile, old, new):
    '''Writes a new file with every occurrence of base 'old' replaced by 'new' in every sequence'''
    f_out = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.replace_bases(old, new)
        print(seq, file=f_out)
    utils.close(f_out)
660 | ||
661 | ||
def strip_illumina_suffix(infile, outfile):
    '''Writes a new file with the Illumina /1 or /2 suffix removed from every sequence name'''
    f_out = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.strip_illumina_suffix()
        print(seq, file=f_out)
    utils.close(f_out)
671 | ||
672 | ||
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False):
    '''Converts a sequence file to FASTA, wrapped to line_length bases per
    line (0 means no wrapping). FASTQ input loses its quality information.
    Optionally truncates each name at its first whitespace.'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if type(seq) == sequences.Fastq:
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # bugfix: always restore the class-level wrap length; previously an
        # exception mid-loop left the global setting changed for all callers
        sequences.Fasta.line_length = original_line_length

    utils.close(f_out)
690 | ||
691 | ||
def to_fasta_union(infile, outfile, seqname='union'):
    '''Writes a single-sequence FASTA file made by concatenating every sequence in infile, preserving their order'''
    pieces = [seq.seq for seq in sequences.file_reader(infile)]

    f_out = utils.open_file_write(outfile)
    print(sequences.Fasta(seqname, ''.join(pieces)), file=f_out)
    utils.close(f_out)
702 | ||
703 | ||
704 | ||
def to_unique_by_id(infile, outfile):
    '''Writes a new file with duplicate-named sequences removed, keeping the
    longest copy of each name. First-seen order of names is preserved and
    empty sequences are dropped.'''
    longest = {}
    names_in_order = []

    # hash the reads, keeping the longest one when the same name appears
    # more than once
    for seq in sequences.file_reader(infile):
        if len(seq) == 0:
            continue
        if seq.id not in longest:
            names_in_order.append(seq.id)
            longest[seq.id] = copy.copy(seq)
        elif len(longest[seq.id]) < len(seq):
            longest[seq.id] = copy.copy(seq)

    # write the output
    f_out = utils.open_file_write(outfile)
    for name in names_in_order:
        print(longest[name], file=f_out)
    utils.close(f_out)
0 | ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
1 | XX | |
2 | AC X56734; S46826; | |
3 | XX | |
4 | DT 12-SEP-1991 (Rel. 29, Created) | |
5 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
6 | XX | |
7 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
8 | XX | |
9 | KW beta-glucosidase. | |
10 | XX | |
11 | OS Trifolium repens (white clover) | |
12 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
13 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
14 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
15 | XX | |
16 | RN [5] | |
17 | RP 1-1859 | |
18 | RX DOI; 10.1007/BF00039495. | |
19 | RX PUBMED; 1907511. | |
20 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
21 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
22 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
23 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
24 | XX | |
25 | RN [6] | |
26 | RP 1-1859 | |
27 | RA Hughes M.A.; | |
28 | RT ; | |
29 | RL Submitted (19-NOV-1990) to the INSDC. | |
30 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
31 | RL Upon Tyne, NE2 4HH, UK | |
32 | XX | |
33 | DR EuropePMC; PMC99098; 11752244. | |
34 | XX | |
35 | FH Key Location/Qualifiers | |
36 | FH | |
37 | FT source 1..1859 | |
38 | FT /organism="Trifolium repens" | |
39 | FT /mol_type="mRNA" | |
40 | FT /clone_lib="lambda gt10" | |
41 | FT /clone="TRE361" | |
42 | FT /tissue_type="leaves" | |
43 | FT /db_xref="taxon:3899" | |
44 | FT mRNA 1..1859 | |
45 | FT /experiment="experimental evidence, no additional details | |
46 | FT recorded" | |
47 | FT CDS 14..1495 | |
48 | FT /product="beta-glucosidase" | |
49 | FT /EC_number="3.2.1.21" | |
50 | FT /note="non-cyanogenic" | |
51 | FT /db_xref="GOA:P26204" | |
52 | FT /db_xref="InterPro:IPR001360" | |
53 | FT /db_xref="InterPro:IPR013781" | |
54 | FT /db_xref="InterPro:IPR017853" | |
55 | FT /db_xref="InterPro:IPR018120" | |
56 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
57 | FT /protein_id="CAA40058.1" | |
58 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
59 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
60 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
61 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
62 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
63 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
64 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
65 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
66 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
67 | XX | |
68 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
69 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
70 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
71 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
72 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
73 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
74 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
75 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
76 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
77 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
78 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
79 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
80 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
81 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
82 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
83 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
84 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
85 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
86 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
87 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
88 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
89 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
90 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
91 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
92 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
93 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
94 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
95 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
96 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
97 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
98 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
99 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 | |
100 | // | |
101 | ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
102 | XX | |
103 | AC X56734; S46826; | |
104 | XX | |
105 | DT 12-SEP-1991 (Rel. 29, Created) | |
106 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
107 | XX | |
108 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
109 | XX | |
110 | KW beta-glucosidase. | |
111 | XX | |
112 | OS Trifolium repens (white clover) | |
113 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
114 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
115 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
116 | XX | |
117 | RN [5] | |
118 | RP 1-1859 | |
119 | RX DOI; 10.1007/BF00039495. | |
120 | RX PUBMED; 1907511. | |
121 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
122 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
123 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
124 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
125 | XX | |
126 | RN [6] | |
127 | RP 1-1859 | |
128 | RA Hughes M.A.; | |
129 | RT ; | |
130 | RL Submitted (19-NOV-1990) to the INSDC. | |
131 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
132 | RL Upon Tyne, NE2 4HH, UK | |
133 | XX | |
134 | DR EuropePMC; PMC99098; 11752244. | |
135 | XX | |
136 | FH Key Location/Qualifiers | |
137 | FH | |
138 | FT source 1..1859 | |
139 | FT /organism="Trifolium repens" | |
140 | FT /mol_type="mRNA" | |
141 | FT /clone_lib="lambda gt10" | |
142 | FT /clone="TRE361" | |
143 | FT /tissue_type="leaves" | |
144 | FT /db_xref="taxon:3899" | |
145 | FT mRNA 1..1859 | |
146 | FT /experiment="experimental evidence, no additional details | |
147 | FT recorded" | |
148 | FT CDS 14..1495 | |
149 | FT /product="beta-glucosidase" | |
150 | FT /EC_number="3.2.1.21" | |
151 | FT /note="non-cyanogenic" | |
152 | FT /db_xref="GOA:P26204" | |
153 | FT /db_xref="InterPro:IPR001360" | |
154 | FT /db_xref="InterPro:IPR013781" | |
155 | FT /db_xref="InterPro:IPR017853" | |
156 | FT /db_xref="InterPro:IPR018120" | |
157 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
158 | FT /protein_id="CAA40058.1" | |
159 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
160 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
161 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
162 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
163 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
164 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
165 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
166 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
167 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
168 | XX | |
169 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
170 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
171 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
172 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
173 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
174 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
175 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
176 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
177 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
178 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
179 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
180 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
181 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
182 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
183 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
184 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
185 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
186 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
187 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
188 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
189 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
190 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
191 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
192 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
193 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
194 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
195 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
196 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
197 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
198 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
199 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
200 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859 | |
201 | // | |
202 |
0 | ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
1 | XX | |
2 | AC X56734; S46826; | |
3 | XX | |
4 | DT 12-SEP-1991 (Rel. 29, Created) | |
5 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
6 | XX | |
7 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
8 | XX | |
9 | KW beta-glucosidase. | |
10 | XX | |
11 | OS Trifolium repens (white clover) | |
12 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
13 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
14 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
15 | XX | |
16 | RN [5] | |
17 | RP 1-1859 | |
18 | RX DOI; 10.1007/BF00039495. | |
19 | RX PUBMED; 1907511. | |
20 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
21 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
22 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
23 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
24 | XX | |
25 | RN [6] | |
26 | RP 1-1859 | |
27 | RA Hughes M.A.; | |
28 | RT ; | |
29 | RL Submitted (19-NOV-1990) to the INSDC. | |
30 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
31 | RL Upon Tyne, NE2 4HH, UK | |
32 | XX | |
33 | DR EuropePMC; PMC99098; 11752244. | |
34 | XX | |
35 | FH Key Location/Qualifiers | |
36 | FH | |
37 | FT source 1..1859 | |
38 | FT /organism="Trifolium repens" | |
39 | FT /mol_type="mRNA" | |
40 | FT /clone_lib="lambda gt10" | |
41 | FT /clone="TRE361" | |
42 | FT /tissue_type="leaves" | |
43 | FT /db_xref="taxon:3899" | |
44 | FT mRNA 1..1859 | |
45 | FT /experiment="experimental evidence, no additional details | |
46 | FT recorded" | |
47 | FT CDS 14..1495 | |
48 | FT /product="beta-glucosidase" | |
49 | FT /EC_number="3.2.1.21" | |
50 | FT /note="non-cyanogenic" | |
51 | FT /db_xref="GOA:P26204" | |
52 | FT /db_xref="InterPro:IPR001360" | |
53 | FT /db_xref="InterPro:IPR013781" | |
54 | FT /db_xref="InterPro:IPR017853" | |
55 | FT /db_xref="InterPro:IPR018120" | |
56 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
57 | FT /protein_id="CAA40058.1" | |
58 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
59 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
60 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
61 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
62 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
63 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
64 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
65 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
66 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
67 | XX | |
68 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
69 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
70 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
71 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
72 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
73 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
74 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
75 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
76 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
77 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
78 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
79 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
80 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
81 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
82 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
83 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
84 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
85 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
86 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
87 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
88 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
89 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
90 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
91 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
92 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
93 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
94 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
95 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
96 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
97 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
98 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
99 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 | |
100 | // | |
101 | ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
102 | XX | |
103 | AC X56734; S46826; | |
104 | XX | |
105 | DT 12-SEP-1991 (Rel. 29, Created) | |
106 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
107 | XX | |
108 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
109 | XX | |
110 | KW beta-glucosidase. | |
111 | XX | |
112 | OS Trifolium repens (white clover) | |
113 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
114 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
115 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
116 | XX | |
117 | RN [5] | |
118 | RP 1-1859 | |
119 | RX DOI; 10.1007/BF00039495. | |
120 | RX PUBMED; 1907511. | |
121 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
122 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
123 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
124 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
125 | XX | |
126 | RN [6] | |
127 | RP 1-1859 | |
128 | RA Hughes M.A.; | |
129 | RT ; | |
130 | RL Submitted (19-NOV-1990) to the INSDC. | |
131 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
132 | RL Upon Tyne, NE2 4HH, UK | |
133 | XX | |
134 | DR EuropePMC; PMC99098; 11752244. | |
135 | XX | |
136 | FH Key Location/Qualifiers | |
137 | FH | |
138 | FT source 1..1859 | |
139 | FT /organism="Trifolium repens" | |
140 | FT /mol_type="mRNA" | |
141 | FT /clone_lib="lambda gt10" | |
142 | FT /clone="TRE361" | |
143 | FT /tissue_type="leaves" | |
144 | FT /db_xref="taxon:3899" | |
145 | FT mRNA 1..1859 | |
146 | FT /experiment="experimental evidence, no additional details | |
147 | FT recorded" | |
148 | FT CDS 14..1495 | |
149 | FT /product="beta-glucosidase" | |
150 | FT /EC_number="3.2.1.21" | |
151 | FT /note="non-cyanogenic" | |
152 | FT /db_xref="GOA:P26204" | |
153 | FT /db_xref="InterPro:IPR001360" | |
154 | FT /db_xref="InterPro:IPR013781" | |
155 | FT /db_xref="InterPro:IPR017853" | |
156 | FT /db_xref="InterPro:IPR018120" | |
157 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
158 | FT /protein_id="CAA40058.1" | |
159 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
160 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
161 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
162 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
163 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
164 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
165 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
166 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
167 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
168 | XX | |
169 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
170 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
171 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
172 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
173 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
174 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
175 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
176 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
177 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
178 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
179 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
180 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
181 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
182 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
183 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
184 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
185 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
186 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
187 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
188 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
189 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
190 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
191 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
192 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
193 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
194 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
195 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
196 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
197 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
198 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
199 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859 | |
200 | // | |
201 |
0 | ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
1 | XX | |
2 | AC X56734; S46826; | |
3 | XX | |
4 | DT 12-SEP-1991 (Rel. 29, Created) | |
5 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
6 | XX | |
7 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
8 | XX | |
9 | KW beta-glucosidase. | |
10 | XX | |
11 | OS Trifolium repens (white clover) | |
12 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
13 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
14 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
15 | XX | |
16 | RN [5] | |
17 | RP 1-1859 | |
18 | RX DOI; 10.1007/BF00039495. | |
19 | RX PUBMED; 1907511. | |
20 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
21 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
22 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
23 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
24 | XX | |
25 | RN [6] | |
26 | RP 1-1859 | |
27 | RA Hughes M.A.; | |
28 | RT ; | |
29 | RL Submitted (19-NOV-1990) to the INSDC. | |
30 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
31 | RL Upon Tyne, NE2 4HH, UK | |
32 | XX | |
33 | DR EuropePMC; PMC99098; 11752244. | |
34 | XX | |
35 | FH Key Location/Qualifiers | |
36 | FH | |
37 | FT source 1..1859 | |
38 | FT /organism="Trifolium repens" | |
39 | FT /mol_type="mRNA" | |
40 | FT /clone_lib="lambda gt10" | |
41 | FT /clone="TRE361" | |
42 | FT /tissue_type="leaves" | |
43 | FT /db_xref="taxon:3899" | |
44 | FT mRNA 1..1859 | |
45 | FT /experiment="experimental evidence, no additional details | |
46 | FT recorded" | |
47 | FT CDS 14..1495 | |
48 | FT /product="beta-glucosidase" | |
49 | FT /EC_number="3.2.1.21" | |
50 | FT /note="non-cyanogenic" | |
51 | FT /db_xref="GOA:P26204" | |
52 | FT /db_xref="InterPro:IPR001360" | |
53 | FT /db_xref="InterPro:IPR013781" | |
54 | FT /db_xref="InterPro:IPR017853" | |
55 | FT /db_xref="InterPro:IPR018120" | |
56 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
57 | FT /protein_id="CAA40058.1" | |
58 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
59 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
60 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
61 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
62 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
63 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
64 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
65 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
66 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
67 | XX | |
68 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
69 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
70 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
71 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
72 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
73 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
74 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
75 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
76 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
77 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
78 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
79 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
80 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
81 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
82 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
83 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
84 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
85 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
86 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
87 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
88 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
89 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
90 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
91 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
92 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
93 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
94 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
95 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
96 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
97 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
98 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
99 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 | |
100 | ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
101 | XX | |
102 | AC X56734; S46826; | |
103 | XX | |
104 | DT 12-SEP-1991 (Rel. 29, Created) | |
105 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
106 | XX | |
107 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
108 | XX | |
109 | KW beta-glucosidase. | |
110 | XX | |
111 | OS Trifolium repens (white clover) | |
112 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
113 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
114 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
115 | XX | |
116 | RN [5] | |
117 | RP 1-1859 | |
118 | RX DOI; 10.1007/BF00039495. | |
119 | RX PUBMED; 1907511. | |
120 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
121 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
122 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
123 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
124 | XX | |
125 | RN [6] | |
126 | RP 1-1859 | |
127 | RA Hughes M.A.; | |
128 | RT ; | |
129 | RL Submitted (19-NOV-1990) to the INSDC. | |
130 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
131 | RL Upon Tyne, NE2 4HH, UK | |
132 | XX | |
133 | DR EuropePMC; PMC99098; 11752244. | |
134 | XX | |
135 | FH Key Location/Qualifiers | |
136 | FH | |
137 | FT source 1..1859 | |
138 | FT /organism="Trifolium repens" | |
139 | FT /mol_type="mRNA" | |
140 | FT /clone_lib="lambda gt10" | |
141 | FT /clone="TRE361" | |
142 | FT /tissue_type="leaves" | |
143 | FT /db_xref="taxon:3899" | |
144 | FT mRNA 1..1859 | |
145 | FT /experiment="experimental evidence, no additional details | |
146 | FT recorded" | |
147 | FT CDS 14..1495 | |
148 | FT /product="beta-glucosidase" | |
149 | FT /EC_number="3.2.1.21" | |
150 | FT /note="non-cyanogenic" | |
151 | FT /db_xref="GOA:P26204" | |
152 | FT /db_xref="InterPro:IPR001360" | |
153 | FT /db_xref="InterPro:IPR013781" | |
154 | FT /db_xref="InterPro:IPR017853" | |
155 | FT /db_xref="InterPro:IPR018120" | |
156 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
157 | FT /protein_id="CAA40058.1" | |
158 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
159 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
160 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
161 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
162 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
163 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
164 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
165 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
166 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
167 | XX | |
168 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
169 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
170 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
171 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
172 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
173 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
174 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
175 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
176 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
177 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
178 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
179 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
180 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
181 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
182 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
183 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
184 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
185 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
186 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
187 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
188 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
189 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
190 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
191 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
192 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
193 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
194 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
195 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
196 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
197 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
198 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
199 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859 | |
200 | // | |
201 |
0 | >seq1 | |
1 | aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcatt | |
2 | cacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgag | |
3 | tcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttga | |
4 | aggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaata | |
5 | tccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgcta | |
6 | caaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctc | |
7 | ttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaa | |
8 | atattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactct | |
9 | ttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccgg | |
10 | tgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagt | |
11 | gaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactagg | |
12 | aacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaac | |
13 | aggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgta | |
14 | taagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactg | |
15 | gttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttga | |
16 | cttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcg | |
17 | gcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatgg | |
18 | ttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttc | |
19 | acatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaa | |
20 | acatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatat | |
21 | gtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcct | |
22 | gcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtaga | |
23 | agaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcg | |
24 | ttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactg | |
25 | taatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaaga | |
26 | tggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaa | |
27 | ctagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagt | |
28 | tggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtg | |
29 | aagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtacc | |
30 | agaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatac | |
31 | tttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | |
32 | >seq2 | |
33 | aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcatt | |
34 | cacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgag | |
35 | tcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttga | |
36 | aggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaata | |
37 | tccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgcta | |
38 | caaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctc | |
39 | ttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaa | |
40 | atattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactct | |
41 | ttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccgg | |
42 | tgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagt | |
43 | gaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactagg | |
44 | aacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaac | |
45 | aggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgta | |
46 | taagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactg | |
47 | gttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttga | |
48 | cttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcg | |
49 | gcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatgg | |
50 | ttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttc | |
51 | acatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaa | |
52 | acatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatat | |
53 | gtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcct | |
54 | gcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtaga | |
55 | agaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcg | |
56 | ttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactg | |
57 | taatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaaga | |
58 | tggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaa | |
59 | ctagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagt | |
60 | tggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtg | |
61 | aagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtacc | |
62 | agaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatac | |
63 | tttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaccccccccc |
0 | >1 | |
1 | 40 40 40 | |
2 | 40 40 | |
3 | ||
4 | >2 | |
5 | 40 | |
6 | 40 | |
7 | ||
8 | 40 | |
9 | 40 40 | |
10 | >3 | |
11 | ||
12 | 40 40 40 40 40 | |
13 | ||
14 | >4 | |
15 | 40 40 40 40 40 | |
16 |
0 | >1 | |
1 | 40 40 40 | |
2 | 40 40 | |
3 | ||
4 | >3 | |
5 | 40 | |
6 | 40 | |
7 | ||
8 | 40 | |
9 | 40 40 | |
10 | >3 | |
11 | ||
12 | 40 40 40 40 40 | |
13 | ||
14 | >4 | |
15 | 40 40 40 40 40 | |
16 |
0 | @1 | |
1 | ACGTA | |
2 | + | |
3 | IIIII | |
4 | @2 | |
5 | ACGTA | |
6 | + | |
7 | IIIII | |
8 | @3 | |
9 | ACGTA | |
10 | + | |
11 | IIIII | |
12 | @4 | |
13 | ACGTA | |
14 | + | |
15 | IIIII |
0 | LOCUS NAME1 5028 bp DNA PLN 21-JUN-1999 | |
1 | DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p | |
2 | (AXL2) and Rev7p (REV7) genes, complete cds. | |
3 | ACCESSION U49845 | |
4 | VERSION U49845.1 GI:1293613 | |
5 | KEYWORDS . | |
6 | SOURCE Saccharomyces cerevisiae (baker's yeast) | |
7 | ORGANISM Saccharomyces cerevisiae | |
8 | Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; | |
9 | Saccharomycetales; Saccharomycetaceae; Saccharomyces. | |
10 | REFERENCE 1 (bases 1 to 5028) | |
11 | AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. | |
12 | TITLE Cloning and sequence of REV7, a gene whose function is required for | |
13 | DNA damage-induced mutagenesis in Saccharomyces cerevisiae | |
14 | JOURNAL Yeast 10 (11), 1503-1509 (1994) | |
15 | PUBMED 7871890 | |
16 | REFERENCE 2 (bases 1 to 5028) | |
17 | AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. | |
18 | TITLE Selection of axial growth sites in yeast requires Axl2p, a novel | |
19 | plasma membrane glycoprotein | |
20 | JOURNAL Genes Dev. 10 (7), 777-793 (1996) | |
21 | PUBMED 8846915 | |
22 | REFERENCE 3 (bases 1 to 5028) | |
23 | AUTHORS Roemer,T. | |
24 | TITLE Direct Submission | |
25 | JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New | |
26 | Haven, CT, USA | |
27 | FEATURES Location/Qualifiers | |
28 | source 1..5028 | |
29 | /organism="Saccharomyces cerevisiae" | |
30 | /db_xref="taxon:4932" | |
31 | /chromosome="IX" | |
32 | /map="9" | |
33 | CDS <1..206 | |
34 | /codon_start=3 | |
35 | /product="TCP1-beta" | |
36 | /protein_id="AAA98665.1" | |
37 | /db_xref="GI:1293614" | |
38 | /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA | |
39 | AEVLLRVDNIIRARPRTANRQHM" | |
40 | gene 687..3158 | |
41 | /gene="AXL2" | |
42 | CDS 687..3158 | |
43 | /gene="AXL2" | |
44 | /note="plasma membrane glycoprotein" | |
45 | /codon_start=1 | |
46 | /function="required for axial budding pattern of S. | |
47 | cerevisiae" | |
48 | /product="Axl2p" | |
49 | /protein_id="AAA98666.1" | |
50 | /db_xref="GI:1293615" | |
51 | /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF | |
52 | TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN | |
53 | VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE | |
54 | VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE | |
55 | TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV | |
56 | YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG | |
57 | DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ | |
58 | DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA | |
59 | NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA | |
60 | CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN | |
61 | NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ | |
62 | SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS | |
63 | YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK | |
64 | HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL | |
65 | VDFSNKSNVNVGQVKDIHGRIPEML" | |
66 | gene complement(3300..4037) | |
67 | /gene="REV7" | |
68 | CDS complement(3300..4037) | |
69 | /gene="REV7" | |
70 | /codon_start=1 | |
71 | /product="Rev7p" | |
72 | /protein_id="AAA98667.1" | |
73 | /db_xref="GI:1293616" | |
74 | /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ | |
75 | FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD | |
76 | KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR | |
77 | RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK | |
78 | LISGDDKILNGVYSQYEEGESIFGSLF" | |
79 | ORIGIN | |
80 | 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg | |
81 | 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct | |
82 | 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa | |
83 | 181 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc | |
84 | // | |
85 | LOCUS NAME2 5028 bp DNA PLN 21-JUN-1999 | |
86 | DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p | |
87 | (AXL2) and Rev7p (REV7) genes, complete cds. | |
88 | ACCESSION U49845 | |
89 | VERSION U49845.1 GI:1293613 | |
90 | KEYWORDS . | |
91 | SOURCE Saccharomyces cerevisiae (baker's yeast) | |
92 | ORGANISM Saccharomyces cerevisiae | |
93 | Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; | |
94 | Saccharomycetales; Saccharomycetaceae; Saccharomyces. | |
95 | REFERENCE 1 (bases 1 to 5028) | |
96 | AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. | |
97 | TITLE Cloning and sequence of REV7, a gene whose function is required for | |
98 | DNA damage-induced mutagenesis in Saccharomyces cerevisiae | |
99 | JOURNAL Yeast 10 (11), 1503-1509 (1994) | |
100 | PUBMED 7871890 | |
101 | REFERENCE 2 (bases 1 to 5028) | |
102 | AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. | |
103 | TITLE Selection of axial growth sites in yeast requires Axl2p, a novel | |
104 | plasma membrane glycoprotein | |
105 | JOURNAL Genes Dev. 10 (7), 777-793 (1996) | |
106 | PUBMED 8846915 | |
107 | REFERENCE 3 (bases 1 to 5028) | |
108 | AUTHORS Roemer,T. | |
109 | TITLE Direct Submission | |
110 | JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New | |
111 | Haven, CT, USA | |
112 | FEATURES Location/Qualifiers | |
113 | source 1..5028 | |
114 | /organism="Saccharomyces cerevisiae" | |
115 | /db_xref="taxon:4932" | |
116 | /chromosome="IX" | |
117 | /map="9" | |
118 | CDS <1..206 | |
119 | /codon_start=3 | |
120 | /product="TCP1-beta" | |
121 | /protein_id="AAA98665.1" | |
122 | /db_xref="GI:1293614" | |
123 | /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA | |
124 | AEVLLRVDNIIRARPRTANRQHM" | |
125 | gene 687..3158 | |
126 | /gene="AXL2" | |
127 | CDS 687..3158 | |
128 | /gene="AXL2" | |
129 | /note="plasma membrane glycoprotein" | |
130 | /codon_start=1 | |
131 | /function="required for axial budding pattern of S. | |
132 | cerevisiae" | |
133 | /product="Axl2p" | |
134 | /protein_id="AAA98666.1" | |
135 | /db_xref="GI:1293615" | |
136 | /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF | |
137 | TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN | |
138 | VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE | |
139 | VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE | |
140 | TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV | |
141 | YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG | |
142 | DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ | |
143 | DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA | |
144 | NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA | |
145 | CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN | |
146 | NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ | |
147 | SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS | |
148 | YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK | |
149 | HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL | |
150 | VDFSNKSNVNVGQVKDIHGRIPEML" | |
151 | gene complement(3300..4037) | |
152 | /gene="REV7" | |
153 | CDS complement(3300..4037) | |
154 | /gene="REV7" | |
155 | /codon_start=1 | |
156 | /product="Rev7p" | |
157 | /protein_id="AAA98667.1" | |
158 | /db_xref="GI:1293616" | |
159 | /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ | |
160 | FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD | |
161 | KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR | |
162 | RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK | |
163 | LISGDDKILNGVYSQYEEGESIFGSLF" | |
164 | ORIGIN | |
165 | 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg | |
166 | 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct | |
167 | 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa | |
168 | 181 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgaaa | |
169 | // |
0 | >NAME1 | |
1 | gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattg | |
2 | ccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagct | |
3 | ctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaa | |
4 | tgccatgactcagattctaattttaagctattcaatttctctttgatc | |
5 | >NAME2 | |
6 | gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattg | |
7 | ccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagct | |
8 | ctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaa | |
9 | tgccatgactcagattctaattttaagctattcaatttctctttgaaa |
0 | >ID | |
1 | A | |
2 | >ID | |
3 | AA | |
4 | >ID | |
5 | AAA | |
6 | >ID | |
7 | AAA | |
8 | A | |
9 | >ID | |
10 | AAA | |
11 | AA | |
12 | >ID | |
13 | AAA | |
14 | AAA | |
15 | >ID | |
16 | AAA | |
17 | AAA | |
18 | A |
0 | >one.p1k | |
1 | ACGT | |
2 | >one.q1k | |
3 | CCCC | |
4 | >two.p1k | |
5 | A | |
6 | >two.q1k | |
7 | C | |
8 | >one.p1k | |
9 | TTTTTTTTTT | |
10 | >three.q1k | |
11 | A | |
12 | >four.x | |
13 | T | |
14 | >five.p1k | |
15 | G |
0 | ##gff-version 3 | |
1 | # comment | |
2 | ##sequence-region seq1 1 10 | |
3 | seq1 . gene 3 7 . + . ID=gene1;name=name1 | |
4 | ##FASTA | |
5 | >seq1 | |
6 | ACGTACGTAC | |
7 | >seq2 | |
8 | ACGTACGTAC |
0 | ##gff-version 3 | |
1 | # comment | |
2 | ##sequence-region seq1 1 10 | |
3 | seq1 . gene 3 7 . + . ID=gene1;name=name1 | |
4 | >seq1 | |
5 | ACGTACGTAC | |
6 | >seq2 | |
7 | ACGTACGTAC |
0 | ##gff-version 3 | |
1 | # comment | |
2 | ##sequence-region seq1 1 10 | |
3 | seq1 . gene 3 7 . + . ID=gene1;name=name1 | |
4 | ##FASTA | |
5 | oops |
0 | ##gff-version 3 | |
1 | # comment | |
2 | ##sequence-region seq1 1 10 | |
3 | seq1 . gene 3 7 . + . ID=gene1;name=name1 |
0 | <?xml version="1.0"?> | |
1 | <trace_volume> | |
2 | <trace> | |
3 | <trace_name>ID</trace_name> | |
4 | <clip_quality_right>5</clip_quality_right> | |
5 | <clip_vector_left>1</clip_vector_left> | |
6 | </trace> | |
7 | <trace> | |
8 | <trace_name>ID</trace_name> | |
9 | <clip_quality_right>5</clip_quality_right> | |
10 | <clip_vector_left>1</clip_vector_left> | |
11 | </trace> | |
12 | </trace_volume> |
0 | >A | |
1 | ACG | |
2 | >B | |
3 | ACG | |
4 | >C | |
5 | ACG | |
6 | >D | |
7 | ACG | |
8 | >E | |
9 | ACG | |
10 | >F | |
11 | ACG | |
12 | >G | |
13 | ACG | |
14 | >H | |
15 | ACG | |
16 | >I | |
17 | ACG | |
18 | >J | |
19 | ACG | |
20 | >K | |
21 | ACG | |
22 | >L | |
23 | ACG | |
24 | >M | |
25 | ACG | |
26 | >N | |
27 | ACG | |
28 | >O | |
29 | ACG | |
30 | >P | |
31 | ACG | |
32 | >Q | |
33 | ACG | |
34 | >R | |
35 | ACG | |
36 | >S | |
37 | ACG | |
38 | >T | |
39 | ACG | |
40 | >U | |
41 | ACG | |
42 | >V | |
43 | ACG | |
44 | >W | |
45 | ACG | |
46 | >X | |
47 | ACG | |
48 | >Y | |
49 | ACG | |
50 | >Z | |
51 | ACG | |
52 | >A | |
53 | ACG | |
54 | >B | |
55 | ACG |
0 | >1 | |
1 | GTATGACGACTTCTCGGTCAAAGGTAAGGTGAACAAGGGATTGAATGCTTAAATCCCGTG | |
2 | CCTACACTCAGTACCGGTGCTTGGCTGAAGCGTTCCTATGCAAGAATGAGAACTGGCAAC | |
3 | ACGTCGCGGCCAGCCCGGGACCATCAGGACCCGAACGTGTACCGCGAATGTTTACATTTC | |
4 | ACCCAGTTACCCGGATTCGGGCCAAAGCAGGAGAGCCTCTGAATTAGATGGTGCCACGTA | |
5 | AGTCTATTTTCGCACGTTTTATTGATTCAAGTGAGTGTCAACGTAGATTTATTGGTGCTT | |
6 | GGCTAAAGACGTATGGATCACGGGATGGAACATCTGGATCCCCCATGTACGTAAGTGTGT | |
7 | CGTCAAACAAAATTCTGTATCCCGTCGCTCCTGCCAGGGCAATCGCGGAGCTACGGACAT | |
8 | AGTCCTTAGTGAACTAATGATGATGAACATCTCGAACCAGGTTAACACGATACGATGAAG | |
9 | CGGGTTACTGAACACACTTAACAGGAGCCTGAGCAAATGTCATTTACAAAAGGTTTCTAG | |
10 | ACCCCCTTGGTAAGTCACTTGACACGTCTCATGCGGGGCCTACGGTAAACCAGATGCTAG | |
11 | AGTAGCGAACGGTGGGTGCGCAGGCATGTCCGGTCTCTCGATGGTGCACTTACGGACATC | |
12 | TCCCTATACAGATCTATTCAGTCACGAAGGTCAGCGAACATAACCCACGGGAGTTATCTC | |
13 | AACGAGTACGGGAGCGAACGGTGCACGGATCTGTCTTAGCTCAGAGGCGTCACGCGGTCC | |
14 | TATCTAACGCAAGAGCATGTGCCATTCCGGCCCTCTGATGTGCCTATGTACATAGAGCCG | |
15 | ACCCCGGCGGATTGGAGTCCCTAGCTACCGTCGACAGAGACGCAAAGACTCAATTGCTAT | |
16 | GTATATTGTTACTCTTCAACCACTGGAAAGACAAATAATTGCGGGCAAGTGCGTTACCCA | |
17 | TCACTCTGTTCTGTACACGAAAGGCTGAATAGCAAGTGGC |
0 | 1 fastaq CDS 28 222 . + . | |
1 | 1 fastaq CDS 45 227 . + . | |
2 | 1 fastaq CDS 49 171 . - . | |
3 | 1 fastaq CDS 110 241 . + . | |
4 | 1 fastaq CDS 144 266 . - . | |
5 | 1 fastaq CDS 228 422 . + . | |
6 | 1 fastaq CDS 278 433 . - . | |
7 | 1 fastaq CDS 287 478 . + . | |
8 | 1 fastaq CDS 289 519 . - . | |
9 | 1 fastaq CDS 563 703 . + . | |
10 | 1 fastaq CDS 601 759 . + . | |
11 | 1 fastaq CDS 606 818 . + . | |
12 | 1 fastaq CDS 819 938 . + . | |
13 | 1 fastaq CDS 836 988 . + . | |
14 | 1 fastaq CDS 865 999 . + . |
0 | 3 42 | |
1 | Turkey AA-CTNGGGC ATTTCAGGGT | |
2 | Salmo_gairAAGCCTTGGC AGTGCAGGGT | |
3 | H. SapiensACCGGTTGGC CGTTCAGGGT | |
4 | ||
5 | GAGCCCGGGC AATACAGGGT AT | |
6 | GAGCCGTGGC CGGGCACGGT AT | |
7 | ACAGGTTGGC CGTTCAGGGT AA |
0 | >Turkey | |
1 | AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT | |
2 | >Salmo_gair | |
3 | AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT | |
4 | >H. Sapiens | |
5 | ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA |
0 | 3 42 | |
1 | Turkey AA-CTNGGGC ATTTCAGGGT | |
2 | Salmo_gairAAGCCTTGGC AGTGCAGGGT | |
3 | H. SapiensACCGGTTGGC CGTTCAGGGT | |
4 | GAGCCCGGGC AATACAGGGT AT | |
5 | GAGCCGTGGC CGGGCACGGT AT | |
6 | ACAGGTTGGC CGTTCAGGGT AA |
0 | >Turkey | |
1 | AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT | |
2 | >Salmo_gair | |
3 | AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT | |
4 | >H. Sapiens | |
5 | ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA |
0 | 2 97 | |
1 | seq1 GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG | |
2 | seq2 AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA | |
3 | ||
4 | GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGT | |
5 | AAAAAAAAAA AAAAAAAAAA AAAAAAAAA- -AAAAAG |
0 | >seq1 | |
1 | GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG | |
2 | GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGT | |
3 | >seq2 | |
4 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
5 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG |
0 | 3 42 | |
1 | Turkey AA-CTNGGGC ATTTCAGGGT | |
2 | GAGCCCGGGC AATACAGGGT AT | |
3 | Salmo_gairAAGCCTTGGC AGTGCAGGGT | |
4 | GAGCCGTGGC CGGGCACGGT AT | |
5 | H. SapiensACCGGTTGGC CGTTCAGGGT | |
6 | ACAGGTTGGC CGTTCAGGGT AA |
0 | >Turkey | |
1 | AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT | |
2 | >Salmo_gair | |
3 | AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT | |
4 | >H. Sapiens | |
5 | ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA |
0 | seq1:1-4 seq1 0 |
0 | >seq | |
1 | GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA |
0 | >1 | |
1 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 | |
2 | 40 40 | |
3 | >2 | |
4 | 40 40 40 40 40 |
0 | >1 | |
1 | 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 | |
2 | 42 42 | |
3 | >2 | |
4 | 42 42 42 42 42 |
0 | >1_1_10 | |
1 | ACGCTCTCGA | |
2 | >1_6_15 | |
3 | CTCGAGCGCG | |
4 | >1_11_20 | |
5 | GCGCGAGCGC | |
6 | >1_16_25 | |
7 | AGCGCGAGCG | |
8 | >1_21_27 | |
9 | GAGCGAC |
0 | >1/1 | |
1 | 1234567890 | |
2 | >2/1 | |
3 | AACG123456789 | |
4 | >3/1 | |
5 | 1234567890 | |
6 | >4/1 | |
7 | AACG1234567890 | |
8 | >5/1 | |
9 | 1234567890 | |
10 | >6/1 | |
11 | AACG1234567890 | |
12 | >7/1 | |
13 | 123456789AGGC | |
14 | >8/1 | |
15 | 123456789 | |
16 | >9/1 | |
17 | 1234567890AGGC | |
18 | >10/1 | |
19 | AACG123456789CGTT | |
20 | >11/1 | |
21 | AACG1234567890CGTT | |
22 | >12/1 | |
23 | AACG1234567890CGTT |
0 | >1/1 | |
1 | 1234567890 | |
2 | >4/1 | |
3 | 1234567890 | |
4 | >5/1 | |
5 | 1234567890 | |
6 | >6/1 | |
7 | 1234567890 | |
8 | >9/1 | |
9 | 1234567890 | |
10 | >12/1 | |
11 | 1234567890 |
0 | >1/2 | |
1 | 1234567890 | |
2 | >2/2 | |
3 | 1234567890 | |
4 | >3/2 | |
5 | AACG123456789 | |
6 | >4/2 | |
7 | 1234567890 | |
8 | >5/2 | |
9 | AACG1234567890 | |
10 | >6/2 | |
11 | GCCT1234567890 | |
12 | >7/2 | |
13 | 1234567890 | |
14 | >8/2 | |
15 | 123456789AGGC | |
16 | >9/2 | |
17 | 1234567890CGTT | |
18 | >10/2 | |
19 | AACG1234567890CGTT | |
20 | >11/2 | |
21 | AACG123456789CGTT | |
22 | >12/2 | |
23 | AACG1234567890CGTT |
0 | >1/2 | |
1 | 1234567890 | |
2 | >4/2 | |
3 | 1234567890 | |
4 | >5/2 | |
5 | 1234567890 | |
6 | >6/2 | |
7 | 1234567890 | |
8 | >9/2 | |
9 | 1234567890 | |
10 | >12/2 | |
11 | 1234567890 |
0 | this is the contents of system call test file |
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import unittest | |
3 | from fastaq import intervals | |
4 | ||
class TestIntervals(unittest.TestCase):
    '''Tests for the intervals.Interval class. Coordinates are inclusive
    at both ends, so len(Interval(1,2)) == 2.'''

    def test_init(self):
        '''Throw error if try to construct genome_interval from a non-int, or end<start'''
        with self.assertRaises(intervals.Error):
            intervals.Interval('a', 1)
        with self.assertRaises(intervals.Error):
            intervals.Interval(1, 'a')
        with self.assertRaises(intervals.Error):
            intervals.Interval('a', 'a')
        with self.assertRaises(intervals.Error):
            intervals.Interval(3, 2)

    def test_comparisons(self):
        '''<, <=, == should work as expected'''
        self.assertTrue(intervals.Interval(1,2) < intervals.Interval(2,2))
        self.assertTrue(intervals.Interval(1,2) <= intervals.Interval(2,2))
        self.assertFalse(intervals.Interval(2,2) <= intervals.Interval(1,2))
        self.assertFalse(intervals.Interval(2,2) < intervals.Interval(1,2))
        self.assertFalse(intervals.Interval(2,2) < intervals.Interval(2,2))
        self.assertTrue(intervals.Interval(1,2) == intervals.Interval(1,2))
        self.assertFalse(intervals.Interval(1,2) == intervals.Interval(1,3))
        self.assertTrue(intervals.Interval(1,2) != intervals.Interval(1,3))
        self.assertFalse(intervals.Interval(1,2) != intervals.Interval(1,2))

    def test_len(self):
        '''len() should be the number of positions spanned, both ends inclusive'''
        self.assertEqual(len(intervals.Interval(1,2)), 2)
        self.assertEqual(len(intervals.Interval(1,1)), 1)
        self.assertEqual(len(intervals.Interval(10,20)), 11)

    def test_intersects(self):
        '''Intersection of two intervals should do the right thing'''
        a = intervals.Interval(5, 10)
        no_intersect = [intervals.Interval(3, 4),
                        intervals.Interval(11,20)]
        intersect = [intervals.Interval(3,5),
                     intervals.Interval(3,6),
                     intervals.Interval(9,12),
                     intervals.Interval(10,12),
                     intervals.Interval(6,7),
                     intervals.Interval(1,20)]

        for i in no_intersect:
            self.assertFalse(a.intersects(i), 'shouldn\'t intersect: ' + str(a) + ', ' + str(i))

        for i in intersect:
            self.assertTrue(a.intersects(i), 'should intersect: ' + str(a) + ', ' + str(i))

    def test_contains(self):
        '''Check that contains() works as expected'''
        a = intervals.Interval(5, 10)
        not_contained = [intervals.Interval(1,2),
                         intervals.Interval(4,5),
                         intervals.Interval(4,10),
                         intervals.Interval(4,11),
                         intervals.Interval(5,11),
                         intervals.Interval(1,2),
                         intervals.Interval(9,11),
                         intervals.Interval(10,11),
                         intervals.Interval(11,20)]

        contained = [intervals.Interval(5,5),
                     intervals.Interval(5,10),
                     intervals.Interval(6,7),
                     intervals.Interval(6,10),
                     intervals.Interval(10,10)]

        for i in not_contained:
            self.assertFalse(a.contains(i), 'shouldn\'t contain: ' + str(a) + ', ' + str(i))

        for i in contained:
            self.assertTrue(a.contains(i), 'should contain: ' + str(a) + ', ' + str(i))

    def test_union(self):
        '''Union should either return None or the correct union'''
        a = intervals.Interval(5, 10)
        b = intervals.Interval(8, 15)
        c = intervals.Interval(12, 20)
        d = intervals.Interval(21,22)
        # a and c neither overlap nor touch, so there is no union
        self.assertEqual(a.union(c), None)
        self.assertEqual(c.union(a), None)
        self.assertEqual(a.union(b), intervals.Interval(5,15))
        self.assertEqual(b.union(a), intervals.Interval(5,15))
        # c and d are adjacent (20 then 21), which still unions
        self.assertEqual(c.union(d), intervals.Interval(12,22))
        self.assertEqual(d.union(c), intervals.Interval(12,22))

    def test_union_fill_gap(self):
        '''union_fill_gap() should ignore intersections and return the maximum range of coords'''
        # NOTE: method name typo fixed (was test_union_flll_gap) so unittest
        # discovery reports this test under the intended name.
        a = intervals.Interval(5, 10)
        b = intervals.Interval(8, 15)
        c = intervals.Interval(12, 20)
        d = intervals.Interval(21,22)
        self.assertEqual(a.union_fill_gap(c), intervals.Interval(5,20))
        self.assertEqual(c.union_fill_gap(a), intervals.Interval(5,20))
        self.assertEqual(a.union_fill_gap(b), intervals.Interval(5,15))
        self.assertEqual(b.union_fill_gap(a), intervals.Interval(5,15))
        self.assertEqual(c.union_fill_gap(d), intervals.Interval(12,22))
        self.assertEqual(d.union_fill_gap(c), intervals.Interval(12,22))

    def test_intersection(self):
        '''Intersection should either return None or the correct intersection'''
        a = intervals.Interval(5, 10)
        b = intervals.Interval(8, 15)
        c = intervals.Interval(12, 20)
        self.assertEqual(a.intersection(c), None)
        self.assertEqual(a.intersection(b), intervals.Interval(8,10))
112 | ||
class Test_intersection(unittest.TestCase):
    def test_intersection(self):
        '''intersection() should correctly intersect two lists of intervals'''
        first = [
            intervals.Interval(1, 2),
            intervals.Interval(10, 20),
            intervals.Interval(51, 52),
            intervals.Interval(54, 55),
            intervals.Interval(57, 58),
        ]

        second = [
            intervals.Interval(5, 6),
            intervals.Interval(9, 11),
            intervals.Interval(13, 14),
            intervals.Interval(17, 18),
            intervals.Interval(20, 25),
            intervals.Interval(50, 60),
        ]

        far_away = [intervals.Interval(100, 200)]

        expected = [
            intervals.Interval(10, 11),
            intervals.Interval(13, 14),
            intervals.Interval(17, 18),
            intervals.Interval(20, 20),
            intervals.Interval(51, 52),
            intervals.Interval(54, 55),
            intervals.Interval(57, 58),
        ]

        # intersection should be symmetric in its two arguments
        self.assertSequenceEqual(intervals.intersection(first, second), expected)
        self.assertSequenceEqual(intervals.intersection(second, first), expected)
        # no overlap at all gives an empty list
        self.assertSequenceEqual(intervals.intersection(far_away, first), [])
        # an empty list on either side gives an empty result
        self.assertEqual(intervals.intersection([], first), [])
        self.assertEqual(intervals.intersection(first, []), [])
144 | ||
class Test_merge_overlapping_in_list(unittest.TestCase):
    def test_merge_overlapping_in_list(self):
        '''merge_overlapping_in_list() merges correctly'''
        # Deliberately unsorted input, with an exact duplicate and a chain
        # of touching/overlapping intervals (10-20, 20-30, 29-50) that
        # should collapse, together with 51-60, into a single 10-60.
        to_merge = [
            intervals.Interval(1, 2),
            intervals.Interval(51, 60),
            intervals.Interval(10, 20),
            intervals.Interval(20, 30),
            intervals.Interval(20, 30),
            intervals.Interval(29, 50),
            intervals.Interval(65, 70),
        ]

        after_merge = [
            intervals.Interval(1, 2),
            intervals.Interval(10, 60),
            intervals.Interval(65, 70),
        ]

        intervals.merge_overlapping_in_list(to_merge)
        self.assertSequenceEqual(to_merge, after_merge)
162 | ||
class Test_remove_contained_in_list(unittest.TestCase):
    def test_remove_contained_in_list(self):
        '''test_remove_contained_in_list removes the right elements of list'''
        # Input coordinates include exact duplicates, nested runs, and
        # intervals that merely overlap (which must be kept).
        input_coords = [
            (1, 2), (4, 4), (4, 5), (5, 6),
            (7, 9), (8, 10), (9, 11),
            (20, 25), (20, 24), (20, 26),
            (30, 38), (30, 37), (30, 36), (30, 35), (30, 35), (32, 33),
            (38, 50), (65, 70), (67, 70),
        ]

        expected_coords = [
            (1, 2), (4, 5), (5, 6),
            (7, 9), (8, 10), (9, 11),
            (20, 26), (30, 38), (38, 50), (65, 70),
        ]

        test_list = [intervals.Interval(start, end) for start, end in input_coords]
        expected = [intervals.Interval(start, end) for start, end in expected_coords]

        intervals.remove_contained_in_list(test_list)
        self.assertSequenceEqual(test_list, expected)
199 | ||
class Test_length_sum_from_list(unittest.TestCase):
    def test_length_sum_from_list(self):
        '''Test that total length of intervals is summed correctly'''
        # Interval coordinates are inclusive, so lengths are 2 + 2 + 10 = 14.
        test_intervals = [
            intervals.Interval(1, 2),
            intervals.Interval(4, 5),
            intervals.Interval(10, 19),
        ]
        self.assertEqual(14, intervals.length_sum_from_list(test_intervals))
208 | ||
209 | ||
# Allow this test module to be run directly from the command line.
if __name__ == '__main__':
    unittest.main()
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import sys | |
3 | import filecmp | |
4 | import os | |
5 | import unittest | |
6 | from fastaq import sequences, utils, intervals, tasks | |
7 | ||
# The test data files live in a 'tests/data' directory alongside the
# installed fastaq package, so locate them relative to the sequences module
# itself rather than the current working directory.
modules_dir = os.path.dirname(os.path.abspath(sequences.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
10 | ||
11 | class Error (Exception): pass | |
12 | ||
# Full-length sequences expected when reading the EMBL/GenBank-format test
# file: the two records differ only at the 3' end (the second ends in c's
# instead of a's). These strings are compared against parsed records, so
# they must match the fixture file exactly.
expected_embl = [
    'aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcattcacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgagtcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttgaaggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaatatccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgctacaaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctcttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaaatattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactctttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccggtgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagtgaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactaggaacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaacaggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgtataagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactggttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttgacttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcggcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatggttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttcacatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaaacatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatatgtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcctgcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtagaagaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcgttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactgtaatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaagatggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaactagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagttggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtgaagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtaccagaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatactttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    'aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcattcacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgagtcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttgaaggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaatatccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgctacaaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctcttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaaatattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactctttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccggtgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagtgaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactaggaacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaacaggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgtataagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactggttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttgacttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcggcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatggttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttcacatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaaacatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatatgtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcctgcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtagaagaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcgttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactgtaatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaagatggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaactagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagttggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtgaagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtaccagaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatactttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaccccccccc',
    ]
class TestFasta(unittest.TestCase):
    '''Tests for sequences.Fasta: construction, equality, file reading and
    the sequence-manipulation / coordinate methods.'''

    def setUp(self):
        self.fasta = sequences.Fasta('ID', 'ACGTA')

    def test_equality(self):
        '''== and != should compare both ID and sequence'''
        self.assertTrue(self.fasta == sequences.Fasta('ID', 'ACGTA'))
        self.assertFalse(self.fasta == sequences.Fasta('I', 'ACGTA'))
        self.assertFalse(self.fasta == sequences.Fasta('ID', 'ACGT'))
        self.assertFalse(self.fasta != sequences.Fasta('ID', 'ACGTA'))
        self.assertTrue(self.fasta != sequences.Fasta('I', 'ACGTA'))
        self.assertTrue(self.fasta != sequences.Fasta('ID', 'ACGT'))

    def test_init(self):
        '''__init__ should get the ID and sequence correctly'''
        self.assertEqual(self.fasta.id, 'ID')
        self.assertEqual(self.fasta.seq, 'ACGTA')

    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from file OK, including weirdness in file'''
        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.fa'))
        fa = sequences.Fasta()
        counter = 1

        while fa.get_next_from_file(f_in):
            self.assertEqual(fa, sequences.Fasta(str(counter), 'ACGTA'))
            counter += 1

        utils.close(f_in)

    def test_get_id_from_header_line(self):
        '''Check that can get ID from header line or die properly'''
        self.assertEqual(sequences.Fasta._get_id_from_header_line(self.fasta, '>X'), 'X')
        with self.assertRaises(sequences.Error):
            self.assertEqual(sequences.Fasta._get_id_from_header_line(self.fasta, 'X'), 'X')

    def test_getitem(self):
        '''getitem() should return the right subsequence'''
        seq = 'AACGTGTCA'
        fa = sequences.Fasta('x', seq)
        self.assertEqual(seq[1], fa[1])
        self.assertEqual(seq[0:2], fa[0:2])
        self.assertEqual(seq[1:], fa[1:])

    def test_len(self):
        '''len() should return the length of the sequence'''
        self.assertEqual(5, len(self.fasta))

    def test_print_line_length(self):
        '''__str__ should be formatted correctly with the right number of chars per line of sequence'''
        line_lengths = [0, 3]
        correct_files = [os.path.join(data_dir, x) for x in ['sequences_test_one-per-line.fa', 'sequences_test_3-per-line.fa']]

        for line_length, correct_file in zip(line_lengths, correct_files):
            seq_reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_one-per-line.fa'))
            sequences.Fasta.line_length = line_length
            tmp_out = 'tmp.line_length_test.fa'
            f = utils.open_file_write(tmp_out)
            for s in seq_reader:
                print(s, file=f)
            utils.close(f)
            self.assertTrue(filecmp.cmp(correct_file, tmp_out))
            os.unlink(tmp_out)

        # restore the class-level default so other tests are unaffected
        sequences.Fasta.line_length = 60

    def test_strip_after_first_whitespace(self):
        '''Test strip_after_first_whitespace()'''
        seqs = [
            sequences.Fasta('name', 'A'),
            sequences.Fasta('name foo', 'A'),
            sequences.Fasta('name foo bar', 'A'),
            sequences.Fasta('name\tfoo', 'A'),
        ]

        for seq in seqs:
            seq.strip_after_first_whitespace()

        for seq in seqs:
            self.assertEqual(seq.id, 'name')

    def test_strip_illumina_suffix(self):
        '''Check that /1 and /2 removed correctly from IDs'''
        seqs = [sequences.Fasta('name/1', 'A'),
                sequences.Fasta('name/2', 'A'),
                sequences.Fasta('name', 'A'),
                sequences.Fasta('name/1/2', 'A'),
                sequences.Fasta('name/2/1', 'A'),
                sequences.Fasta('name/3', 'A')]

        correct_names = ['name', 'name', 'name', 'name/1', 'name/2', 'name/3']

        for seq in seqs:
            seq.strip_illumina_suffix()

        for i in range(len(seqs)):
            self.assertEqual(seqs[i].id, correct_names[i])

    def test_revcomp(self):
        '''revcomp() should correctly reverse complement a sequence'''
        fa = sequences.Fasta('ID', 'ACGTNacgtn')
        fa.revcomp()
        self.assertEqual(fa, sequences.Fasta('ID', 'nacgtNACGT'))

    def test_gaps(self):
        '''gaps() should find the gaps in a sequence correctly'''
        test_seqs = [sequences.Fasta('ID', 'ACGT'),
                     sequences.Fasta('ID', 'NACGT'),
                     sequences.Fasta('ID', 'NACGTN'),
                     sequences.Fasta('ID', 'ANNCGT'),
                     sequences.Fasta('ID', 'NANNCGTNN')]

        correct_gaps = [[],
                        [intervals.Interval(0, 0)],
                        [intervals.Interval(0, 0), intervals.Interval(5, 5)],
                        [intervals.Interval(1, 2)],
                        [intervals.Interval(0, 0), intervals.Interval(2, 3), intervals.Interval(7, 8)]]

        for i in range(len(test_seqs)):
            gaps = test_seqs[i].gaps()
            self.assertListEqual(correct_gaps[i], gaps)

    def test_contig_coords(self):
        '''contig_coords() should get the coords of all contigs in a sequence correctly'''
        test_seqs = [sequences.Fasta('ID', 'ACGT'),
                     sequences.Fasta('ID', 'NACGT'),
                     sequences.Fasta('ID', 'NNACGT'),
                     sequences.Fasta('ID', 'ACGTN'),
                     sequences.Fasta('ID', 'ACGTNN'),
                     sequences.Fasta('ID', 'NANNCGT'),
                     sequences.Fasta('ID', 'ACNNNGTNA'),
                     sequences.Fasta('ID', 'ANNCGTNNAAAAA')]

        correct_coords = [[intervals.Interval(0,3)],
                         [intervals.Interval(1, 4)],
                         [intervals.Interval(2, 5)],
                         [intervals.Interval(0, 3)],
                         [intervals.Interval(0, 3)],
                         [intervals.Interval(1, 1), intervals.Interval(4,6)],
                         [intervals.Interval(0, 1), intervals.Interval(5, 6), intervals.Interval(8, 8)],
                         [intervals.Interval(0, 0), intervals.Interval(3, 5), intervals.Interval(8, 12)]]

        for i in range(len(test_seqs)):
            gaps = test_seqs[i].contig_coords()
            self.assertListEqual(correct_coords[i], gaps)

    def test_orfs(self):
        '''Test orfs()'''
        # tuples are (sequence, frame, revcomp?, expected ORF intervals)
        test_seqs = [(sequences.Fasta('ID', 'AAACCCGG'), 0, False, [intervals.Interval(0,5)]),
                     (sequences.Fasta('ID', 'AAAACCCGG'), 1, False, [intervals.Interval(1,6)]),
                     (sequences.Fasta('ID', 'AAAAACCCGG'), 2, False, [intervals.Interval(2,7)]),
                     (sequences.Fasta('ID', 'CCGGGTTT'), 0, True, [intervals.Interval(2,7)]),
                     (sequences.Fasta('ID', 'CCGGGTTTT'), 1, True, [intervals.Interval(2,7)]),
                     (sequences.Fasta('ID', 'CCGGGTTTTT'), 2, True, [intervals.Interval(2,7)]),
                     (sequences.Fasta('ID', 'AAACCCTGA'), 0, False, [intervals.Interval(0,8)]),
                     (sequences.Fasta('ID', 'AAACCCTGATAG'), 0, False, [intervals.Interval(0,8)]),
                     (sequences.Fasta('ID', 'AAACCCTGA'), 1, False, [intervals.Interval(1,6)]),
                     (sequences.Fasta('ID', ''), 0, False, []),
                     (sequences.Fasta('ID', 'A'), 0, False, []),
                     (sequences.Fasta('ID', 'AA'), 0, False, []),
                     (sequences.Fasta('ID', 'AAA'), 0, False, [intervals.Interval(0,2)]),
                     (sequences.Fasta('ID', 'AAAAAA'), 0, False, [intervals.Interval(0,5)]),
                     (sequences.Fasta('ID', 'AAA'), 1, False, []),
                     (sequences.Fasta('ID', 'AAA'), 2, False, []),
                     (sequences.Fasta('ID', 'AAA'), 0, True, [intervals.Interval(0,2)]),
                     (sequences.Fasta('ID', 'AAA'), 1, True, []),
                     (sequences.Fasta('ID', 'AAA'), 2, True, []),
                     (sequences.Fasta('ID', 'TAA'), 0, False, []),
                     (sequences.Fasta('ID', 'CTA'), 0, True, [])]

        for t in test_seqs:
            orfs = t[0].orfs(frame=t[1], revcomp=t[2])
            self.assertListEqual(orfs, t[3])

    def test_all_orfs(self):
        '''Test all_orfs()'''
        d = {}
        tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_orfs.fa'), d)
        seq = d['1']
        orfs = seq.all_orfs(min_length=120)
        # expected tuples are (interval, on reverse strand?)
        expected = [
            (intervals.Interval(27, 221), False),
            (intervals.Interval(44, 226), False),
            (intervals.Interval(48, 170), True),
            (intervals.Interval(109, 240), False),
            (intervals.Interval(143, 265), True),
            (intervals.Interval(227, 421), False),
            (intervals.Interval(277, 432), True),
            (intervals.Interval(286, 477), False),
            (intervals.Interval(288, 518), True),
            (intervals.Interval(562, 702), False),
            (intervals.Interval(600, 758), False),
            (intervals.Interval(605, 817), False),
            (intervals.Interval(818, 937), False),
            (intervals.Interval(835, 987), False),
            (intervals.Interval(864, 998), False)
        ]

        self.assertEqual(len(orfs), len(expected))

        for i in range(len(orfs)):
            self.assertEqual(orfs[i][0], expected[i][0])
            self.assertEqual(orfs[i][1], expected[i][1])

    def test_is_all_Ns(self):
        '''Test is_all_Ns()'''
        self.assertTrue(sequences.Fasta('ID', 'n').is_all_Ns())
        self.assertTrue(sequences.Fasta('ID', 'N').is_all_Ns())
        self.assertTrue(sequences.Fasta('ID', 'nNn').is_all_Ns())
        self.assertFalse(sequences.Fasta('ID', 'a').is_all_Ns())
        self.assertFalse(sequences.Fasta('ID', '').is_all_Ns())
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns())
        self.assertFalse(sequences.Fasta('ID', 'naN').is_all_Ns())
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=0, end=0))
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=0, end=1))
        self.assertTrue(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=1))
        self.assertTrue(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=2))
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1))
        self.assertTrue(sequences.Fasta('ID', 'anN').is_all_Ns(start=1))
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(end=1))
        self.assertTrue(sequences.Fasta('ID', 'nNA').is_all_Ns(end=1))

        # start > end is an error
        with self.assertRaises(sequences.Error):
            sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=0)

    def test_trim_Ns(self):
        '''trim_Ns() should do the right trimming of a sequence'''
        fa = sequences.Fasta('ID', 'ANNANA')
        test_seqs = [sequences.Fasta('ID', 'ANNANA'),
                     sequences.Fasta('ID', 'NANNANA'),
                     sequences.Fasta('ID', 'NANNANAN'),
                     sequences.Fasta('ID', 'ANNANAN'),
                     sequences.Fasta('ID', 'NNNNNNANNANAN'),
                     sequences.Fasta('ID', 'NNANNANANn')]

        for s in test_seqs:
            s.trim_Ns()
            self.assertEqual(fa, s)

    def test_add_insertions(self):
        '''Test add_insertions'''
        fa = sequences.Fasta('X', 'acgtacgtacgt')
        fa.add_insertions(skip=4, window=0, test=True)
        self.assertEqual(fa, sequences.Fasta('X', 'acgtNacgtNacgt'))

    def test_replace_bases(self):
        '''Check that bases get replaced correctly'''
        fa = sequences.Fasta('X', 'AUCGTUUACT')
        fa.replace_bases('U', 'T')
        self.assertEqual(fa, sequences.Fasta('X', 'ATCGTTTACT'))

    def test_replace_interval(self):
        '''Test replace_interval()'''
        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(0, 0, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', 'NEWCGTA'))

        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(4, 4, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', 'ACGTNEW'))

        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(2, 3, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', 'ACNEWA'))

        # bad intervals: start > end, or out of range
        fa = sequences.Fasta('ID', 'ACGTA')
        with self.assertRaises(sequences.Error):
            fa.replace_interval(3,2,'x')
        with self.assertRaises(sequences.Error):
            fa.replace_interval(1,5,'x')
        with self.assertRaises(sequences.Error):
            fa.replace_interval(5,10,'x')

        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(0, 0, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', 'NEWCGTA', 'IIIBCDE'))

        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(4, 4, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', 'ACGTNEW', 'ABCDIII'))

        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(2, 3, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', 'ACNEWA', 'ABIIIE'))

        # replacement seq and quality lengths must agree
        with self.assertRaises(sequences.Error):
            fq.replace_interval(1,1,'x', 'xx')

    def test_search_string(self):
        '''Check that search_string() finds all the hits'''
        fa = sequences.Fasta('X', 'AAA')
        hits = fa.search('G')
        self.assertTrue(len(hits) == 0)
        hits = fa.search('AAA')
        self.assertListEqual(hits, [(0, '+')])
        hits = fa.search('AA')
        self.assertListEqual(hits, [(0, '+'), (1, '+')])
        hits = fa.search('TTT')
        self.assertListEqual(hits, [(0, '-')])

    def test_to_Fastq(self):
        '''Check to_Fastq converts OK, including out of range quality scores'''
        fa = sequences.Fasta('X', 'AAAAA')
        quals = [-1, 0, 40, 93, 94]
        self.assertEqual(sequences.Fastq('X', 'AAAAA', '!!I~~'), fa.to_Fastq(quals))
        with self.assertRaises(sequences.Error):
            fa.to_Fastq('AAAAAAAAAAAAA')

    def test_translate(self):
        '''Test nucleotide -> amino acid conversion works on Fasta'''
        fa = sequences.Fasta('ID', 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA')
        self.assertEqual(sequences.Fasta('ID', 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'), fa.translate())
        self.assertEqual(sequences.Fasta('ID', 'QPRLEGDAGVTMTIAVKSNREAGVTI*SFYCYSCLKRCSFHPRLAVHPRLQPRLGTM*SWFNS'), fa.translate(frame=1))
        self.assertEqual(sequences.Fasta('ID', 'SRG*KATPA*Q*RLL*RATGRRGSPYNHFIATPA*KDVLSTPA*QFILVYNHDLVLCSRGLIV'), fa.translate(frame=2))

    def test_expand_nucleotides(self):
        '''Test expand_nucleotides'''
        tests = [
            (sequences.Fasta('1', 'A'), [sequences.Fasta('1.1', 'A')]),
            (sequences.Fasta('2', 'C'), [sequences.Fasta('2.1', 'C')]),
            (sequences.Fasta('3', 'G'), [sequences.Fasta('3.1', 'G')]),
            (sequences.Fasta('4', 'T'), [sequences.Fasta('4.1', 'T')]),
            (sequences.Fasta('6', 'R'), [sequences.Fasta('6.1', 'A'), sequences.Fasta('6.2', 'G')]),
            (sequences.Fasta('7', 'Y'), [sequences.Fasta('7.1', 'C'), sequences.Fasta('7.2', 'T')]),
            (sequences.Fasta('8', 'S'), [sequences.Fasta('8.1', 'C'), sequences.Fasta('8.2', 'G')]),
            (sequences.Fasta('9', 'W'), [sequences.Fasta('9.1', 'A'), sequences.Fasta('9.2', 'T')]),
            (sequences.Fasta('10', 'K'), [sequences.Fasta('10.1', 'G'), sequences.Fasta('10.2', 'T')]),
            (sequences.Fasta('11', 'M'), [sequences.Fasta('11.1', 'A'), sequences.Fasta('11.2', 'C')]),
            (sequences.Fasta('12', 'B'), [sequences.Fasta('12.1', 'C'), sequences.Fasta('12.2', 'G'), sequences.Fasta('12.3', 'T')]),
            (sequences.Fasta('13', 'D'), [sequences.Fasta('13.1', 'A'), sequences.Fasta('13.2', 'G'), sequences.Fasta('13.3', 'T')]),
            (sequences.Fasta('14', 'H'), [sequences.Fasta('14.1', 'A'), sequences.Fasta('14.2', 'C'), sequences.Fasta('14.3', 'T')]),
            (sequences.Fasta('15', 'V'), [sequences.Fasta('15.1', 'A'), sequences.Fasta('15.2', 'C'), sequences.Fasta('15.3', 'G')]),
            (sequences.Fasta('16', 'N'), [sequences.Fasta('16.1', 'A'), sequences.Fasta('16.2', 'C'), sequences.Fasta('16.3', 'G'), sequences.Fasta('16.4', 'T')]),
            (sequences.Fasta('17', 'ART'), [sequences.Fasta('17.1', 'AAT'), sequences.Fasta('17.2', 'AGT')]),
            (sequences.Fasta('18', 'ARRT'), [sequences.Fasta('18.1', 'AAAT'), sequences.Fasta('18.2', 'AAGT'), sequences.Fasta('18.3', 'AGAT'), sequences.Fasta('18.4', 'AGGT')]),
            (sequences.Fasta('19', 'ARTR'), [sequences.Fasta('19.1', 'AATA'), sequences.Fasta('19.2', 'AATG'), sequences.Fasta('19.3', 'AGTA'), sequences.Fasta('19.4', 'AGTG')]),
            (sequences.Fastq('20', 'ART', 'GHI'), [sequences.Fastq('20.1', 'AAT', 'GHI'), sequences.Fastq('20.2', 'AGT', 'GHI')]),
        ]

        for t in tests:
            self.assertListEqual(t[0].expand_nucleotides(), t[1])

    def test_split_capillary_id(self):
        '''Tests that we get information from a sanger capillary read name OK'''
        ids = ['abcde.p1k', 'abcde.x.p1k', 'abcde.p1ka', 'abcde.q1k', 'abcde.w2k']
        expected = [{'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1k'},
                    {'prefix': 'abcde.x', 'dir': 'fwd', 'suffix': 'p1k'},
                    {'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1ka'},
                    {'prefix': 'abcde', 'dir': 'rev', 'suffix': 'q1k'},
                    {'prefix': 'abcde', 'dir': 'unk', 'suffix': 'w2k'}]

        for i in range(len(ids)):
            fa = sequences.Fasta(ids[i], 'A')
            self.assertEqual(fa.split_capillary_id(), expected[i])

        # a name with no capillary suffix is an error
        with self.assertRaises(sequences.Error):
            fa = sequences.Fasta('name', 'A')
            fa.split_capillary_id()
382 | ||
383 | ||
class TestEmbl(unittest.TestCase):
    '''Tests for reading EMBL and GenBank records via sequences.Embl.'''

    def test_get_id_from_header_line(self):
        '''Test get id from header line of EMBL'''
        embl = sequences.Embl('ID', 'ACGT')
        # (header line, should parse OK?) pairs
        cases = [
            ('ID X; blah', True),
            ('LOCUS X foo', True),
            ('ID X;', False),
            ('XX X;', False),
        ]
        for header, parses in cases:
            if parses:
                self.assertEqual(embl._get_id_from_header_line(header), 'X')
            else:
                with self.assertRaises(sequences.Error):
                    embl._get_id_from_header_line(header)


    def test_get_next_from_embl_file(self):
        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.embl'))
        embl = sequences.Embl()
        record_number = 0

        while embl.get_next_from_file(f_in):
            record_number += 1
            wanted = sequences.Fasta('seq' + str(record_number), expected_embl[record_number - 1])
            self.assertEqual(embl, wanted)

        utils.close(f_in)


    def test_get_next_from_gbk_file(self):
        expected = [
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa']

        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.gbk'))
        embl = sequences.Embl()
        record_number = 0

        while embl.get_next_from_file(f_in):
            record_number += 1
            wanted = sequences.Fasta('NAME' + str(record_number), expected[record_number - 1])
            self.assertEqual(embl, wanted)

        utils.close(f_in)
421 | ||
422 | ||
class TestFastq(unittest.TestCase):
    '''Tests for the sequences.Fastq class.'''

    def setUp(self):
        self.fastq = sequences.Fastq('ID', 'ACGTA', 'IIIII')

    def test_init(self):
        '''__init__ should get the ID, sequence and quality correctly'''
        self.assertEqual(self.fastq.id, 'ID')
        self.assertEqual(self.fastq.seq, 'ACGTA')
        self.assertEqual(self.fastq.qual, 'IIIII')

    def test_init_length_mismatch(self):
        '''__init__ should raise an error when length of seq and quality not the same'''
        with self.assertRaises(sequences.Error):
            sequences.Fastq('X', 'A', 'II')

    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from file OK, and raise error at badly formatted file'''
        # each file is broken in a different way; reading any of them
        # should raise an Error at some point
        bad_files = ['sequences_test_fail_no_AT.fq',
                     'sequences_test_fail_no_seq.fq',
                     'sequences_test_fail_no_plus.fq',
                     'sequences_test_fail_no_qual.fq']

        bad_files = [os.path.join(data_dir, x) for x in bad_files]

        for fname in bad_files:
            f_in = utils.open_file_read(fname)
            fq = sequences.Fastq()
            with self.assertRaises(sequences.Error):
                while fq.get_next_from_file(f_in):
                    pass

            utils.close(f_in)

        # a well-formed file should read through without error
        fname = os.path.join(data_dir, 'sequences_test_good_file.fq')
        try:
            f_in = open(fname)
        except IOError:
            print("Error opening '" + fname + "'", file=sys.stderr)
            sys.exit(1)

        fq = sequences.Fastq()
        while fq.get_next_from_file(f_in):
            self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
        utils.close(f_in)

    def test_revcomp(self):
        '''revcomp() should correctly reverse complement a sequence'''
        # quality string must be reversed along with the sequence
        fq = sequences.Fastq('ID', 'ACGTNacgtn', '1234567890')
        fq.revcomp()
        self.assertEqual(fq, sequences.Fastq('ID', 'nacgtNACGT', '0987654321'))

    def test_trim_Ns(self):
        '''trim_Ns() should do the right trimming of a fastq sequence'''
        fq = sequences.Fastq('ID', 'ANNANA', '111111')
        test_seqs = [sequences.Fastq('ID', 'ANNANA', '111111'),
                     sequences.Fastq('ID', 'NANNANA', '1111111'),
                     sequences.Fastq('ID', 'NANNANAN', '11111111'),
                     sequences.Fastq('ID', 'ANNANAN', '1111111'),
                     sequences.Fastq('ID', 'NNNNNNANNANAN', '1111111111111'),
                     sequences.Fastq('ID', 'NNANNANANn', '1111111111')]

        for s in test_seqs:
            s.trim_Ns()
            self.assertEqual(fq, s)

    def test_trim(self):
        '''trim() should trim the right number of bases off start and end'''
        fq = sequences.Fastq('ID', '1234567890', '1234567890')
        fq.trim(0, 0)
        self.assertEqual(fq, sequences.Fastq('ID', '1234567890', '1234567890'))

        fq = sequences.Fastq('ID', '1234567890', '1234567890')
        fq.trim(1, 0)
        self.assertEqual(fq, sequences.Fastq('ID', '234567890', '234567890'))

        fq = sequences.Fastq('ID', '1234567890', '1234567890')
        fq.trim(0, 1)
        self.assertEqual(fq, sequences.Fastq('ID', '123456789', '123456789'))

        fq = sequences.Fastq('ID', '1234567890', '1234567890')
        fq.trim(2, 2)
        self.assertEqual(fq, sequences.Fastq('ID', '345678', '345678'))

    def test_to_Fasta_and_qual(self):
        '''Check to_Fasta_and_qual converts quality scores correctly'''
        fq = sequences.Fastq('ID', 'ACGT', '>ADI')
        (fa, qual) = fq.to_Fasta_and_qual()
        self.assertEqual(fa, sequences.Fasta('ID', 'ACGT'))
        self.assertListEqual(qual, [29, 32, 35, 40])


    def test_translate(self):
        '''Test nucleotide -> amino acid conversion works on Fastq'''
        fq = sequences.Fastq('ID', 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII')

        self.assertEqual(sequences.Fastq('ID', 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII'), fq.translate())
519 | ||
class TestFileReader(unittest.TestCase):
    '''Tests for sequences.file_reader() over the supported input formats.'''

    def test_file_reader_fasta(self):
        '''file_reader should iterate through a fasta file correctly'''
        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test.fa'))
        counter = 1
        for seq in reader:
            self.assertEqual(seq, sequences.Fasta(str(counter), 'ACGTA'))
            counter += 1

    def test_file_reader_fastq(self):
        '''file_reader should iterate through a fastq file correctly'''
        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_good_file.fq'))
        for seq in reader:
            self.assertEqual(seq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))

    def test_file_reader_bad_format(self):
        '''file_reader should die properly when not given fasta or fastq file'''
        with self.assertRaises(sequences.Error):
            reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_not_a_fastaq_file'))
            for seq in reader:
                pass

    def test_file_reader_gff(self):
        '''Test read gff file'''
        good_files = [
            'sequences_test_gffv3.gff',
            'sequences_test_gffv3.no_FASTA_line.gff'
        ]
        good_files = [os.path.join(data_dir, x) for x in good_files]

        for f in good_files:
            reader = sequences.file_reader(f)
            counter = 1
            for seq in reader:
                self.assertEqual(seq, sequences.Fasta('seq' + str(counter), 'ACGTACGTAC'))
                counter += 1

        # files with no sequence section should raise an error
        bad_files = [
            'sequences_test_gffv3.no_seq.gff',
            'sequences_test_gffv3.no_seq.2.gff'
        ]
        bad_files = [os.path.join(data_dir, x) for x in bad_files]

        for filename in bad_files:
            with self.assertRaises(sequences.Error):
                reader = sequences.file_reader(filename)
                for seq in reader:
                    pass

    def test_file_reader_embl(self):
        '''Test read embl file'''
        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test.embl'))

        counter = 1
        for seq in reader:
            self.assertEqual(seq, sequences.Fasta('seq' + str(counter), expected_embl[counter-1]))
            counter += 1

        bad_files = [
            'sequences_test.embl.bad',
            'sequences_test.embl.bad2',
        ]
        bad_files = [os.path.join(data_dir, x) for x in bad_files]

        for filename in bad_files:
            with self.assertRaises(sequences.Error):
                reader = sequences.file_reader(filename)
                for seq in reader:
                    pass

    def test_file_reader_phylip(self):
        '''Test read phylip file'''
        test_files = [
            'sequences_test_phylip.interleaved',
            'sequences_test_phylip.interleaved2',
            'sequences_test_phylip.sequential'
        ]

        test_files = [os.path.join(data_dir, f) for f in test_files]

        expected_seqs = [
            sequences.Fasta('Turkey', 'AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT'),
            sequences.Fasta('Salmo_gair', 'AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT'),
            sequences.Fasta('H. Sapiens', 'ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA')
        ]

        for fname in test_files:
            reader = sequences.file_reader(fname)
            i = 0
            for seq in reader:
                self.assertEqual(expected_seqs[i], seq)
                i += 1

        # files made by seaview are a little different in the first line.
        # Test one of these
        expected_seqs = [
            sequences.Fasta('seq1', 96 * 'G' + 'T'),
            sequences.Fasta('seq2', 94 * 'A' + 'G')
        ]

        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_phylip.made_by_seaview'))
        i = 0
        for seq in reader:
            self.assertEqual(expected_seqs[i], seq)
            i += 1
626 | ||
627 | ||
class TestOther(unittest.TestCase):
    '''Tests for module-level helper functions in sequences.'''

    def test_orfs_from_aa_seq(self):
        '''Test _orfs_from_aa_seq()'''
        # (amino acid string, expected ORF intervals) pairs
        cases = [
            ('', []),
            ('*', []),
            ('**', []),
            ('A', [intervals.Interval(0, 0)]),
            ('A*A*A', [intervals.Interval(0, 1), intervals.Interval(2, 3), intervals.Interval(4, 4)]),
            ('AB**CDE*AB', [intervals.Interval(0, 2), intervals.Interval(4, 7), intervals.Interval(8, 9)]),
            ('*ABCDE*', [intervals.Interval(1, 6)]),
            ('**ABCDE**', [intervals.Interval(2, 7)]),
        ]

        for aa_seq, expected in cases:
            self.assertListEqual(expected, sequences._orfs_from_aa_seq(aa_seq))
652 | ||
653 | ||
# Allow running this test module directly with python3.
if __name__ == '__main__':
    unittest.main()
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import sys | |
3 | import filecmp | |
4 | import os | |
5 | import unittest | |
6 | from fastaq import tasks, sequences | |
7 | ||
8 | modules_dir = os.path.dirname(os.path.abspath(sequences.__file__)) | |
9 | data_dir = os.path.join(modules_dir, 'tests', 'data') | |
10 | ||
# Module-local exception class (not raised by the visible tests themselves).
class Error (Exception): pass
12 | ||
13 | ||
class TestCapillaryToPairs(unittest.TestCase):
    def test_capillary_to_pairs(self):
        '''Check that capillary reads file converted to paired and unpaired'''
        tmp_prefix = 'tmp.cap_to_pairs'
        infile = os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa')
        tasks.capillary_to_pairs(infile, tmp_prefix)

        # sequences have been hashed, so could be in any order in the output
        # files. So load each file into a dict and compare contents instead
        # of comparing files byte-for-byte.
        for suffix in ['.paired.gz', '.unpaired.gz']:
            correct = {}
            tasks.file_to_dict(infile + suffix, correct)
            got = {}
            tasks.file_to_dict(tmp_prefix + suffix, got)
            self.assertDictEqual(got, correct)
            os.unlink(tmp_prefix + suffix)
33 | ||
34 | ||
class TestDeinterleave(unittest.TestCase):
    def test_deinterleave(self):
        '''deinterleave should deal with an interleaved file correctly'''
        tmp_1 = 'tmp.deinterleaved_1.fa'
        tmp_2 = 'tmp.deinterleaved_2.fa'
        expected_1 = os.path.join(data_dir, 'sequences_test_deinterleaved_1.fa')
        expected_2 = os.path.join(data_dir, 'sequences_test_deinterleaved_2.fa')

        # FASTA in -> two FASTA files out
        tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved.fa'), tmp_1, tmp_2)
        self.assertTrue(filecmp.cmp(expected_1, tmp_1))
        self.assertTrue(filecmp.cmp(expected_2, tmp_2))

        # FASTQ in, with conversion to FASTA on the way out
        tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved.fq'), tmp_1, tmp_2, fasta_out=True)
        self.assertTrue(filecmp.cmp(expected_1, tmp_1))
        self.assertTrue(filecmp.cmp(expected_2, tmp_2))

        # a badly interleaved input file should raise an error
        with self.assertRaises(tasks.Error):
            tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved_bad.fa'), tmp_1, tmp_2)
        os.unlink(tmp_1)
        os.unlink(tmp_2)
52 | ||
53 | ||
class TestEnumerateNames(unittest.TestCase):
    def test_enumerate_names(self):
        '''Test enumerate_names works with all options'''
        outfile = 'tmp.enumerate_seqs.fa'
        rename_out = outfile + '.rename'
        # default options
        tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1'), outfile))
        # with a rename file recording old name -> new name
        tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile, rename_file=rename_out)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1'), outfile))
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1.rename_file'), rename_out))
        # numbering starting from 2 instead of 1
        tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile, start_index=2)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.2'), outfile))
        # keeping the /1, /2 illumina suffixes on renamed reads
        tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile, keep_illumina_suffix=True)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.keep_suffix'), outfile))
        os.unlink(outfile)
        os.unlink(rename_out)
70 | ||
71 | ||
class TestExpandNucleotides(unittest.TestCase):
    def test_expand_nucleoties(self):
        '''Test expand_nucleotides'''
        tmp = 'tmp.expanded'
        # run once on a FASTQ input and once on a FASTA input
        # (note: 'expend' is the real on-disk name of the data files)
        for extension in ['fq', 'fa']:
            infile = os.path.join(data_dir, 'tasks_test_expend_nucleotides.in.' + extension)
            expected = os.path.join(data_dir, 'tasks_test_expend_nucleotides.out.' + extension)
            tasks.expand_nucleotides(infile, tmp)
            self.assertTrue(filecmp.cmp(expected, tmp, shallow=False))
            os.unlink(tmp)
86 | ||
87 | ||
class TestExtendGaps(unittest.TestCase):
    def test_extend_gaps(self):
        '''extend_gaps with trim=2 should produce the expected output file'''
        infile = os.path.join(data_dir, 'sequences_test_extend_gaps.fa')
        got = 'tmp.gap_extend.fa'
        tasks.extend_gaps(infile, got, trim=2)
        self.assertTrue(filecmp.cmp(infile + '.out', got))
        os.unlink(got)
95 | ||
96 | ||
class TestFastqToMiraXml(unittest.TestCase):
    def test_fastaq_to_mira_xml(self):
        '''fastaq_to_mira_xml should write the expected xml file from a fastq file'''
        xml_out = 'tmp.mira.xml'
        tasks.fastaq_to_mira_xml(os.path.join(data_dir, 'sequences_test_good_file.fq'), xml_out)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_good_file_mira.xml'), xml_out))
        os.unlink(xml_out)
104 | ||
105 | ||
class TestFastaqToOrfsGFF(unittest.TestCase):
    def test_fastaq_to_orfs_gff(self):
        '''fastaq_to_orfs_gff with min_length=120 should write the expected gff file'''
        gff_out = 'tmp.orfs.gff'
        tasks.fastaq_to_orfs_gff(os.path.join(data_dir, 'sequences_test_orfs.fa'), gff_out, min_length=120)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_orfs.gff'), gff_out, shallow=False))
        os.unlink(gff_out)
113 | ||
114 | ||
class TestFilter(unittest.TestCase):
    def test_length_filter(self):
        '''Filtering by length should keep only sequences within the cutoffs'''
        infile = os.path.join(data_dir, 'sequences_test_length_filter.fa')
        outfile = 'tmp.length_filter.fa'
        checks = [
            ((0, 1), 'sequences_test_length_filter.min-0.max-1.fa'),
            ((0, float('inf')), 'sequences_test_length_filter.min-0.max-inf.fa'),
            ((4, 4), 'sequences_test_length_filter.min-4.max-4.fa'),
        ]
        for (min_len, max_len), expected in checks:
            tasks.filter(infile, outfile, minlength=min_len, maxlength=max_len)
            self.assertTrue(filecmp.cmp(os.path.join(data_dir, expected), outfile))
            os.unlink(outfile)

    def test_regex_filter(self):
        '''Filtering by name regex should keep only sequences whose names match'''
        infile = os.path.join(data_dir, 'sequences_test_filter_by_regex.fa')
        outfile = 'tmp.regex_filter.fa'
        checks = [
            ('^[0-9]+$', 'sequences_test_filter_by_regex.numeric.fa'),
            ('/1$', 'sequences_test_filter_by_regex.first-of-pair.fa'),
            ('^a', 'sequences_test_filter_by_regex.first-char-a.fa'),
        ]
        for regex, expected in checks:
            tasks.filter(infile, outfile, regex=regex)
            self.assertTrue(filecmp.cmp(os.path.join(data_dir, expected), outfile))
            os.unlink(outfile)

    def test_ids_from_file_filter(self):
        '''Filtering with a file of read names should keep only the listed reads'''
        infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
        outfile = 'tmp.ids_file_filter.fa'
        tasks.filter(infile, outfile, ids_file=infile + '.ids')
        self.assertTrue(filecmp.cmp(infile + '.filtered', outfile))
        os.unlink(outfile)

    def test_invert_filter(self):
        '''Filtering with invert=True should keep exactly the reads that would otherwise be discarded'''
        infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
        outfile = 'tmp.ids_file_filter.fa'
        tasks.filter(infile, outfile, ids_file=infile + '.ids', invert=True)
        self.assertTrue(filecmp.cmp(infile + '.filtered.invert', outfile))
        os.unlink(outfile)
159 | ||
160 | ||
class TestGetSeqsFlankingGaps(unittest.TestCase):
    def test_get_seqs_flanking_gaps(self):
        '''get_seqs_flanking_gaps should write the expected flanking sequences file'''
        infile = os.path.join(data_dir, 'sequences_test_get_seqs_flanking_gaps.fa')
        outfile = 'tmp.seqs_flanking_gaps'
        tasks.get_seqs_flanking_gaps(infile, outfile, 3, 3)
        self.assertTrue(filecmp.cmp(outfile, infile + '.out'))
        os.unlink(outfile)
167 | ||
168 | ||
class TestInterleave(unittest.TestCase):
    def test_interleave(self):
        '''interleave should merge two mate files, and raise Error on inconsistent input'''
        tmp = 'tmp.interleaved.fa'

        def test_path(name):
            return os.path.join(data_dir, name)

        tasks.interleave(test_path('sequences_test_deinterleaved_1.fa'),
                         test_path('sequences_test_deinterleaved_2.fa'),
                         tmp)
        self.assertTrue(filecmp.cmp(test_path('sequences_test_interleaved.fa'), tmp))

        # both flavours of bad input (mismatched mates) must raise
        for prefix in ['sequences_test_deinterleaved_bad', 'sequences_test_deinterleaved_bad2']:
            with self.assertRaises(tasks.Error):
                tasks.interleave(test_path(prefix + '_1.fa'),
                                 test_path(prefix + '_2.fa'),
                                 tmp)
        os.unlink(tmp)
188 | ||
189 | ||
class TestMakeRandomContigs(unittest.TestCase):
    def test_make_random_contigs(self):
        '''make_random_contigs should make contigs with the expected names and lengths'''
        # Sequence content is random (same seed cannot be relied on to reproduce),
        # so only compare sequence names and lengths against the expected files.
        def same_names_and_lengths(file1, file2):
            seqs1 = {}
            seqs2 = {}
            tasks.file_to_dict(file1, seqs1)
            tasks.file_to_dict(file2, seqs2)
            if len(seqs1) != len(seqs2):
                return False

            for name, seq1 in seqs1.items():
                seq2 = seqs2[name]
                if seq1.id != seq2.id:
                    return False
                if len(seq1) != len(seq2):
                    return False

            return True

        tmp = 'tmp.random_contigs.fa'
        checks = [
            (2, {}, 'sequences_test_make_random_contigs.default.fa'),
            (2, {'prefix': 'p'}, 'sequences_test_make_random_contigs.prefix-p.fa'),
            (2, {'first_number': 42}, 'sequences_test_make_random_contigs.first-42.fa'),
            (28, {'name_by_letters': True}, 'sequences_test_make_random_contigs.name-by-letters.fa'),
        ]
        for n_contigs, kwargs, expected in checks:
            tasks.make_random_contigs(n_contigs, 3, tmp, **kwargs)
            self.assertTrue(same_names_and_lengths(os.path.join(data_dir, expected), tmp))
        os.unlink(tmp)
223 | ||
224 | ||
class TestMakeLongReads(unittest.TestCase):
    def test_tiling_reads(self):
        '''make_long_reads with method='tiling' should write the expected reads'''
        outfile = 'tmp.out.fa'
        infile = os.path.join(data_dir, 'tasks_test_make_long_reads.input.fa')
        tasks.make_long_reads(infile, outfile, method='tiling', fixed_read_length=10, tile_step=5)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_make_long_reads.output.fa'), outfile, shallow=False))
        os.unlink(outfile)
232 | ||
233 | ||
class TestMergeToOneSeq(unittest.TestCase):
    def _check_merge(self, extension):
        '''Run merge_to_one_seq on the test file with the given extension and check the output'''
        tmp = 'tmp.merged.' + extension
        tasks.merge_to_one_seq(os.path.join(data_dir, 'sequences_test_merge_to_one_seq.' + extension), tmp)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_merge_to_one_seq.merged.' + extension), tmp, shallow=False))
        os.unlink(tmp)

    def test_merge_to_one_seq_fa(self):
        '''merge_to_one_seq should produce the expected merged fasta file'''
        self._check_merge('fa')

    def test_merge_to_one_seq_fq(self):
        '''merge_to_one_seq should produce the expected merged fastq file'''
        self._check_merge('fq')
248 | ||
class TestReverseComplement(unittest.TestCase):
    def test_reverse_complement(self):
        '''reverse_complement should correctly reverse complement each sequence in a file'''
        outfile = 'tmp.revcomp.fa'
        tasks.reverse_complement(os.path.join(data_dir, 'sequences_test.fa'), outfile)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_revcomp.fa'), outfile))
        os.unlink(outfile)
256 | ||
257 | ||
class TestScaffoldsToContigs(unittest.TestCase):
    def _run_and_check(self, expected_suffix, **kwargs):
        '''Run scaffolds_to_contigs with the given options and compare to the expected file'''
        outfile = 'tmp.contigs.fa'
        tasks.scaffolds_to_contigs(os.path.join(data_dir, 'utils_test_scaffolds.fa'), outfile, **kwargs)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'utils_test_scaffolds.fa' + expected_suffix), outfile))
        os.unlink(outfile)

    def test_scaffolds_to_contigs(self):
        '''scaffolds_to_contigs should write the expected contigs file'''
        self._run_and_check('.to_contigs.fa')

    def test_scaffolds_to_contigs_number_contigs(self):
        '''scaffolds_to_contigs with number_contigs=True should write the expected file'''
        self._run_and_check('.to_contigs.number_contigs.fa', number_contigs=True)
272 | ||
273 | ||
class TestSearchForSeq(unittest.TestCase):
    def test_search_for_seq(self):
        '''search_for_seq should report all hits of the search string'''
        hits_out = 'tmp.search.fa'
        tasks.search_for_seq(os.path.join(data_dir, 'sequences_test_search_string.fa'), hits_out, 'AGA')
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_search_string.fa.hits'), hits_out))
        os.unlink(hits_out)
281 | ||
282 | ||
class TestSequenceTrim(unittest.TestCase):
    def test_sequence_trim(self):
        '''sequence_trim should write the expected trimmed files for both mate files'''
        out1 = 'tmp.trimmed_1.fa'
        out2 = 'tmp.trimmed_2.fa'
        reads1 = os.path.join(data_dir, 'tasks_test_sequence_trim_1.fa')
        reads2 = os.path.join(data_dir, 'tasks_test_sequence_trim_2.fa')
        seqs_to_trim = os.path.join(data_dir, 'tasks_test_sequences_to_trim.fa')
        tasks.sequence_trim(reads1, reads2, out1, out2, seqs_to_trim, min_length=10, check_revcomp=True)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_sequence_trim_1.trimmed.fa'), out1))
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_sequence_trim_2.trimmed.fa'), out2))
        os.unlink(out1)
        os.unlink(out2)
298 | ||
299 | ||
class TestTranslate(unittest.TestCase):
    def test_translate(self):
        '''translate should produce the expected output for each of the three frames'''
        outfile = 'tmp.translated.fa'
        infile = os.path.join(data_dir, 'sequences_test_translate.fa')
        for frame in 0, 1, 2:
            tasks.translate(infile, outfile, frame=frame)
            self.assertTrue(filecmp.cmp(infile + '.frame' + str(frame), outfile))

        os.unlink(outfile)
309 | ||
310 | ||
class TestTrim(unittest.TestCase):
    def test_trim(self):
        '''trim should remove the given number of bases off the ends of each sequence'''
        trimmed = 'tmp.trim.fq'
        tasks.trim(os.path.join(data_dir, 'sequences_test_untrimmed.fq'), trimmed, 2, 1)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_trimmed.fq'), trimmed))
        os.unlink(trimmed)

    def test_trim_Ns_at_end(self):
        '''trim_Ns_at_end should trim Ns off the ends of sequences correctly'''
        trimmed = 'tmp.trim.fa'
        tasks.trim_Ns_at_end(os.path.join(data_dir, 'sequences_test_trim_Ns_at_end.fa'), trimmed)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_trim_Ns_at_end.fa.trimmed'), trimmed))
        os.unlink(trimmed)
326 | ||
327 | ||
class TestFileToDict(unittest.TestCase):
    def test_file_to_dict(self):
        '''file_to_dict should fill the dictionary with every sequence in the file'''
        got = {}
        tasks.file_to_dict(os.path.join(data_dir, 'sequences_test.fa'), got)
        # the test file has four sequences named '1'..'4', each with sequence ACGTA
        expected = {str(i): sequences.Fasta(str(i), 'ACGTA') for i in range(1, 5)}

        self.assertSequenceEqual(got.keys(), expected.keys())
        for key, seq in expected.items():
            self.assertEqual(got[key].id, seq.id)
            self.assertEqual(got[key].seq, seq.seq)
342 | ||
343 | ||
class TestLengthsFromFai(unittest.TestCase):
    def test_lengths_from_fai(self):
        '''lengths_from_fai should load the length of every sequence from a .fai file'''
        got = {}
        # in the test .fai, each sequence is named after its own length
        expected = {str(x): x for x in range(1, 5)}
        tasks.lengths_from_fai(os.path.join(data_dir, 'sequences_test_fai_test.fa.fai'), got)
        self.assertSequenceEqual(got.keys(), expected.keys())
        for name, length in got.items():
            self.assertEqual(int(name), length)
353 | ||
354 | ||
class TestSplit(unittest.TestCase):
    def test_split_by_base_count(self):
        '''split_by_base_count should split fasta/q files at the given base count'''
        infile = os.path.join(data_dir, 'sequences_test_split_test.fa')
        outprefix = 'tmp.sequences_test_split_test.fa.test'
        expected_suffixes = {2: ['1', '2', '3', '4'],
                             3: ['1', '2', '3'],
                             4: ['1', '2', '3'],
                             6: ['1', '2']}
        for max_bases, suffixes in expected_suffixes.items():
            tasks.split_by_base_count(infile, outprefix, max_bases)
            for suffix in suffixes:
                got = outprefix + '.' + suffix
                self.assertTrue(filecmp.cmp(got, infile + '.' + str(max_bases) + '.' + suffix))
                os.unlink(got)

        # limiting the number of output files should work
        tasks.split_by_base_count(infile, outprefix, 6, 2)
        for i in range(1, 4):
            got = outprefix + '.' + str(i)
            self.assertTrue(filecmp.cmp(got, os.path.join(data_dir, 'sequences_test_split_test.fa.6.limit2.') + str(i)))
            os.unlink(got)

        # a sequence longer than the base count must not be broken up
        tasks.split_by_base_count(os.path.join(data_dir, 'sequences_test_split_test.long.fa'), outprefix, 2)
        for suffix in '1', '2':
            got = outprefix + '.' + suffix
            self.assertTrue(filecmp.cmp(got, os.path.join(data_dir, 'sequences_test_split_test.long.fa.2.' + suffix)))
            os.unlink(got)

    def test_split_by_fixed_size(self):
        '''split_by_fixed_size should cut sequences into pieces of the given size'''
        infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
        outprefix = 'tmp.sequences_test_split'
        tasks.split_by_fixed_size(infile, outprefix, 4, 1)

        for i in range(1, 7):
            expected = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.' + str(i))
            got = outprefix + '.' + str(i)
            self.assertTrue(filecmp.cmp(got, expected))
            os.unlink(got)

        got_coords = outprefix + '.coords'
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.coords'), got_coords))
        os.unlink(got_coords)

    def test_split_by_fixed_size_exclude_Ns(self):
        '''split_by_fixed_size with skip_if_all_Ns=True should write the expected pieces'''
        infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
        outprefix = 'tmp.sequences_test_split'
        tasks.split_by_fixed_size(infile, outprefix, 4, 1, skip_if_all_Ns=True)

        for i in range(1, 5):
            expected = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.' + str(i))
            got = outprefix + '.' + str(i)
            self.assertTrue(filecmp.cmp(got, expected))
            os.unlink(got)

        got_coords = outprefix + '.coords'
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords'), got_coords))
        os.unlink(got_coords)
416 | ||
class TestCountSequences(unittest.TestCase):
    def test_count_sequences(self):
        '''count_sequences should count sequences in fastq, fasta, and empty files'''
        checks = [('sequences_test_good_file.fq', 2),
                  ('sequences_test.fa', 4),
                  ('sequences_test_empty_file', 0)]
        for fname, expected_count in checks:
            self.assertEqual(expected_count, tasks.count_sequences(os.path.join(data_dir, fname)))
423 | ||
class TestGetIds(unittest.TestCase):
    def test_get_ids(self):
        '''get_ids should write the expected IDs file from a fasta/q file'''
        ids_out = 'tmp.ids'
        infile = os.path.join(data_dir, 'sequences_test.fa')
        tasks.get_ids(infile, ids_out)
        self.assertTrue(filecmp.cmp(infile + '.ids', ids_out))
        os.unlink(ids_out)
431 | ||
432 | ||
class TestFastaToFakeQual(unittest.TestCase):
    def test_fasta_to_fake_qual(self):
        '''fastaq_to_fake_qual should write the expected qual file, for default and given q'''
        qual_out = 'tmp.qual'
        infile = os.path.join(data_dir, 'tasks_test_fasta_to_fake_qual.in.fa')
        checks = [({}, 'tasks_test_fasta_to_fake_qual.out.default.qual'),
                  ({'q': 42}, 'tasks_test_fasta_to_fake_qual.out.q42.qual')]
        for kwargs, expected in checks:
            tasks.fastaq_to_fake_qual(infile, qual_out, **kwargs)
            self.assertTrue(filecmp.cmp(os.path.join(data_dir, expected), qual_out, shallow=False))
            os.unlink(qual_out)
444 | ||
445 | ||
class TestFastaToFastq(unittest.TestCase):
    def test_fasta_to_fastq(self):
        '''fasta_to_fastq should combine fasta and qual files into the expected fastq'''
        fasta_in = os.path.join(data_dir, 'sequences_test.fa')
        fastq_out = 'tmp.fq'
        tasks.fasta_to_fastq(fasta_in, fasta_in + '.qual', fastq_out)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.fasta_to_fastq.fq'), fastq_out))

        # a qual file that does not match the fasta must raise
        with self.assertRaises(tasks.Error):
            tasks.fasta_to_fastq(fasta_in, fasta_in + '.qual.bad', fastq_out)

        os.unlink(fastq_out)
460 | ||
461 | ||
class TestReplaceBases(unittest.TestCase):
    def test_sequences_replace_bases(self):
        '''replace_bases should replace every occurrence of the given base'''
        got = 'tmp.replace_bases.fa'
        tasks.replace_bases(os.path.join(data_dir, 'sequences_test_fastaq_replace_bases.fa'), got, 'T', 'X')
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_fastaq_replace_bases.expected.fa'), got))
        os.unlink(got)
469 | ||
470 | ||
class TestStripIlluminaSuffix(unittest.TestCase):
    def test_strip_illumina_suffix(self):
        '''strip_illumina_suffix should strip the suffixes off read names correctly'''
        infile = os.path.join(data_dir, 'sequences_test_strip_illumina_suffix.fq')
        got = 'tmp.stripped.fa'
        tasks.strip_illumina_suffix(infile, got)
        self.assertTrue(filecmp.cmp(infile + '.stripped', got))
        os.unlink(got)
478 | ||
479 | ||
class TestToFasta(unittest.TestCase):
    def test_to_fasta(self):
        '''to_fasta should convert every supported input format to the expected fasta'''
        tmpfile = 'tmp.to_fasta'
        input_names = [
            'sequences_test_good_file.fq',
            'sequences_test_gffv3.gff',
            'sequences_test_gffv3.no_FASTA_line.gff',
            'sequences_test.embl',
            'sequences_test.gbk',
            'sequences_test_phylip.interleaved',
            'sequences_test_phylip.interleaved2',
            'sequences_test_phylip.sequential',
        ]

        for name in input_names:
            infile = os.path.join(data_dir, name)
            tasks.to_fasta(infile, tmpfile)
            self.assertTrue(filecmp.cmp(infile + '.to_fasta', tmpfile))

        # line_length should control the wrapping of output sequence lines
        tasks.to_fasta(os.path.join(data_dir, 'sequences_test.fa'), tmpfile, line_length=3)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.line_length3.fa'), tmpfile))
        # names can be truncated at the first whitespace
        tasks.to_fasta(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa'), tmpfile, strip_after_first_whitespace=True)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa.to_fasta'), tmpfile))
        os.unlink(tmpfile)
506 | ||
507 | ||
class TestToUniqueByID(unittest.TestCase):
    def test_to_unique_by_id(self):
        '''to_unique_by_id should write the expected file of sequences with unique IDs'''
        got = 'tmp.unique_by_id.fa'
        infile = os.path.join(data_dir, 'sequences_test_to_unique_by_id.fa')
        tasks.to_unique_by_id(infile, got)
        self.assertTrue(filecmp.cmp(infile + '.out', got))
        os.unlink(got)
515 | ||
516 | ||
class TestToFastaUnion(unittest.TestCase):
    def test_to_fasta_union(self):
        '''to_fasta_union with a seqname should write the expected single-sequence fasta'''
        got = 'tmp.to_fasta_union'
        tasks.to_fasta_union(os.path.join(data_dir, 'sequences_test_to_fasta_union.in.fa'), got, seqname='testname')
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_to_fasta_union.out.fa'), got, shallow=False))
        os.unlink(got)
524 | ||
525 | ||
# Allow this test file to be run directly as a script
if __name__ == '__main__':
    unittest.main()
528 |
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import sys | |
3 | import os | |
4 | import filecmp | |
5 | import unittest | |
6 | from fastaq import utils | |
7 | ||
# Test data files live next to the installed module, under tests/data
modules_dir = os.path.dirname(os.path.abspath(utils.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
10 | ||
class TestUtils(unittest.TestCase):
    def test_write_and_read(self):
        '''open_file_write() and open_file_read() should do the right thing depending gzipped or not'''
        for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
            f_out = utils.open_file_write(filename)
            for i in range(3):
                print(i, file=f_out)
            utils.close(f_out)

            # read the file back and check we get back what was written
            f_in = utils.open_file_read(filename)
            for expected, line in enumerate(f_in):
                self.assertEqual(expected, int(line.strip()))
            utils.close(f_in)

            os.unlink(filename)

        # '-' means stdin for reading and stdout for writing
        self.assertEqual(sys.stdin, utils.open_file_read('-'))
        self.assertEqual(sys.stdout, utils.open_file_write('-'))

    def test_raise_exception(self):
        '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
        unreadable = [
            'this_file_is_not_here_so_throw_error',
            'this_file_is_not_here_so_throw_error.gz',
            os.path.join(data_dir, 'utils_test_not_really_zipped.gz'),
        ]
        for filename in unreadable:
            with self.assertRaises(utils.Error):
                utils.open_file_read(filename)

        unwritable = [
            os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
            os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'),
        ]
        for filename in unwritable:
            with self.assertRaises(utils.Error):
                utils.open_file_write(filename)

    def test_file_transpose(self):
        '''Test that file_transpose() does what it should'''
        infile = os.path.join(data_dir, 'utils_test_file_transpose.txt')
        tmp_out = 'utils_test_file_transpose.tmp'
        utils.file_transpose(infile, tmp_out)
        self.assertTrue(filecmp.cmp(tmp_out, os.path.join(data_dir, 'utils_test_file_transposed.txt')))
        os.unlink(tmp_out)

    def test_system_call(self):
        '''Test that system call appears to work and die as it should'''
        test_file = os.path.join(data_dir, 'utils_test_system_call.txt')
        tmp_out = 'utils_test_syscall.tmp'
        utils.syscall('cat ' + test_file + ' > ' + tmp_out)
        self.assertTrue(filecmp.cmp(tmp_out, test_file))
        os.unlink(tmp_out)

        # a nonexistent command must raise
        with self.assertRaises(utils.Error):
            utils.syscall('thisisveryunlikelytoebarealcommandandshouldthrowerror')

        # sanity check that filecmp.cmp really does report a mismatch
        utils.syscall('echo "this is not the right string" > ' + tmp_out)
        self.assertFalse(filecmp.cmp(tmp_out, test_file))
        os.unlink(tmp_out)

        self.assertListEqual(['bingo'], utils.syscall_get_stdout('echo bingo'))
        with self.assertRaises(utils.Error):
            utils.syscall_get_stdout('thisisveryunlikelytoebarealcommandandshouldthrowerror')
77 | ||
# Allow this test file to be run directly as a script
if __name__ == '__main__':
    unittest.main()
0 | import os | |
1 | import sys | |
2 | import subprocess | |
3 | import shlex | |
4 | ||
5 | class Error (Exception): pass | |
6 | ||
def open_file_read(filename):
    '''Return a filehandle open for reading.

    '-' means stdin. A filename ending '.gz' is integrity-checked with
    'gunzip -t' and then streamed through 'gunzip -c'. Raises Error if the
    file cannot be opened (or fails gzip's integrity test).'''
    if filename == '-':
        return sys.stdin

    if filename.endswith('.gz'):
        # first check that the file is OK according to gunzip.
        # shlex.quote protects against spaces/shell metacharacters in the name
        # (the original unquoted concatenation broke on such filenames).
        retcode = subprocess.call('gunzip -t ' + shlex.quote(filename), shell=True)
        if retcode != 0:
            raise Error("Error opening for reading gzipped file '" + filename + "'")

        # now open the file, streaming the decompressed contents
        return os.popen('gunzip -c ' + shlex.quote(filename))

    try:
        return open(filename)
    except OSError as e:
        # narrowed from a bare except: only opening failures become Error
        raise Error("Error opening for reading file '" + filename + "'") from e
25 | ||
26 | ||
def open_file_write(filename):
    '''Return a filehandle open for writing.

    '-' means stdout. A filename ending '.gz' is written through 'gzip -9 -c'.
    Raises Error if the file cannot be opened for writing.'''
    if filename == '-':
        return sys.stdout

    if filename.endswith('.gz'):
        # os.popen would report a missing directory asynchronously, so check up front
        if not os.path.exists(os.path.abspath(os.path.dirname(filename))):
            raise Error("Error opening for writing gzipped file '" + filename + "'")

        try:
            # shlex.quote protects against spaces/shell metacharacters in the name
            # (the original unquoted concatenation broke on such filenames)
            return os.popen('gzip -9 -c > ' + shlex.quote(filename), 'w')
        except OSError as e:
            raise Error("Error opening for writing gzipped file '" + filename + "'") from e

    try:
        return open(filename, 'w')
    except OSError as e:
        # narrowed from a bare except: only opening failures become Error
        raise Error("Error opening for writing file '" + filename + "'") from e
45 | ||
46 | ||
def close(filehandle):
    '''Close filehandle, unless it is stdout or stderr (those must stay open).'''
    if filehandle in [sys.stdout, sys.stderr]:
        return
    filehandle.close()
50 | ||
51 | ||
def file_transpose(f_in, f_out, sep_in=None, sep_out='\t'):
    '''Write the transpose of f_in to f_out: row i, column j of the input becomes
    row j, column i of the output.

    f_in, f_out - input and output filenames ('-' for stdin/stdout)
    sep_in      - input field separator (None means any whitespace, as str.split)
    sep_out     - output field separator

    Rows shorter than the longest row are padded with '.' so the output is
    rectangular.'''
    f = open_file_read(f_in)
    rows = [line.rstrip().split(sep_in) for line in f]
    close(f)

    # default=0 makes an empty input produce an empty output,
    # instead of max() raising ValueError on an empty sequence
    columns_out = max((len(row) for row in rows), default=0)

    # pad ragged rows so every column index below exists in every row
    for row in rows:
        row += ['.'] * (columns_out - len(row))

    f = open_file_write(f_out)
    for i in range(columns_out):
        print(sep_out.join(row[i] for row in rows), file=f)

    close(f)
69 | ||
70 | ||
def syscall(cmd):
    '''Run cmd through the shell, raising Error if it exits nonzero.'''
    if subprocess.call(cmd, shell=True) != 0:
        raise Error("Error in system call. Command was:\n" + cmd)
76 | ||
77 | ||
def syscall_get_stdout(cmd):
    '''Run cmd (split with shlex, no shell) and return its stdout as a list of lines.

    Raises Error if the command cannot be run (e.g. the executable does not
    exist). Note that a nonzero exit code is NOT treated as an error.'''
    try:
        out = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE).communicate()[0]
    except Exception as e:
        # narrowed from a bare except (which also swallowed KeyboardInterrupt);
        # typically FileNotFoundError when the command does not exist
        raise Error('Error in system call. I tried to run:\n' + str(cmd)) from e

    return out.decode('utf-8').rstrip().split('\n')
84 | ||
85 |
# Submodules exported by 'from pyfastaq import *'
__all__ = [
    'caf',
    'common',
    'genetic_codes',
    'utils',
    'sequences',
    'tasks',
    'intervals',
    'runners'
]
# Make all of the above submodules available directly on the package
from pyfastaq import *
0 | from pyfastaq import sequences, utils | |
1 | ||
2 | class Error (Exception): pass | |
3 | ||
def file_reader(fname):
    '''Iterate over a caf file, yielding one record at a time.

    Note: a single Caf object is reused for every record, so copy any data
    you need to keep beyond the current iteration.'''
    f = utils.open_file_read(fname)
    record = Caf()
    while record.get_next_from_file(f):
        yield record
    utils.close(f)
12 | ||
13 | ||
class Caf:
    '''One record from a caf file: the read (as a Fastq after qualities are
    parsed) plus selected metadata fields.'''
    def __init__(self):
        self.id = None           # read name, from the 'DNA : <name>' line
        self.seq = None          # sequences.Fastq of the read (Fasta until qualities are read)
        self.insert_min = None   # from 'Insert_size <min> <max>'
        self.insert_max = None
        self.ligation = None     # from 'Ligation_no <value>'
        self.clone = None        # from 'Clone <value>'
        self.clip_start = None   # from 'Clipping QUAL <start> <end>', converted to 0-based
        self.clip_end = None


    def __eq__(self, other):
        # equal only to another Caf with identical attribute values
        if type(other) is type(self):
            return self.__dict__ == other.__dict__
        return False


    def get_next_from_file(self, f):
        '''Read the next record from filehandle f into this object, in place.

        Returns True on success, None at end of file. Raises Error if the
        input does not look like caf data.'''
        self.__init__()  # reset all fields from any previous record
        line = f.readline()
        if not line:
            return None
        # skip blank lines between records
        # NOTE(review): if EOF is hit here, line becomes '' and falls through
        # to the 'DNA : ' check below, raising Error rather than returning
        # None — confirm inputs never end with trailing blank lines
        while line == '\n':
            line = f.readline()

        if not line.startswith('DNA : '):
            raise Error("Error reading caf file. Expected line starting with 'DNA : ...'")

        self.id = line.rstrip().split()[2]

        # sequence lines follow, up to the next blank line
        line = f.readline()
        seq = []

        while line != '\n':
            seq.append(line.rstrip())
            line = f.readline()

        self.seq = sequences.Fasta(self.id, ''.join(seq))

        line = f.readline()
        if not line.startswith('BaseQuality : '):
            raise Error("Error reading caf file. Expected line starting with 'BaseQuality : ...'")

        # qualities are a single line of space-separated integers;
        # attach them to the sequence, converting it to a Fastq
        quals = [int(x) for x in f.readline().rstrip().split()]
        self.seq = self.seq.to_Fastq(quals)

        line = f.readline()
        assert line == '\n'
        line = f.readline()

        # metadata block: whitespace-separated key/value lines until a blank
        # line or EOF; unrecognised keys are silently ignored
        while line not in ['', '\n']:
            a = line.rstrip().split()
            if a[0] == 'Insert_size':
                self.insert_min, self.insert_max = int(a[1]), int(a[2])
            elif a[0] == 'Ligation_no':
                self.ligation = a[1]
            elif a[0] == 'Clone':
                self.clone = a[1]
            elif a[0] == 'Clipping' and a[1] == 'QUAL':
                # caf coordinates are 1-based; store 0-based
                self.clip_start, self.clip_end = int(a[2]) - 1, int(a[3]) - 1

            line = f.readline()

        return True
0 | version = '3.2.0' |
# Genetic (codon -> amino acid) translation tables, keyed by NCBI table number.
codes = {}

# All 64 codons in the conventional order (first base varies slowest).
_BASES = 'TCAG'
_CODONS = [b1 + b2 + b3 for b1 in _BASES for b2 in _BASES for b3 in _BASES]

# standard genetic code (table 1): one amino acid letter per codon in
# _CODONS order, with '*' marking a stop codon
codes[1] = dict(zip(
    _CODONS,
    'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
))

# mycoplasma genetic code (table 4): identical to the standard code except
# that TGA codes for tryptophan instead of being a stop codon
codes[4] = dict(codes[1])
codes[4]['TGA'] = 'W'
138 |
class Error (Exception):
    '''Exception type raised for errors in this module.'''
    pass
1 | ||
2 | ||
class Interval:
    '''A class to deal with intervals in a genome. Can do things like intersections, unions etc'''
    def __init__(self, start, end):
        # Coordinates are stored as ints; anything not convertible is an error.
        try:
            self.start = int(start)
            self.end = int(end)
        except ValueError:
            raise Error(f'Error making interval from :"{start}" and "{end}"')

        if self.start > self.end:
            raise Error(f'Error making interval {self}. end < start.')

    def __len__(self):
        # Intervals are inclusive of both endpoints.
        return self.end - self.start + 1

    def __eq__(self, other):
        return type(other) is type(self) and self.__dict__ == other.__dict__

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return f'({self.start},{self.end})'

    def __lt__(self, i):
        # Order by start position, breaking ties on end position.
        if self.start != i.start:
            return self.start < i.start
        return self.end < i.end

    def __le__(self, i):
        if self.start != i.start:
            return self.start < i.start
        return self.end <= i.end

    def intersects(self, i):
        '''Returns true iff this interval intersects the interval i'''
        return i.start <= self.end and self.start <= i.end

    def contains(self, i):
        '''Returns true iff this interval contains the interval i'''
        return self.start <= i.start and i.end <= self.end

    def union(self, i):
        '''Returns the union of this interval with i when they intersect or are
        directly adjacent, otherwise returns None'''
        adjacent = self.end + 1 == i.start or i.end + 1 == self.start
        if not (self.intersects(i) or adjacent):
            return None
        return Interval(min(self.start, i.start), max(self.end, i.end))

    def union_fill_gap(self, i):
        '''Like union, but ignores whether the two intervals intersect or not'''
        return Interval(min(self.start, i.start), max(self.end, i.end))

    def intersection(self, i):
        '''If intervals intersect, returns their intersection, otherwise returns None'''
        if not self.intersects(i):
            return None
        return Interval(max(self.start, i.start), min(self.end, i.end))
58 | ||
59 | ||
def intersection(l1, l2):
    '''Returns intersection of two lists. Assumes the lists are sorted by start positions'''
    if not l1 or not l2:
        return []

    result = []
    j = 0
    n = len(l2)

    for iv in l1:
        # Skip intervals in l2 that end before iv begins.
        while j < n and l2[j].end < iv.start:
            j += 1

        if j == n:
            break

        # Collect every overlap between iv and the current run of l2 intervals.
        while j < n and iv.intersects(l2[j]):
            result.append(iv.intersection(l2[j]))
            j += 1

        # Step back one: the last l2 interval may also overlap the next iv.
        j = max(0, j - 1)

    return result
82 | ||
83 | ||
def merge_overlapping_in_list(l):
    '''Sorts the list in place, then merges any overlapping intervals, and also
    adjacent intervals. e.g. [0,1], [1,2] would be merged to [0,2].'''
    i = 0
    l.sort()

    # Repeatedly try to merge each interval with its successor. Only advance
    # when the pair cannot be merged, since a merge may enable further merges.
    while i < len(l) - 1:
        u = l[i].union(l[i+1])
        if u is not None:
            l[i] = u
            l.pop(i+1)
        else:
            i += 1
97 | ||
98 | ||
def remove_contained_in_list(l):
    '''Sorts list in place, then removes any intervals that are completely
    contained inside another interval'''
    l.sort()
    i = 0

    # Compare each interval with its successor; pop whichever one is contained
    # in the other, advancing only when neither contains its neighbour.
    while i < len(l) - 1:
        current, following = l[i], l[i+1]
        if following.contains(current):
            l.pop(i)
        elif current.contains(following):
            l.pop(i+1)
        else:
            i += 1
112 | ||
113 | ||
def length_sum_from_list(l):
    '''Returns total length of intervals from a list'''
    # Generator expression avoids materialising an intermediate list.
    return sum(len(x) for x in l)
0 | import argparse | |
1 | import sys | |
2 | import random | |
3 | from pyfastaq import sequences, utils, intervals | |
4 | ||
def run(description):
    '''Entry point for the add_indels command. Parses options and makes the
    requested insertions or deletions in the input sequences, writing the
    modified sequences to the output file. Exactly one of --delete,
    --insert, --delete_range, --insert_range must be used.'''
    parser = argparse.ArgumentParser(
        description = description,
        usage = 'fastaq add_indels [options] <infile> <outfile>')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('-d','--delete', action='append', help='Delete the given bases from the given sequence. Format same as samtools view: name:start-end. This option can be used multiple times (once for each region to delete). Overlapping coords will be merged before deleting', metavar='Name:start-end')
    parser.add_argument('--delete_range', help='Deletes bases starting at position P in each sequence of the input file. Deletes start + (n-1)*step bases from sequence n.', metavar='P,start,step')
    parser.add_argument('-i','--insert', action='append', help='Insert a random string of bases at the given position. Format is name:position:number_to_add. Bases are added after the position. This option can be used multiple times', metavar='Name:start:bases')
    parser.add_argument('--insert_range', help='Inserts random bases starting after position P in each sequence of the input file. Inserts start + (n-1)*step bases into sequence n.', metavar='P,start,step')
    options = parser.parse_args()

    # Exactly one of the four editing modes must be chosen
    test_ops = [int(x is not None) for x in [options.delete, options.insert, options.delete_range, options.insert_range]]

    if sum(test_ops) != 1:
        print('Must use one of --delete, --insert, --delete_range, --insert_range. Cannot continue', file=sys.stderr)
        sys.exit(1)


    def range2dic(range_in):
        # Parse a 'P,start,step' option value into a dict, converting the
        # position to 0-based coords. Returns {} if the option was not used.
        if range_in is None:
            return {}
        (pos, start, step) = range_in.split(',')
        d = {}
        d['pos'] = int(pos) - 1
        d['bases'] = int(start)
        d['step'] = int(step)
        return d

    delete_range = range2dic(options.delete_range)
    insert_range = range2dic(options.insert_range)


    # convert the -d regions into sequence name, start and end coords
    to_delete = {}
    if options.delete:
        for s in options.delete:
            # maxsplit=1 so sequence names containing ':' still parse correctly,
            # consistent with the --insert parsing below
            id, coords = s.rsplit(':', 1)
            start, end = [int(x)-1 for x in coords.split('-')]
            if id not in to_delete:
                to_delete[id] = []
            to_delete[id].append(intervals.Interval(start, end))


    to_insert = {}
    if options.insert:
        for s in options.insert:
            id, pos, bases = s.rsplit(':',2)
            pos = int(pos) - 1
            bases = int(bases)
            if id not in to_insert:
                to_insert[id] = []
            to_insert[id].append((pos, bases))


    # the mode check above guarantees we never have both deletions and insertions
    assert len(to_delete) * len(to_insert) == 0

    # merge overlapping regions to be deleted
    for l in to_delete.values():
        intervals.merge_overlapping_in_list(l)

    # sort positions to be inserted
    for l in to_insert.values():
        l.sort()

    # read in the fasta/q file and print outfile with deleted sequences
    seq_reader = sequences.file_reader(options.infile)
    f = utils.open_file_write(options.outfile)

    for seq in seq_reader:
        if seq.id in to_delete:
            # delete regions for this sequence, but start at the end so the
            # coords don't get messed up after the first deletion
            for inter in reversed(to_delete[seq.id]):
                seq.seq = seq.seq[:inter.start] + seq.seq[inter.end + 1:]
        elif options.delete_range:
            seq.seq = seq.seq[:delete_range['pos']] + seq.seq[delete_range['pos'] + delete_range['bases']:]
            delete_range['bases'] += delete_range['step']
        elif seq.id in to_insert:
            # insert at the rightmost positions first so earlier coords stay valid
            for pos, bases in reversed(to_insert[seq.id]):
                seq.seq = seq.seq[:pos + 1] + ''.join([random.choice('ACGT') for x in range(bases)]) + seq.seq[pos + 1:]
        elif options.insert_range:
            seq.seq = seq.seq[:insert_range['pos'] + 1] + ''.join([random.choice('ACGT') for x in range(insert_range['bases'])]) + seq.seq[insert_range['pos'] + 1:]
            insert_range['bases'] += insert_range['step']

        print(seq, file=f)

    utils.close(f)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the caf_to_fastq command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq caf_to_fastq [options] <infile> <outfile>',
        description='Converts CAF file to FASTQ format')
    p.add_argument('infile', help='Name of input CAF file.')
    p.add_argument('outfile', help='Name of output FASTQ file')
    p.add_argument('-c', '--clip', action='store_true', help='Use clipping info to clip reads, if present in the input CAF file (as lines of the form "Clipping QUAL start end"). Default is to not clip')
    p.add_argument('-l', '--min_length', type=int, help='Minimum length of sequence to output [%(default)s]', default=1, metavar='INT')
    opts = p.parse_args()
    tasks.caf_to_fastq(opts.infile, opts.outfile, trim=opts.clip, min_length=opts.min_length)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the capillary_to_pairs command: parse options, delegate to tasks.'''
    parser = argparse.ArgumentParser(
        # "more than one read" fixes a typo in the original help text
        description = 'Given a file of capillary reads, makes an interleaved file of read pairs (where more than one read from same ligation, takes the longest read) and a file of unpaired reads. Replaces the .p1k/.q1k part of read names to denote fwd/rev reads with /1 and /2',
        usage = 'fastaq capillary_to_pairs <infile> <outfiles prefix>')
    parser.add_argument('infile', help='Name of input fasta/q file')
    parser.add_argument('outprefix', help='Prefix of output files', metavar='outfiles prefix')
    options = parser.parse_args()
    tasks.capillary_to_pairs(options.infile, options.outprefix)
11 |
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the chunker command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq chunker [options] <infile> <out> <chunk size> <tolerance>',
        description='Splits a multi sequence file into separate files. Splits sequences into chunks of a fixed size. Aims for chunk_size chunks in each file, but allows a little extra, so chunk can be up to (chunk_size + tolerance), to prevent tiny chunks made from the ends of sequences')
    p.add_argument('infile', help='Name of input file to be split')
    p.add_argument('out', help='Prefix of output file. If --onefile used, then name of single output file')
    p.add_argument('chunk_size', type=int, help='Size of each chunk')
    p.add_argument('tolerance', type=int, help='Tolerance allowed in chunk size')
    p.add_argument('--onefile', action='store_true', help='Output all the sequences in one file')
    p.add_argument('--skip_all_Ns', action='store_true', help='Do not output any sequence that consists of all Ns')
    opts = p.parse_args()
    # Both task functions take the same arguments; pick one based on --onefile
    splitter = tasks.split_by_fixed_size_onefile if opts.onefile else tasks.split_by_fixed_size
    splitter(
        opts.infile,
        opts.out,
        opts.chunk_size,
        opts.tolerance,
        skip_if_all_Ns=opts.skip_all_Ns
    )
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the count_sequences command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq count_sequences <infile>',
        description='Prints the number of sequences in input file to stdout')
    p.add_argument('infile', help='Name of input file')
    opts = p.parse_args()
    print(tasks.count_sequences(opts.infile))
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the deinterleave command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq deinterleave [options] <infile> <out_fwd> <out_rev>',
        description='Deinterleaves sequence file, so that reads are written alternately between two output files')
    p.add_argument('--fasta_out', action='store_true', help='Use this to write output as fasta (default is same as input)', default=False)
    p.add_argument('infile', help='Name of fasta/q file to be deinterleaved')
    p.add_argument('out_fwd', help='Name of output fasta/q file of forwards reads')
    p.add_argument('out_rev', help='Name of output fasta/q file of reverse reads')
    opts = p.parse_args()
    tasks.deinterleave(opts.infile, opts.out_fwd, opts.out_rev, fasta_out=opts.fasta_out)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the enumerate_names command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq enumerate_names [options] <infile> <outfile>',
        description='Renames sequences in a file, calling them 1,2,3... etc')
    p.add_argument('--start_index', type=int, help='Starting number [%(default)s]', default=1)
    p.add_argument('--rename_file', help='If used, will write a file of old name to new name')
    p.add_argument('--keep_suffix', action='store_true', help='Use this to keep a /1 or /2 suffix at the end of each name')
    p.add_argument('--suffix', help='Add the given string to the end of every name', default=None)
    p.add_argument('infile', help='Name of fasta/q file to be read')
    p.add_argument('outfile', help='Name of output fasta/q file')
    opts = p.parse_args()
    tasks.enumerate_names(
        opts.infile,
        opts.outfile,
        start_index=opts.start_index,
        keep_illumina_suffix=opts.keep_suffix,
        rename_file=opts.rename_file,
        suffix=opts.suffix)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the expand_nucleotides command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq expand_nucleotides <infile> <outfile>',
        description='Makes all combinations of sequences in input file by using all possibilities of redundant bases. e.g. ART could be AAT or AGT. Assumes input is nucleotides, not amino acids')
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output file')
    opts = p.parse_args()
    tasks.expand_nucleotides(opts.infile, opts.outfile)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the fasta_to_fastq command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq fasta_to_fastq <fasta in> <qual in> <fastq out>',
        description=description)
    p.add_argument('fasta', help='Name of input FASTA file', metavar='fasta in')
    p.add_argument('qual', help='Name of input quality scores file', metavar='qual in')
    p.add_argument('outfile', help='Name of output FASTQ file', metavar='fastq out')
    opts = p.parse_args()
    tasks.fasta_to_fastq(opts.fasta, opts.qual, opts.outfile)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the filter command: filters sequences by length,
    name regex and/or a file of IDs, optionally handling mate files.'''
    parser = argparse.ArgumentParser(
        description = 'Filters a sequence file by sequence length and/or by name matching a regular expression',
        usage = 'fastaq filter [options] <infile> <outfile>')
    parser.add_argument('--min_length', type=int, help='Minimum length of sequence to keep [%(default)s]', default=0, metavar='INT')
    # type=float (metavar FLOAT to match) so the default of infinity is representable
    parser.add_argument('--max_length', type=float, help='Maximum length of sequence to keep [%(default)s]', default=float('inf'), metavar='FLOAT')
    parser.add_argument('--regex', help='If given, only reads with a name matching the regular expression will be kept')
    parser.add_argument('--ids_file', help='If given, only reads whose ID is in the given file will be used. One ID per line of file.', metavar='FILENAME')
    parser.add_argument('-v', '--invert', action='store_true', help='Only keep sequences that do not match the filters')

    mate_group = parser.add_argument_group('Mate file for read pairs options')
    mate_group.add_argument('--mate_in', help='Name of mates input file. If used, must also provide --mate_out', metavar='FILENAME')
    mate_group.add_argument('--mate_out', help='Name of mates output file', metavar='FILENAME')
    mate_group.add_argument('--both_mates_pass', action='store_true', help='By default, if either mate passes filter, then both reads output. Use this flag to require that both reads of a pair pass the filter')

    parser.add_argument('infile', help='Name of input file to be filtered')
    parser.add_argument('outfile', help='Name of output file')
    options = parser.parse_args()
    tasks.filter(options.infile,
        options.outfile,
        minlength=options.min_length,
        maxlength=options.max_length,
        regex=options.regex,
        ids_file=options.ids_file,
        invert=options.invert,
        mate_in=options.mate_in,
        mate_out=options.mate_out,
        both_mates_pass=options.both_mates_pass,
    )
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the get_ids command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq get_ids <infile> <outfile>',
        description='Gets IDs from each sequence in input file')
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output file')
    opts = p.parse_args()
    tasks.get_ids(opts.infile, opts.outfile)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the get_seq_flanking_gaps command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq get_seq_flanking_gaps [options] <infile> <outfile>',
        description=description)
    p.add_argument('--left', type=int, help='Number of bases to get to left of gap [%(default)s]', default=25, metavar='INT')
    p.add_argument('--right', type=int, help='Number of bases to get to right of gap [%(default)s]', default=25, metavar='INT')
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output file')
    opts = p.parse_args()
    tasks.get_seqs_flanking_gaps(opts.infile, opts.outfile, opts.left, opts.right)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the interleave command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq interleave <infile_1> <infile_2> <outfile>',
        description=description)
    p.add_argument('infile_1', help='Name of first input file')
    p.add_argument('infile_2', help='Name of second input file')
    p.add_argument('outfile', help='Name of output file of interleaved reads')
    opts = p.parse_args()
    tasks.interleave(opts.infile_1, opts.infile_2, opts.outfile)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the long_read_simulate command: parse options, delegate to tasks.'''
    parser = argparse.ArgumentParser(
        description = 'Simulates long reads from a sequence file. Can optionally make insertions into the reads, like pacbio does. If insertions made, coverage calculation is done before the insertions (so total read length may appear longer than expected).',
        usage = 'fastaq long_read_simulate [options] <infile> <outfile>')

    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output FASTA file')

    parser.add_argument('--method', help='How to sample the read positions and lengths. Choose from 1) "tiling", where reads of fixed length are taken at equal intervals from the reference. 2) "uniform", where reads of fixed length taken at positions sampled uniformly. 3) "gamma", where reads lengths are taken from a gamma distribution, and positions sampled uniformly. [%(default)s]', default='tiling', choices=['tiling', 'uniform', 'gamma'], metavar='tiling|uniform|gamma')
    parser.add_argument('--seed', type=int, help='Seed for random number generator [default: use python\'s default]', metavar='INT')
    # type=int added so the quality value is numeric, consistent with the
    # -q/--qual option of the to_fake_qual command
    parser.add_argument('--qual', type=int, help='Write a file of fake quality scores called outfile.qual, all bases same quality [%(default)s]', metavar='INT')
    parser.add_argument('--fixed_read_length', type=int, help='Length of each read. Only applies if method is tiling or uniform. [%(default)s]', default=20000, metavar='INT')
    parser.add_argument('--coverage', type=float, help='Read coverage. Only applies if method is gamma or uniform. [%(default)s]', default=2, metavar='FLOAT')


    tiling_group = parser.add_argument_group('tiling options')
    tiling_group.add_argument('--tile_step', type=int, help='Distance between start of each read [%(default)s]', default=10000, metavar='INT')

    gamma_group = parser.add_argument_group('gamma options')
    gamma_group.add_argument('--gamma_shape', type=float, help='Shape parameter of gamma distribution [%(default)s]', default=1.2, metavar='FLOAT')
    gamma_group.add_argument('--gamma_scale', type=float, help='Scale parameter of gamma distribution [%(default)s]', default=6000, metavar='FLOAT')
    gamma_group.add_argument('--gamma_min_length', type=int, help='Minimum read length [%(default)s]', default=20000, metavar='INT')

    ins_group = parser.add_argument_group('options to add insertions to reads')
    ins_group.add_argument('--ins_skip', type=int, help='Insert a random base every --skip bases plus or minus --ins_window. If this option is used, must also use --ins_window.', metavar='INT')
    ins_group.add_argument('--ins_window', type=int, help='See --ins_skip. If this option is used, must also use --ins_skip.', metavar='INT')


    options = parser.parse_args()
    tasks.make_long_reads(
        options.infile,
        options.outfile,
        method=options.method,
        fixed_read_length=options.fixed_read_length,
        coverage=options.coverage,
        tile_step=options.tile_step,
        gamma_shape=options.gamma_shape,
        gamma_scale=options.gamma_scale,
        gamma_min_length=options.gamma_min_length,
        seed=options.seed,
        ins_skip=options.ins_skip,
        ins_window=options.ins_window
    )

    # 'is not None' rather than truthiness so that --qual 0 still writes the file
    if options.qual is not None:
        tasks.fastaq_to_fake_qual(options.outfile, options.outfile + '.qual', q=options.qual)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the make_random_contigs command: parse options, delegate to tasks.'''
    parser = argparse.ArgumentParser(
        description = 'Makes a multi-FASTA file of random sequences, all of the same length. Each base has equal chance of being A,C,G or T',
        usage = 'fastaq make_random_contigs [options] <contigs> <length> <outfile>')
    parser.add_argument('--first_number', type=int, help='If numbering the sequences, the first sequence gets this number [%(default)s]', default=1)
    parser.add_argument('--name_by_letters', action='store_true', help='Name the contigs A,B,C,... will start at A again if you get to Z')
    parser.add_argument('--prefix', help='Prefix to add to start of every sequence name', default='')
    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None)
    # 'Number' fixes a typo ('Nunber') in the original help text
    parser.add_argument('contigs', type=int, help='Number of contigs to make')
    parser.add_argument('length', type=int, help='Length of each contig')
    parser.add_argument('outfile', help='Name of output file')
    options = parser.parse_args()
    tasks.make_random_contigs(
        options.contigs,
        options.length,
        options.outfile,
        name_by_letters=options.name_by_letters,
        prefix=options.prefix,
        seed=options.seed,
        first_number=options.first_number
    )
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the merge command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq merge [options] <infile> <outfile>',
        description=description)
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output file')
    p.add_argument('-n', '--name', help='Name of sequence in output file [%(default)s]', default='union')
    opts = p.parse_args()
    tasks.merge_to_one_seq(opts.infile, opts.outfile, seqname=opts.name)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the replace_bases command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq replace_bases <infile> <outfile> <old> <new>',
        description=description)
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output file')
    p.add_argument('old', help='Base to be replaced')
    p.add_argument('new', help='Replace with this letter')
    opts = p.parse_args()
    tasks.replace_bases(opts.infile, opts.outfile, opts.old, opts.new)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the reverse_complement command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq reverse_complement <infile> <outfile>',
        description=description)
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output file')
    opts = p.parse_args()
    tasks.reverse_complement(opts.infile, opts.outfile)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the scaffolds_to_contigs command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq scaffolds_to_contigs [options] <infile> <outfile>',
        description='Creates a file of contigs from a file of scaffolds - i.e. breaks at every gap in the input')
    p.add_argument('--number_contigs', action='store_true', help='Use this to enumerate contig names 1,2,3,... within each scaffold')
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output contigs file')
    opts = p.parse_args()
    tasks.scaffolds_to_contigs(opts.infile, opts.outfile, number_contigs=opts.number_contigs)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the search_for_seq command: parse options, delegate to tasks.'''
    parser = argparse.ArgumentParser(
        description = 'Searches for an exact match on a given string and its reverse complement, in every sequence of input sequence file. Case insensitive. Guaranteed to find all hits',
        usage = 'fastaq search_for_seq [options] <infile> <outfile> <search_string>')
    parser.add_argument('infile', help='Name of input file')
    # 'output file' fixes a typo ('outputfile') in the original help text
    parser.add_argument('outfile', help='Name of output file. Tab-delimited output: sequence name, position, strand')
    parser.add_argument('search_string', help='String to search for in the sequences')
    options = parser.parse_args()
    tasks.search_for_seq(options.infile, options.outfile, options.search_string)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the sequence_trim command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq sequence_trim [options] <infile_1> <infile_2> <outfile_1> <outfile_2> <trim_seqs>',
        description='Trims sequences off the start of all sequences in a pair of sequence files, whenever there is a perfect match. Only keeps a read pair if both reads of the pair are at least a minimum length after any trimming')
    p.add_argument('--min_length', type=int, help='Minimum length of output sequences [%(default)s]', default=50, metavar='INT')
    p.add_argument('--revcomp', action='store_true', help='Trim the end of each sequence if it matches the reverse complement. This option is intended for PCR primer trimming')
    p.add_argument('infile_1', help='Name of forward fasta/q file to be trimmed')
    p.add_argument('infile_2', help='Name of reverse fasta/q file to be trimmed')
    p.add_argument('outfile_1', help='Name of output forward fasta/q file')
    p.add_argument('outfile_2', help='Name of output reverse fasta/q file')
    p.add_argument('trim_seqs', help='Name of file of sequences to search for at the start of each input sequence')
    opts = p.parse_args()
    tasks.sequence_trim(
        opts.infile_1,
        opts.infile_2,
        opts.outfile_1,
        opts.outfile_2,
        opts.trim_seqs,
        min_length=opts.min_length,
        check_revcomp=opts.revcomp
    )
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    '''Entry point for the sort_by_size command: parse options, delegate to tasks.'''
    p = argparse.ArgumentParser(
        usage='fastaq sort_by_size [options] <infile> <outfile>',
        description=description)
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output file')
    p.add_argument('-r', '--reverse', action='store_true', help='Sort by shortest first instead of the default of longest first')
    opts = p.parse_args()
    tasks.sort_by_size(opts.infile, opts.outfile, smallest_first=opts.reverse)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq split_by_base_count` command: parses the
    command line and hands off to tasks.split_by_base_count."""
    cli = argparse.ArgumentParser(
        description='Splits a multi sequence file into separate files. Does not split sequences. Puts up to max_bases into each split file. The exception is that any sequence longer than max_bases is put into its own file.',
        usage='fastaq split_by_base_count [options] <infile> <outprefix> <max_bases>')
    cli.add_argument('infile', help='Name of input file to be split')
    cli.add_argument('outprefix', help='Name of output file')
    cli.add_argument('max_bases', type=int, help='Max bases in each output split file', metavar='max_bases')
    cli.add_argument('--max_seqs', type=int, help='Max number of sequences in each output split file [no limit]', metavar='INT')
    parsed = cli.parse_args()
    tasks.split_by_base_count(parsed.infile, parsed.outprefix, parsed.max_bases, parsed.max_seqs)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq strip_illumina_suffix` command: parses the
    command line and hands off to tasks.strip_illumina_suffix."""
    p = argparse.ArgumentParser(
        description=description,
        usage='fastaq strip_illumina_suffix <infile> <outfile>')
    p.add_argument('infile', help='Name of input file')
    p.add_argument('outfile', help='Name of output file')
    args = p.parse_args()
    tasks.strip_illumina_suffix(args.infile, args.outfile)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq to_fake_qual` command: parses the command
    line and hands off to tasks.fastaq_to_fake_qual."""
    cmdline = argparse.ArgumentParser(
        description=description,
        usage='fastaq to_fake_qual [options] <infile> <outfile>')
    cmdline.add_argument('infile', help='Name of input file')
    cmdline.add_argument('outfile', help='Name of output file')
    cmdline.add_argument('-q', '--qual', type=int, help='Quality score to assign to all bases [%(default)s]', default=40)
    opts = cmdline.parse_args()
    tasks.fastaq_to_fake_qual(opts.infile, opts.outfile, q=opts.qual)
16 |
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq to_fasta` command: parses the command line
    and hands off to tasks.to_fasta."""
    arg_parser = argparse.ArgumentParser(
        description=description,
        usage='fastaq to_fasta [options] <infile> <outfile>')
    arg_parser.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
    arg_parser.add_argument('outfile', help='Name of output file')
    arg_parser.add_argument('-l', '--line_length', type=int, help='Number of bases on each sequence line of output file. Set to zero for no linebreaks in sequences [%(default)s]', default=60)
    arg_parser.add_argument('-s', '--strip_after_whitespace', action='store_true', help='Remove everything after first whitespace in every sequence name')
    parsed = arg_parser.parse_args()
    tasks.to_fasta(parsed.infile, parsed.outfile, line_length=parsed.line_length, strip_after_first_whitespace=parsed.strip_after_whitespace)
19 |
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq to_mira_xml` command: parses the command
    line and hands off to tasks.fastaq_to_mira_xml."""
    p = argparse.ArgumentParser(
        description=description,
        usage='fastaq to_mira_xml <infile> <xml_out>')
    p.add_argument('infile', help='Name of input fasta/q file')
    p.add_argument('xml_out', help='Name of output xml file')
    args = p.parse_args()
    tasks.fastaq_to_mira_xml(args.infile, args.xml_out)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq to_orfs_gff` command: writes a GFF file of
    open reading frames found in the input sequence file."""
    parser = argparse.ArgumentParser(
        description = 'Writes a GFF file of open reading frames from a sequence file',
        usage = 'fastaq to_orfs_gff [options] <infile> <outfile>')
    parser.add_argument('--min_length', type=int, help='Minimum length of ORF, in nucleotides [%(default)s]', default=300, metavar='INT')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output GFF file')
    options = parser.parse_args()
    # Bug fix: the parser defines the positional argument 'outfile', but the
    # original code read the non-existent attribute 'options.gff_out', which
    # raised AttributeError on every invocation of this command.
    tasks.fastaq_to_orfs_gff(options.infile, options.outfile, min_length=options.min_length)
0 | import argparse | |
1 | import random | |
2 | from math import floor, ceil | |
3 | import sys | |
4 | from pyfastaq import sequences, utils | |
5 | ||
def run(description):
    '''Command-line entry point: simulates perfect (error-free) paired reads
    from each sequence in the input file and writes them as one interleaved
    FASTQ file. Insert sizes are drawn from a normal distribution, fragment
    positions from a uniform distribution; pairs are innies (read 2 is
    reverse-complemented before output). All bases get quality character 'I'.'''
    parser = argparse.ArgumentParser(
        description = 'Makes perfect paired end fastq reads from a sequence file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved FASTQ file.',
        usage = 'fastaq to_perfect_reads [options] <infile> <outfile> <mean insert size> <insert std deviation> <mean coverage> <read length>')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
    parser.add_argument('insert_std', type=float, help='Standard devation of insert size', metavar='insert std deviation')
    parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
    parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
    parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
    parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
    options = parser.parse_args()

    # seed=None keeps python's default (non-deterministic) seeding
    random.seed(a=options.seed)

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)
    # counter is global across all reference sequences; used in read names
    pair_counter = 1

    if options.fragments:
        fout_frags = utils.open_file_write(options.fragments)

    for ref in seq_reader:
        # check if current seq is long enough
        if len(ref) < options.mean_insert + 4 * options.insert_std:
            print('Warning, sequence ', ref.id, ' too short. Skipping it...', file=sys.stderr)
            continue

        # work out how many reads to simulate
        read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)

        # it's possible that we pick the same fragment twice, in which case the
        # reads would get the same name. So remember the frag coords
        used_fragments = {} # (middle_position, length) => count

        # do the simulation: pick insert size from normal distribution, and
        # position in genome from uniform distribution
        x = 0
        while x < read_pairs:
            isize = int(random.normalvariate(options.mean_insert, options.insert_std))
            # resample until the insert both fits in the reference and can hold a read
            while isize > len(ref) or isize < options.readlength:
                isize = int(random.normalvariate(options.mean_insert, options.insert_std))
            middle_pos = random.randint(ceil(0.5 *isize), floor(len(ref) - 0.5 * isize))
            read_start1 = int(middle_pos - ceil(0.5 * isize))
            read_start2 = read_start1 + isize - options.readlength

            # read name encodes reference id, pair number, and 1-based start of each mate
            readname = ':'.join([ref.id, str(pair_counter), str(read_start1+1), str(read_start2+1)])

            # disambiguate names when the identical fragment is picked more than once
            fragment = (middle_pos, isize)
            if fragment in used_fragments:
                used_fragments[fragment] += 1
                readname += '.dup.' + str(used_fragments[fragment])
            else:
                used_fragments[fragment] = 1

            read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
            read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)


            # NOTE: rejected pairs don't increment x, so coverage is preserved
            if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
                continue

            # innie orientation: mate 2 is reported on the reverse strand
            read2.revcomp()

            print(read1, file=fout)
            print(read2, file=fout)

            if options.fragments:
                # fragment = both reads plus the sequence between them
                frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
                print(frag, file=fout_frags)

            pair_counter += 1
            x += 1

    utils.close(fout)
    if options.fragments:
        utils.close(fout_frags)
0 | import argparse | |
1 | import sys | |
2 | import random | |
3 | from pyfastaq import sequences, utils | |
4 | ||
def run(description):
    '''Command-line entry point for `fastaq to_random_subset`.

    Keeps each read (or read pair, when --mate_file is given) with the
    probability given by the positional `percent` argument. When a mates
    file is used, kept pairs are written interleaved to the output file.'''
    parser = argparse.ArgumentParser(
        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
                      'from a mates file. Output is interleaved if mates file given',
        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
    parser.add_argument('--mate_file', help='Name of mates file')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('percent', type=int, help='Per cent probability of keeping any given read (pair) in [0,100]', metavar='INT')
    options = parser.parse_args()

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)

    if options.mate_file:
        mate_seq_reader = sequences.file_reader(options.mate_file)

    for seq in seq_reader:
        if options.mate_file:
            # the mate is always consumed, kept or not, so the two files stay in sync
            try:
                mate_seq = next(mate_seq_reader)
            except StopIteration:
                print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
                sys.exit(1)
        # Bug fix: the original used random.randint(0, 100) <= percent, which
        # draws from 101 equally-likely values, so the keep probability was
        # (percent+1)/101 - e.g. percent=0 still kept ~1% of reads.
        # randint(1, 100) <= percent keeps exactly percent/100 as documented.
        if random.randint(1, 100) <= options.percent:
            print(seq, file=fout)
            if options.mate_file:
                print(mate_seq, file=fout)

    utils.close(fout)
0 | import argparse | |
1 | import sys | |
2 | import os | |
3 | from pyfastaq import sequences, utils | |
4 | ||
def run(description):
    '''Command-line entry point: writes a BAM of perfect single-end reads that
    tile every sequence in the input file, by piping SAM text through
    "samtools view". Requires samtools to be available on $PATH.'''
    parser = argparse.ArgumentParser(
        description = 'Takes a sequence file. Makes a BAM file containing perfect (unpaired) reads tiling the whole genome',
        usage = 'fastaq to_tiling_bam [options] <infile> <read_length> <read_step> <read_prefix> <outfile>',
        epilog = 'Important: assumes that samtools is in your path')
    parser.add_argument('infile', help='Name of input fasta/q file')
    parser.add_argument('read_length', type=int, help='Length of reads')
    parser.add_argument('read_step', type=int, help='Distance between start of each read')
    parser.add_argument('read_prefix', help='Prefix of read names')
    parser.add_argument('outfile', help='Name of output BAM file')
    parser.add_argument('--read_group', help='Add the given read group ID to all reads [%(default)s]' ,default='42')
    options = parser.parse_args()

    # make a header first - we need to add the @RG line to the default header made by samtools
    tmp_empty_file = options.outfile + '.tmp.empty'
    f = utils.open_file_write(tmp_empty_file)
    utils.close(f)
    try:
        # viewing an empty SAM with -T <ref> emits just the header (@SQ lines) for infile
        f = os.popen('samtools view -H -T ' + options.infile + ' ' + tmp_empty_file)
    except IOError:
        print('Error making tmp header file', file=sys.stderr)
        sys.exit(1)

    header_lines = f.readlines()
    # appended line has no trailing newline; print() below supplies one
    header_lines.append('@RG\tID:' + options.read_group + '\tSM:FAKE')
    f.close()
    os.unlink(tmp_empty_file)

    seq_reader = sequences.file_reader(options.infile)
    try:
        # pipe the SAM text we generate into samtools to compress it to BAM on the fly
        f = os.popen('samtools view -hbS - > ' + options.outfile, 'w')
    except IOError:
        print("Error opening for writing BAM file '" + options.outfile + "'", file=sys.stderr)
        sys.exit(1)

    print(''.join(header_lines), file=f)

    for seq in seq_reader:
        end_range = len(seq)
        if len(seq) < options.read_length:
            # sequence shorter than a read: still emit one truncated read covering it
            end_range = 1
        for i in range(0, end_range, options.read_step):
            if len(seq) <= options.read_length:
                start = 0
                end = len(seq) - 1
            else:
                start = i
                end = start + options.read_length - 1

                # clamp the final read so it ends exactly on the last base
                if end > len(seq) - 1:
                    end = len(seq) - 1
                    start = end - options.read_length + 1

            read = sequences.Fastq(options.read_prefix + ':' + seq.id + ':' + str(start + 1) + ':' + str(end + 1), seq[start:end+1], 'I' * (end - start + 1))

            # SAM record: flag 0 (forward, mapped), MAPQ 60, all-match CIGAR,
            # mate fields unset, plus the requested RG tag
            print ('\t'.join([read.id,
                             '0',
                             seq.id,
                             str(start + 1),
                             '60',
                             str(len(read)) + 'M',
                             '*',
                             '*',
                             '*',
                             read.seq,
                             read.qual,
                             'RG:Z:' + options.read_group]), file=f)

            if end == len(seq) - 1:
                break

    f.close()
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq to_unique_by_id` command: parses the
    command line and hands off to tasks.to_unique_by_id."""
    cli = argparse.ArgumentParser(
        description='Removes duplicate sequences from input file, based on their names. If the same name is found more than once, then the longest sequence is kept. Order of sequences is preserved in output',
        usage='fastaq to_unique_by_id <infile> <outfile>')
    cli.add_argument('infile', help='Name of input file')
    cli.add_argument('outfile', help='Name of output file')
    opts = cli.parse_args()
    tasks.to_unique_by_id(opts.infile, opts.outfile)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq translate` command: parses the command line
    and hands off to tasks.translate."""
    p = argparse.ArgumentParser(
        description='Translates all sequences in input file. Output is always FASTA format',
        usage='fastaq translate [options] <infile> <outfile>')
    p.add_argument('--frame', type=int, choices=[0,1,2], help='Frame to translate [%(default)s]', default=0)
    p.add_argument('infile', help='Name of file to be translated')
    p.add_argument('outfile', help='Name of output FASTA file')
    args = p.parse_args()
    tasks.translate(args.infile, args.outfile, frame=args.frame)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq trim_Ns_at_end` command: parses the command
    line and hands off to tasks.trim_Ns_at_end."""
    arg_parser = argparse.ArgumentParser(
        description='Trims any Ns off each sequence in input file. Does nothing to gaps in the middle, just trims the ends',
        usage='fastaq trim_Ns_at_end <infile> <outfile>')
    arg_parser.add_argument('infile', help='Name of input file')
    arg_parser.add_argument('outfile', help='Name of output file')
    parsed = arg_parser.parse_args()
    tasks.trim_Ns_at_end(parsed.infile, parsed.outfile)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq trim_contigs` command: parses the command
    line and hands off to tasks.trim_contigs."""
    cmdline = argparse.ArgumentParser(
        description='Trims a set number of bases off the end of every contig, so gaps get bigger and contig ends are removed. Bases are replaced with Ns. Any sequence that ends up as all Ns is lost',
        usage='fastaq trim_contigs [options] <infile> <outfile>')
    cmdline.add_argument('--trim_number', type=int, help='Number of bases to trim around each gap, and off ends of each sequence [%(default)s]', default=100)
    cmdline.add_argument('infile', help='Name of input file')
    cmdline.add_argument('outfile', help='Name of output file')
    opts = cmdline.parse_args()
    tasks.trim_contigs(opts.infile, opts.outfile, opts.trim_number)
0 | import argparse | |
1 | from pyfastaq import tasks | |
2 | ||
def run(description):
    """Entry point for the `fastaq trim_ends` command: parses the command line
    and hands off to tasks.trim."""
    p = argparse.ArgumentParser(
        description=description,
        usage='fastaq trim_ends <infile> <bases off start> <bases off end> <outfile>')
    p.add_argument('infile', help='Name of input file')
    p.add_argument('start_trim', type=int, help='Number of bases to trim off start')
    p.add_argument('end_trim', type=int, help='Number of bases to trim off end')
    p.add_argument('outfile', help='Name of output file')
    args = p.parse_args()
    tasks.trim(args.infile, args.outfile, args.start_trim, args.end_trim)
0 | import re | |
1 | import string | |
2 | import random | |
3 | import itertools | |
4 | ||
5 | from pyfastaq import utils, intervals, genetic_codes | |
6 | ||
# Base exception class for all errors raised by this module
class Error (Exception): pass


# python 3's seek is glacially slow. When we read a fasta file, we know
# we've reached the end of a sequence when we get a new line starting with
# '>'. Instead of using seek and tell, we just remember the previous line
# of the file, for any given filehandle
previous_lines = {}


# codon string (upper case) -> amino acid lookup, using code table 1 from the
# genetic_codes module (presumably the standard genetic code - confirm there)
codon2aa = genetic_codes.codes[1]

# IUPAC ambiguity codes mapped to the unambiguous bases each can stand for;
# used by Fasta.expand_nucleotides()
redundant_nts = {
    'R': ('A', 'G'),
    'Y': ('C', 'T'),
    'S': ('C', 'G'),
    'W': ('A', 'T'),
    'K': ('G', 'T'),
    'M': ('A', 'C'),
    'B': ('C', 'G', 'T'),
    'D': ('A', 'G', 'T'),
    'H': ('A', 'C', 'T'),
    'V': ('A', 'C', 'G'),
    'N': ('A', 'C', 'G', 'T')
}
32 | ||
def file_reader(fname, read_quals=False):
    '''Iterates over a sequence file, yielding the next sequence in the file
    until there are no more sequences. The format (FASTA, FASTQ, GFF3 with
    embedded FASTA, EMBL, GenBank or Phylip) is detected from the first line.
    If read_quals is True, the file is treated as a .qual file of
    whitespace-separated quality scores.'''
    f = utils.open_file_read(fname)
    line = f.readline()
    # phylip header: two integers (number of sequences, bases per sequence)
    phylip_regex = re.compile('^\s*[0-9]+\s+[0-9]+$')
    gbk_regex = re.compile('^LOCUS\s+\S')

    if line.startswith('>'):
        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('##gff-version 3'):
        seq = Fasta()
        # if a GFF file, need to skip past all the annotation
        # and get to the fasta sequences at the end of the file
        while not line.startswith('>'):
            line = f.readline()
            if not line:
                utils.close(f)
                raise Error('No sequences found in GFF file "' + fname + '"')

        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('ID   ') and line[5] != ' ':
        # EMBL format: "ID   name; ..." - handled by the Embl class
        seq = Embl()
        previous_lines[f] = line
    elif gbk_regex.search(line):
        # GenBank files are also parsed by the Embl class (same reader logic)
        seq = Embl()
        previous_lines[f] = line
    elif line.startswith('@'):
        seq = Fastq()
        previous_lines[f] = line
    elif phylip_regex.search(line):
        # phylip format could be interleaved or not, need to look at next
        # couple of lines to figure that out. Don't expect these files to
        # be too huge, so just store all the sequences in memory
        number_of_seqs, bases_per_seq = line.strip().split()
        number_of_seqs = int(number_of_seqs)
        bases_per_seq = int(bases_per_seq)
        got_blank_line = False

        first_line = line
        seq_lines = []
        while 1:
            line = f.readline()
            if line == '':
                break
            elif line == '\n':
                got_blank_line = True
            else:
                seq_lines.append(line.rstrip())
        utils.close(f)

        if len(seq_lines) == 1 or len(seq_lines) == number_of_seqs:
            sequential = True
        elif seq_lines[0][10] != ' ' and seq_lines[1][10] == ' ':
            sequential = True
        else:
            sequential = False

        # if the 11th char of second sequence line is a space, then the file is sequential, e.g.:
        # GAGCCCGGGC AATACAGGGT AT
        # as opposed to:
        # Salmo gairAAGCCTTGGC AGTGCAGGGT
        if sequential:
            current_id = None
            current_seq = ''
            for line in seq_lines:
                # a full-length (or empty) current_seq means this line starts a new record:
                # first 10 chars are the name, the rest is sequence
                if len(current_seq) == bases_per_seq or len(current_seq) == 0:
                    if current_id is not None:
                        yield Fasta(current_id, current_seq.replace('-', ''))
                    current_seq = ''
                    current_id, new_bases = line[0:10].rstrip(), line.rstrip()[10:]
                else:
                    new_bases = line.rstrip()

                current_seq += new_bases.replace(' ','')

            # don't forget the final record
            yield Fasta(current_id, current_seq.replace('-', ''))
        else:
            # seaview files start all seqs at pos >=12. Other files start
            # their sequence at the start of the line
            if seq_lines[number_of_seqs + 1][0] == ' ':
                # find where the run of spaces after the name ends; that column
                # is where the sequence begins on every line
                first_gap_pos = seq_lines[0].find(' ')
                end_of_gap = first_gap_pos
                while seq_lines[0][end_of_gap] == ' ':
                    end_of_gap += 1
                first_seq_base = end_of_gap
            else:
                first_seq_base = 10

            # first block of lines holds the names; later blocks are appended
            # to the sequences round-robin (interleaved layout)
            seqs = []
            for i in range(number_of_seqs):
                name, bases = seq_lines[i][0:first_seq_base].rstrip(), seq_lines[i][first_seq_base:]
                seqs.append(Fasta(name, bases))

            for i in range(number_of_seqs, len(seq_lines)):
                seqs[i%number_of_seqs].seq += seq_lines[i]

            for fa in seqs:
                fa.seq = fa.seq.replace(' ','').replace('-','')
                yield fa

        return
    elif line == '':
        # empty file: nothing to yield
        utils.close(f)
        return
    else:
        utils.close(f)
        raise Error('Error determining file type from file "' + fname + '". First line is:\n' + line.rstrip())

    # non-phylip formats: delegate record-by-record reading to the class
    # chosen above, which uses previous_lines[f] to remember the lookahead line
    try:
        while seq.get_next_from_file(f, read_quals):
            yield seq
    finally:
        utils.close(f)
148 | ||
149 | ||
class Fasta:
    '''Class to store and manipulate FASTA sequences. They have two things: a name and a sequence'''
    # this defines the line length when printing sequences
    line_length = 60

    def _get_id_from_header_line(self, line):
        # Extracts the sequence name from a '>' header line (everything after the '>').
        if line.startswith('>'):
            return line.rstrip()[1:]
        else:
            # NOTE(review): the comma passes the offending line as a second
            # positional argument to Error, not as part of the message string
            raise Error('Error! expected line starting with ">", but got this:\n', line)


    def __init__(self, id_in=None, seq_in=None):
        # id is the sequence name; seq is the sequence stored as a plain string
        self.id = id_in
        self.seq = seq_in

    def __eq__(self, other):
        # equal only when exactly the same class with the same name and sequence
        # (Embl overrides this to compare across Fasta/Embl)
        return type(other) is type(self) and self.__dict__ == other.__dict__

    def __ne__(self, other):
        return not self.__eq__(other)

    def __len__(self):
        return len(self.seq)

    def subseq(self, start, end):
        '''Returns Fasta object with the same name, of the bases from start to end, but not including end'''
        return Fasta(self.id, self.seq[start:end])

    def split_capillary_id(self):
        '''Gets the prefix and suffix of a name of a capillary read, e.g. xxxxx.p1k or xxxx.q1k. Returns a dictionary with keys 'prefix', 'dir' (one of 'fwd', 'rev', 'unk') and 'suffix' '''
        try:
            a = self.id.rsplit('.', 1)
            if a[1].startswith('p'):
                dir = 'fwd'
            elif a[1].startswith('q'):
                dir = 'rev'
            else:
                dir = 'unk'

            return {'prefix': a[0], 'dir': dir, 'suffix':a[1]}
        except:
            # NOTE(review): bare except - any failure (e.g. no '.' in the name)
            # is reported as a malformed capillary id, hiding unexpected errors
            raise Error('Error in split_capillary_id() on ID', self.id)

    def expand_nucleotides(self):
        '''Assumes sequence is nucleotides. Returns list of all combinations of redundant nucleotides. e.g. R is A or G, so CRT would have combinations CAT and CGT'''
        s = list(self.seq)
        for i in range(len(s)):
            if s[i] in redundant_nts:
                s[i] = ''.join(redundant_nts[s[i]])

        # itertools.product over the per-position choices gives every combination;
        # each expanded sequence is named original_id.1, .2, ...
        seqs = []
        for x in itertools.product(*s):
            seqs.append(Fasta(self.id + '.' + str(len(seqs) + 1), ''.join(x)))
        return seqs

    def strip_after_first_whitespace(self):
        '''Removes everything in the name after the first whitespace character'''
        self.id = self.id.split()[0]

    def strip_illumina_suffix(self):
        '''Removes any trailing /1 or /2 off the end of the name'''
        if self.id.endswith('/1') or self.id.endswith('/2'):
            self.id = self.id[:-2]

    def revcomp(self):
        '''Reverse complements the sequence'''
        # only ACGT/acgt are complemented by this table; other characters
        # (N, ambiguity codes) are left as-is, though still reversed
        self.seq = self.seq.translate(str.maketrans("ATCGatcg", "TAGCtagc"))[::-1]

    def is_all_Ns(self, start=0, end=None):
        '''Returns true if the sequence is all Ns (upper or lower case). start and end are inclusive, zero-based; end=None means to the end of the sequence'''
        if end is not None:
            if start > end:
                raise Error('Error in is_all_Ns. Start coord must be <= end coord')
            end += 1  # make the inclusive end usable as a slice bound
        else:
            end = len(self)

        if len(self) == 0:
            return False
        else:
            # all Ns <=> no character other than N/n in the window
            return re.search('[^Nn]', self.seq[start:end]) is None

    def trim_Ns(self):
        '''Removes any leading or trailing N or n characters from the sequence'''
        self.seq = self.seq.strip('Nn')

    def add_insertions(self, skip=10, window=1, test=False):
        '''Adds a random base within window bases around every skip bases. e.g. skip=10, window=1 means a random base added somewhere in the intervals [9,11], [19,21] ... '''
        assert 2 * window < skip
        new_seq = list(self.seq)
        # walk right-to-left so earlier insertions don't shift later positions
        for i in range(len(self) - skip, 0, -skip):
            pos = random.randrange(i - window, i + window + 1)
            base = random.choice(['A', 'C', 'G', 'T'])
            if test:
                # deterministic base makes the insertions easy to spot in tests
                base = 'N'
            new_seq.insert(pos, base)

        self.seq = ''.join(new_seq)

    def replace_bases(self, old, new):
        '''Replaces all occurences of 'old' with 'new' '''
        self.seq = self.seq.replace(old, new)

    def replace_interval(self, start, end, new):
        '''Replaces the sequence from start to end with the sequence "new". start and end are inclusive, zero-based'''
        if start > end or start > len(self) - 1 or end > len(self) - 1:
            raise Error('Error replacing bases ' + str(start) + '-' + str(end) + ' in sequence ' + self.id)

        self.seq = self.seq[0:start] + new + self.seq[end + 1:]

    def gaps(self, min_length = 1):
        '''Finds the positions of all gaps in the sequence that are at least min_length long. Returns a list of Intervals. Coords are zero-based'''
        gaps = []
        regex = re.compile('N+', re.IGNORECASE)
        for m in regex.finditer(self.seq):
            # NOTE(review): m.span()[1] is exclusive, so the true run length is
            # span[1] - span[0]; the extra "+ 1" means runs one base shorter
            # than min_length also pass - confirm whether that is intended
            if m.span()[1] - m.span()[0] + 1 >= min_length:
                gaps.append(intervals.Interval(m.span()[0], m.span()[1] - 1))
        return gaps

    def contig_coords(self):
        '''Finds coords of contigs, i.e. everything that's not a gap (N or n). Returns a list of Intervals. Coords are zero-based'''
        # contigs are the opposite of gaps, so work out the coords from the gap coords
        gaps = self.gaps()

        if len(gaps) == 0:
            return [intervals.Interval(0, len(self) - 1)]

        # collect alternating contig start/end boundary positions
        coords = [0]
        for g in gaps:
            if g.start == 0:
                # gap at the very start: first contig begins after it
                coords = [g.end + 1]
            else:
                coords += [g.start - 1, g.end + 1]

        if coords[-1] < len(self):
            coords.append(len(self) - 1)

        # pair the boundaries up into (start, end) Intervals
        return [intervals.Interval(coords[i], coords[i+1]) for i in range(0, len(coords)-1,2)]




    def orfs(self, frame=0, revcomp=False):
        # Returns a list of Intervals of ORFs in the given frame, with
        # coordinates in nucleotides on the forward strand. If revcomp is True,
        # the ORFs are found on the reverse strand instead.
        assert frame in [0,1,2]
        if revcomp:
            self.revcomp()

        # translate, dropping trailing unknown residues (partial codons)
        aa_seq = self.translate(frame=frame).seq.rstrip('X')
        if revcomp:
            self.revcomp()  # restore the original orientation

        # _orfs_from_aa_seq is defined elsewhere in this module
        orfs = _orfs_from_aa_seq(aa_seq)
        for i in range(len(orfs)):
            # map amino acid coords back to nucleotide coords on the forward strand
            if revcomp:
                start = len(self) - (orfs[i].end * 3 + 3) - frame
                end = len(self) - (orfs[i].start * 3) - 1 - frame
            else:
                start = orfs[i].start * 3 + frame
                end = orfs[i].end * 3 + 2 + frame

            orfs[i] = intervals.Interval(start, end)

        return orfs


    def all_orfs(self, min_length=300):
        # Returns a sorted list of (Interval, on_reverse_strand) tuples for all
        # ORFs of at least min_length nucleotides, over all six frames.
        orfs = []
        for frame in [0,1,2]:
            for revcomp in [False, True]:
                orfs.extend([(t, revcomp) for t in self.orfs(frame=frame, revcomp=revcomp) if len(t)>=min_length])

        return sorted(orfs, key=lambda t:t[0])

    # Fills the object with the next sequence in the file. Returns
    # True if this was successful, False if no more sequences in the file.
    # If reading a file of quality scores, set read_quals = True
    def get_next_from_file(self, f, read_quals=False):
        if f in previous_lines:
            # the header line was already consumed by the previous record (or file_reader)
            if previous_lines[f] == None:
                # previous record hit end of file
                self.id = self.seq = None
                return False
            else:
                self.id = self._get_id_from_header_line(previous_lines[f])
        else:
            # first read from this filehandle: skip blank lines to the header
            line = '\n'
            while line == '\n':
                line = f.readline()
            self.id = self._get_id_from_header_line(line)

        self.seq = ''
        seq_lines = [] # much faster to store the seq lines in an array,
                       # then join at the end

        while 1:
            line = f.readline()

            if line.startswith('>'):
                # start of the next record: remember its header for the next call
                previous_lines[f] = line.rstrip()
                break
            elif line == '':
                previous_lines[f] = None
                break
            else:
                seq_lines.append(line.rstrip())

        if read_quals:
            # quality scores are whitespace-separated numbers - keep separators
            self.seq = ' '.join(seq_lines)
        else:
            self.seq = ''.join(seq_lines)
        return True

    def __str__(self):
        if Fasta.line_length == 0:
            return '>' + self.id + '\n' + self.seq
        else:
            # wrap the sequence every line_length characters
            return '>' + self.id + '\n' + '\n'.join(self.seq[i:i+Fasta.line_length] for i in range(0, len(self), Fasta.line_length))

    def __getitem__(self, index):
        return self.seq[index]

    def trim(self, start, end):
        '''Removes first 'start'/'end' bases off the start/end of the sequence'''
        self.seq = self.seq[start:len(self.seq) - end]

    # qual_scores should be a list of quality scores
    def to_Fastq(self, qual_scores):
        '''Returns a Fastq object. qual_scores expected to be a list of numbers, like you would get in a .qual file'''
        if len(self) != len(qual_scores):
            raise Error('Error making Fastq from Fasta, lengths differ.', self.id)
        # clamp each score to [0, 93] and encode as Sanger (+33 offset) characters
        return Fastq(self.id, self.seq, ''.join([chr(max(0, min(x, 93)) + 33) for x in qual_scores]))

    def search(self, search_string):
        '''Finds every occurence (including overlapping ones) of the search_string, including on the reverse strand. Returns a list where each element is a tuple (position, strand) where strand is in ['-', '+']. Positions are zero-based'''
        seq = self.seq.upper()
        search_string = search_string.upper()
        pos = 0
        found = seq.find(search_string, pos)
        hits = []

        # forward strand: advance one base past each hit so overlapping matches are found
        while found != -1:
            hits.append((found, '+'))
            pos = found + 1
            found = seq.find(search_string, pos)


        # reverse strand: search for the reverse complement of the query
        # instead of reverse complementing the (possibly huge) sequence
        pos = 0
        search_string = Fasta('x', search_string)
        search_string.revcomp()
        search_string = search_string.seq
        found = seq.find(search_string, pos)

        while found != -1:
            hits.append((found, '-'))
            pos = found + 1
            found = seq.find(search_string, pos)

        return hits

    def translate(self, frame=0):
        '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
        # codons not in the lookup table (including partial trailing codons)
        # translate to 'X'.
        # NOTE(review): the range stop of len(self)-1-frame looks suspicious -
        # for frame=2 it can drop the final complete codon (e.g. length 11,
        # start 8). orfs() strips trailing 'X' anyway, so confirm the intent
        # before changing this.
        return Fasta(self.id, ''.join([codon2aa.get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
413 | ||
class Embl(Fasta):
    '''Exactly the same as Fasta, but reading seqs from a file works differently'''
    def __eq__(self, other):
        # An Embl and a plain Fasta with the same id/seq compare equal, in either direction
        return type(other) in [Fasta, Embl] and type(self) in [Fasta, Embl] and self.__dict__ == other.__dict__

    def _get_id_from_header_line(self, line):
        # EMBL records start "ID   name; ..."; GenBank records start "LOCUS name ..."
        # The line[5] != ' ' check presumably rejects header lines with a blank
        # name field -- TODO confirm against the EMBL flat file spec
        if line.startswith('ID ') and line[5] != ' ':
            return line.split()[1].rstrip(';')
        elif line.startswith('LOCUS'):
            return line.split()[1]
        else:
            raise Error('Error! expected line starting with "ID" or "LOCUS", but got this:\n', line)

    def get_next_from_file(self, f, read_quals=False):
        '''Reads the next EMBL/GenBank record from file handle f into this object.
        Returns True on success, False at end of file. read_quals is unused
        (present for interface compatibility with the other sequence classes).'''
        # previous_lines (module-level dict keyed by file handle) caches the
        # header line of the next record, read past while finishing the previous one
        if f in previous_lines:
            line = ''
            if previous_lines[f] == None:
                # a previous call already hit end of file
                self.id = self.seq = None
                return False
            else:
                self.id = self._get_id_from_header_line(previous_lines[f])
        else:
            # skip blank lines, then parse the header
            line = '\n'
            while line == '\n':
                line = f.readline()
            self.id = self._get_id_from_header_line(line)

        self.seq = ''
        seq_lines = []

        # skip forward to the start of the sequence block
        # ('SQ' for EMBL, 'ORIGIN' for GenBank)
        while not (line.startswith('SQ') or line.rstrip() == 'ORIGIN'):
            line = f.readline()
            if line == '':
                raise Error('Error! No SQ or ORIGIN line found for sequence ' + self.id)

        line = f.readline()

        # sequence lines are indented; strip coordinate numbers and whitespace,
        # stopping at the '//' record terminator
        while not line.startswith('//'):
            if line == '' or line[0] != ' ':
                raise Error('Error! Did not find end of sequence ' + self.id)
            seq_lines.append(''.join(line.rstrip().strip(' 0123456789').split()))
            line = f.readline()


        # scan ahead for the next record's header (or EOF) and cache it
        # in previous_lines for the next call
        while 1:
            if line.startswith('ID') or line.startswith('LOCUS'):
                previous_lines[f] = line.rstrip()
                break
            elif line == '':
                previous_lines[f] = None
                break

            line = f.readline()

        self.seq = ''.join(seq_lines)
        return True
470 | ||
class Fastq(Fasta):
    '''Class to store and manipulate FASTQ sequences. They have three things: a name, sequence and string of quality scores'''
    def __init__(self, id_in=None, seq_in=None, qual_in=None):
        super().__init__(id_in, seq_in)
        self.qual = qual_in
        # NOTE(review): if exactly one of seq/qual is None, len(None) below
        # raises TypeError rather than Error -- confirm that case cannot occur
        if (not self.seq == self.qual == None) and len(self.qual) != len(self.seq):
            raise Error('Error constructing Fastq. Mismatch in sequence and quality length\n' + str(self))

    def __str__(self):
        # standard 4-line fastq record; sequence and quality are never line-wrapped
        return '@' + self.id + '\n' + self.seq + '\n+\n' + self.qual

    def __eq__(self, other):
        # strict type equality: a Fastq never equals a plain Fasta
        return type(other) is type(self) and self.__dict__ == other.__dict__

    def subseq(self, start, end):
        '''Returns Fastq object with the same name, of the bases from start to end, but not including end'''
        return Fastq(self.id, self.seq[start:end], self.qual[start:end])

    def get_next_from_file(self, f, read_quals=False):
        '''Reads the next 4-line fastq record from file handle f into this object.
        Returns True on success, False at end of file; raises Error on a
        malformed record. read_quals is unused (interface compatibility).'''
        # previous_lines caches a line that an earlier reader read past on this handle
        if f in previous_lines:
            line = previous_lines[f]
            del previous_lines[f]
        else:
            line = f.readline()

        while line == '\n':
            line = f.readline()

        if not line:
            # NOTE(review): rebinding the local name 'self' is a no-op for the
            # caller -- the object keeps whatever id/seq/qual it already had.
            # Kept as-is to preserve behaviour.
            self = Fastq('', '', '')
            return False

        if not line.startswith('@'):
            raise Error('Error getting next sequence from fastq file. Got line:\n' + line)

        self.id = line.rstrip()[1:]
        line = f.readline()
        if not line:
            raise Error('Error getting next sequence from fastq file, sequence has ID ' + self.id)

        self.seq = line.strip()

        # '+' separator line; any text after the '+' is ignored
        line = f.readline()
        if not (line and line.startswith('+')):
            raise Error('Error getting next sequence from fastq file, no line starting with +, sequence has ID ' + self.id)

        line = f.readline()
        if not line:
            raise Error('Error getting next sequence from fastq file, sequence has ID ' + self.id)

        self.qual = line.rstrip()
        return True

    def revcomp(self):
        '''Reverse complements the sequence'''
        super().revcomp()
        # the quality string is simply reversed to stay aligned with the bases
        self.qual = self.qual[::-1]

    def trim(self, start, end):
        '''Removes first 'start'/'end' bases off the start/end of the sequence'''
        super().trim(start, end)
        self.qual = self.qual[start:len(self.qual) - end]

    def to_Fasta_and_qual(self):
        '''Returns a (Fasta, list of int scores) pair, decoding Phred+33 qualities'''
        quals = [ord(x) - 33 for x in self.qual]
        return (Fasta(self.id, self.seq), quals)

    def expand_nucleotides(self):
        '''Returns a list of Fastqs, one per unambiguous version of this sequence'''
        # NOTE(review): each expanded read reuses this read's full quality
        # string -- assumes expansion preserves sequence length; confirm
        return [Fastq(x.id, x.seq, self.qual) for x in super().expand_nucleotides()]

    def trim_Ns(self):
        '''Removes any leading or trailing N or n characters from the sequence'''
        # get index of first base that is not an N
        i = 0
        while i < len(self) and self.seq[i] in 'nN':
            i += 1

        # strip off start of sequence and quality
        self.seq = self.seq[i:]
        self.qual = self.qual[i:]

        # strip the ends
        self.seq = self.seq.rstrip('Nn')
        self.qual = self.qual[:len(self.seq)]

    def replace_interval(self, start, end, new, qual_string):
        '''Replaces the sequence from start to end with the sequence "new"'''
        if len(new) != len(qual_string):
            raise Error('Length of new seq and qual string in replace_interval() must be equal. Cannot continue')
        super().replace_interval(start, end, new)
        # end is inclusive, matching the superclass behaviour
        self.qual = self.qual[0:start] + qual_string + self.qual[end + 1:]

    def translate(self):
        '''Returns a Fastq of this sequence translated into amino acids (frame 0),
        with every quality character set to 'I' '''
        fa = super().translate()
        return Fastq(fa.id, fa.seq, 'I'*len(fa.seq))
567 | ||
568 | ||
def _orfs_from_aa_seq(seq):
    '''Given an amino acid string, returns a list of intervals.Interval objects,
    one per open reading frame, splitting at stop codons ('*')'''
    orfs = []
    start = 0
    while start < len(seq):
        stop = seq.find('*', start)
        if stop == -1:
            # no more stops: the final ORF runs to the end of the sequence
            orfs.append(intervals.Interval(start, len(seq) - 1))
            break
        if stop > start:
            # the interval includes the stop position itself
            orfs.append(intervals.Interval(start, stop))
        start = stop + 1
    return orfs
0 | import re | |
1 | import sys | |
2 | import copy | |
3 | import random | |
4 | import numpy | |
5 | from pyfastaq import sequences, utils, caf | |
6 | ||
7 | class Error (Exception): pass | |
8 | ||
def caf_to_fastq(infile, outfile, min_length=0, trim=False):
    '''Convert a CAF file to fastq. Reads shorter than min_length are not output. If clipping information is in the CAF file (with a line Clipping QUAL ...) and trim=True, then trim the reads'''
    fout = utils.open_file_write(outfile)

    for record in caf.file_reader(infile):
        if trim:
            if record.clip_start is None or record.clip_end is None:
                print('Warning: no clipping info for sequence', record.id, file=sys.stderr)
            else:
                clip = slice(record.clip_start, record.clip_end + 1)
                record.seq.seq = record.seq.seq[clip]
                record.seq.qual = record.seq.qual[clip]

        if len(record.seq) >= min_length:
            print(record.seq, file=fout)

    utils.close(fout)
27 | ||
28 | ||
def capillary_to_pairs(infile, outprefix):
    '''Pairs up capillary reads by name prefix. Writes outprefix.paired.gz
    (fwd/rev interleaved, renamed /1 and /2) and outprefix.unpaired.gz.'''
    # hash the sequences, only taking longest where an end has been sequenced more than once
    fwd_seqs = {}
    rev_seqs = {}
    unpaired_seqs = {}
    store_for_direction = {'fwd': fwd_seqs, 'rev': rev_seqs}

    for seq in sequences.file_reader(infile):
        id_info = seq.split_capillary_id()
        direction = id_info['dir']
        key = id_info['prefix']
        if direction == 'fwd':
            seq.id = key + '/1'
        elif direction == 'rev':
            seq.id = key + '/2'
        else:
            seq.id = key
        store = store_for_direction.get(direction, unpaired_seqs)

        # keep only the longest read sequenced for each end
        if key not in store or len(store[key]) < len(seq):
            store[key] = copy.copy(seq)

    # write the output files
    f_pe = utils.open_file_write(outprefix + '.paired.gz')
    f_up = utils.open_file_write(outprefix + '.unpaired.gz')

    for name in fwd_seqs:
        if name in rev_seqs:
            print(fwd_seqs[name], file=f_pe)
            print(rev_seqs[name], file=f_pe)
            del rev_seqs[name]
        else:
            print(fwd_seqs[name], file=f_up)

    # anything left in rev_seqs had no forward mate
    for leftover in rev_seqs.values():
        print(leftover, file=f_up)

    for leftover in unpaired_seqs.values():
        print(leftover, file=f_up)

    utils.close(f_pe)
    utils.close(f_up)
73 | ||
74 | ||
def count_sequences(infile):
    '''Returns the number of sequences in a file'''
    return sum(1 for _ in sequences.file_reader(infile))
82 | ||
83 | ||
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    '''Splits an interleaved (fwd/rev alternating) file into two mate files.
    Raises Error on an odd number of sequences. fasta_out=True forces FASTA
    output regardless of input format.'''
    seq_reader = sequences.file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    f_2 = utils.open_file_write(outfile_2)
    for seq in seq_reader:
        if fasta_out:
            print(sequences.Fasta(seq.id, seq.seq), file=f_1)
        else:
            print(seq, file=f_1)
        try:
            # NOTE(review): the discarded next() appears to rely on the reader
            # yielding the same object mutated in place (see get_next_from_file),
            # so 'seq' now holds the mate read -- do not "fix" by removing it
            next(seq_reader)
        except StopIteration:
            utils.close(f_1)
            utils.close(f_2)
            raise Error('Error getting mate for sequence. Cannot continue')
        if fasta_out:
            print(sequences.Fasta(seq.id, seq.seq), file=f_2)
        else:
            print(seq, file=f_2)

    utils.close(f_1)
    utils.close(f_2)
106 | ||
107 | ||
def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False, rename_file=None, suffix=None):
    '''Renames sequences 1,2,3,... starting from start_index. Optionally keeps
    /1 and /2 Illumina suffixes, writes an old->new mapping to rename_file,
    and appends 'suffix' to every new name.'''
    fout_seqs = utils.open_file_write(outfile)
    sequence_suffixes = ['/1', '/2'] if keep_illumina_suffix else []

    fout_rename = None
    if rename_file is not None:
        fout_rename = utils.open_file_write(rename_file)
        print('#old\tnew', file=fout_rename)

    for counter, seq in enumerate(sequences.file_reader(infile), start=start_index):
        old_id = seq.id
        seq.id = str(counter)

        # carry over an Illumina mate suffix if the old name had one
        for suff in sequence_suffixes:
            if old_id.endswith(suff):
                seq.id += suff
                break

        if fout_rename is not None:
            print(old_id, seq.id, sep='\t', file=fout_rename)

        if suffix is not None:
            seq.id += suffix

        print(seq, file=fout_seqs)

    utils.close(fout_seqs)

    if fout_rename is not None:
        utils.close(fout_rename)
145 | ||
146 | ||
def expand_nucleotides(infile, outfile):
    '''Writes a new file in which each sequence with ambiguous nucleotides is
    replaced by every unambiguous version of it (via seq.expand_nucleotides());
    sequences with no ambiguity are written unchanged.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        seqs = seq.expand_nucleotides()
        if len(seqs) > 1:
            for s in seqs:
                print(s, file=fout)
        else:
            print(seq, file=fout)

    # bug fix: the output handle was never closed, which can lose buffered
    # output (especially for gzipped files opened by utils.open_file_write)
    utils.close(fout)
158 | ||
159 | ||
def trim_contigs(infile, outfile, trim):
    '''Trims 'trim' bases off both ends of every sequence and widens every gap
    by 'trim' Ns on each side. Sequences shorter than 2*trim, or left with no
    non-N bases after trimming, are dropped.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    # hoisted out of the per-sequence loop: the pattern is constant
    # (matches any non-N base)
    regex = re.compile('[^nN]')

    for seq in seq_reader:
        if len(seq) < 2 * trim:
            continue

        gaps = seq.gaps()
        bases = list(seq.seq)

        # extend the length of each gap
        for gap in gaps:
            left_start = max(gap.start - trim, 0)
            right_end = min(gap.end + trim + 1, len(seq))

            for i in range(left_start, gap.start):
                bases[i] = 'N'

            for i in range(gap.end, right_end):
                bases[i] = 'N'

        seq.seq = ''.join(bases)

        # trim start/end bases and tidy up any resulting Ns at either end of the trimmed seq
        seq.trim(trim, trim)
        seq.trim_Ns()

        # check that there is some non-N sequence left over
        if regex.search(seq.seq) is not None:
            print(seq, file=fout)

    utils.close(fout)
194 | ||
195 | ||
def fastaq_to_fake_qual(infile, outfile, q=40):
    '''Writes a .qual style file giving every base of every input sequence the
    same quality score q, wrapped to sequences.Fasta.line_length per line.'''
    fout = utils.open_file_write(outfile)
    width = sequences.Fasta.line_length

    for seq in sequences.file_reader(infile):
        print('>' + seq.id, file=fout)
        if width == 0:
            # no line wrapping configured: one line of scores per sequence
            print(' '.join([str(q)] * len(seq)), file=fout)
        else:
            for start in range(0, len(seq), width):
                n_scores = min(width, len(seq) - start)
                print(' '.join([str(q)] * n_scores), file=fout)

    utils.close(fout)
209 | ||
210 | ||
def fasta_to_fastq(fasta_in, qual_in, outfile):
    '''Combines a fasta file and its matching .qual file into a fastq file.
    Raises Error if the sequence names do not match up.'''
    qual_reader = sequences.file_reader(qual_in, read_quals=True)
    f_out = utils.open_file_write(outfile)

    for seq in sequences.file_reader(fasta_in):
        qual = next(qual_reader)
        if seq.id != qual.id:
            utils.close(f_out)
            raise Error('Mismatch in names from fasta and qual file', seq.id, qual.id)

        # the qual record's "sequence" is whitespace-separated integer scores
        scores = [int(x) for x in qual.seq.split()]
        qual.seq = scores
        print(seq.to_Fastq(scores), file=f_out)

    utils.close(f_out)
226 | ||
227 | ||
def fastaq_to_mira_xml(infile, outfile):
    '''Writes a MIRA traceinfo XML file with one <trace> entry per input
    sequence, marking each whole read as usable (quality clip right = read
    length, vector clip left = 1)'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    print('<?xml version="1.0"?>', '<trace_volume>', sep='\n', file=fout)

    for seq in seq_reader:
        print(' <trace>',
              ' <trace_name>' + seq.id + '</trace_name>',
              ' <clip_quality_right>' + str(len(seq)) + '</clip_quality_right>',
              ' <clip_vector_left>1</clip_vector_left>',
              ' </trace>', sep='\n', file=fout)


    print('</trace_volume>', file=fout)
    utils.close(fout)
243 | ||
244 | ||
def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
    '''Writes a GFF file of every ORF of at least min_length bases found in the input sequences'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        for coords, revcomp in seq.all_orfs(min_length=min_length):
            strand = '-' if revcomp else '+'
            # GFF coordinates are 1-based, inclusive
            print(seq.id, tool_name, 'CDS', coords.start+1, coords.end+1, '.', strand, '.', sep='\t', file=fout)

    utils.close(fout)
259 | ||
260 | ||
def file_to_dict(infile, d):
    '''Loads every sequence from infile into dict d, keyed by sequence id.
    A later sequence overwrites an earlier one with the same id.'''
    for seq in sequences.file_reader(infile):
        # copy because the reader reuses/mutates its yielded object
        d[seq.id] = copy.copy(seq)
265 | ||
266 | ||
def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
    ):
    '''Writes the sequences from infile that pass every supplied filter:
    length within [minlength, maxlength], id matching regex, id listed in
    ids_file. invert=True writes the failing sequences instead.

    If mate_in/mate_out are given the input is treated as pairs (one read per
    file): with both_mates_pass=True a pair is kept only when both reads pass,
    otherwise when either read passes.

    Note: intentionally shadows the builtin filter() within this module.'''

    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        for line in f:
            ids_from_file.add(line.rstrip())
        utils.close(f)

    if mate_in:
        if mate_out is None:
            raise Error('Error in filter! mate_in provided. Must also provide mate_out')

        seq_reader_mate = sequences.file_reader(mate_in)
        f_out_mate = utils.open_file_write(mate_out)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    if regex is not None:
        r = re.compile(regex)


    def passes(seq):
        # a sequence passes when it satisfies every filter that was supplied
        return minlength <= len(seq) <= maxlength \
            and (regex is None or r.search(seq.id) is not None) \
            and (ids_file is None or seq.id in ids_from_file)

    for seq in seq_reader:
        seq_passes = passes(seq)
        if mate_in:
            try:
                seq_mate = next(seq_reader_mate)
            except StopIteration:
                # bug fix: was a bare 'except:', which also hid parse errors
                # (and KeyboardInterrupt) behind this misleading message
                utils.close(f_out)
                raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

            mate_passes = passes(seq_mate)
            want_the_pair = (seq_passes and mate_passes) \
                or (( seq_passes or mate_passes) and not both_mates_pass)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
        elif seq_passes != invert:
            print(seq, file=f_out)
    utils.close(f_out)
    if mate_in:
        utils.close(f_out_mate)
325 | ||
326 | ||
def get_ids(infile, outfile):
    '''Writes the id of every sequence in infile to outfile, one per line'''
    f_out = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        print(seq.id, file=f_out)
    utils.close(f_out)
333 | ||
334 | ||
def get_seqs_flanking_gaps(infile, outfile, left, right):
    '''Writes a TSV of the bases flanking each gap in each sequence:
    id, gap_start, gap_end (1-based), left_bases, right_bases'''
    fout = utils.open_file_write(outfile)
    print('#id', 'gap_start', 'gap_end', 'left_bases', 'right_bases', sep='\t', file=fout)

    for seq in sequences.file_reader(infile):
        for gap in seq.gaps():
            # clamp flank coordinates to the ends of the sequence
            flank_start = max(gap.start - left, 0)
            flank_end = min(gap.end + right + 1, len(seq))
            print(seq.id,
                  gap.start + 1,
                  gap.end + 1,
                  seq.seq[flank_start:gap.start],
                  seq.seq[gap.end + 1:flank_end],
                  sep='\t', file=fout)

    utils.close(fout)
355 | ||
356 | ||
def interleave(infile_1, infile_2, outfile):
    '''Interleaves the two mate files into one output file: file1[0], file2[0],
    file1[1], ... Raises Error if the files have different numbers of sequences.'''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            # bug fix: was a bare 'except:', which also hid parse errors
            # behind the "getting mate" message; file 2 ran out first
            utils.close(f_out)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        print(seq_1, file=f_out)
        print(seq_2, file=f_out)

    # file 2 must also be exhausted, otherwise the files were different lengths
    try:
        seq_2 = next(seq_reader_2)
    except StopIteration:
        seq_2 = None

    if seq_2 is not None:
        utils.close(f_out)
        raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')

    utils.close(f_out)
382 | ||
383 | ||
def make_random_contigs(contigs, length, outfile, name_by_letters=False, prefix='', seed=None, first_number=1):
    '''Makes a multi fasta file of random sequences, all the same length'''
    # note: the sequence of random.choice calls is kept identical to preserve
    # reproducibility for a given seed
    random.seed(a=seed)
    fout = utils.open_file_write(outfile)
    letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    next_letter = 0

    for contig_number in range(contigs):
        if name_by_letters:
            # cycle A..Z, wrapping around when there are more than 26 contigs
            name = letters[next_letter]
            next_letter = (next_letter + 1) % len(letters)
        else:
            name = str(contig_number + first_number)

        bases = ''.join([random.choice('ACGT') for _ in range(length)])
        print(sequences.Fasta(prefix + name, bases), file=fout)

    utils.close(fout)
404 | ||
405 | ||
def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, tile_step=10000, gamma_shape=1.2, gamma_scale=6000, coverage=10, gamma_min_length=20000, seed=None, ins_skip=None, ins_window=None,):
    '''Simulates long reads from each input sequence, written as fasta.

    method is one of:
      'tiling'  - reads of fixed_read_length starting every tile_step bases
      'gamma'   - read lengths drawn from gamma(gamma_shape, scale=gamma_scale),
                  rejection-sampled until >= gamma_min_length and <= the
                  sequence length, at random start positions, until 'coverage'
                  is (approximately) reached
      'uniform' - reads of fixed_read_length at random start positions until
                  'coverage' is (approximately) reached

    ins_skip and ins_window must be given together; when set, insertions are
    added to each read via Fasta.add_insertions. Sequences too short for the
    chosen method are skipped with a warning on stderr.'''
    assert method in ['tiling', 'gamma', 'uniform']
    # ins_skip and ins_window must be both unset or both set
    assert ins_skip == ins_window == None or None not in [ins_skip, ins_window]
    if seed is not None:
        random.seed(a=seed)
    seq_reader = sequences.file_reader(infile)
    f = utils.open_file_write(outfile)

    for seq in seq_reader:
        if method == 'tiling':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            for i in range(0, len(seq), tile_step):
                end = min(len(seq), i + fixed_read_length)
                # read names are id_start_end (1-based, inclusive)
                fa = sequences.Fasta('_'.join([seq.id, str(i + 1), str(end)]), seq[i:end])
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
                if end >= len(seq):
                    break
        elif method == 'gamma':
            if len(seq) < gamma_min_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            # stop within half a minimum read length of the coverage target
            while total_read_length < coverage * len(seq) - 0.5 * gamma_min_length:
                read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
                # rejection-sample until the drawn length is usable
                while read_length < gamma_min_length or read_length > len(seq):
                    read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))

                start = random.randint(0, len(seq) - read_length)
                end = start + read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                # coverage is counted before insertions are added
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
        elif method == 'uniform':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * fixed_read_length:
                start = random.randint(0, len(seq) - fixed_read_length)
                end = start + fixed_read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)


    utils.close(f)
460 | ||
461 | ||
def mean_length(infile, limit=None):
    '''Returns the mean length of the sequences in the input file. By default uses all sequences. To limit to the first N sequences, use limit=N'''
    lengths = []
    for seq in sequences.file_reader(infile):
        lengths.append(len(seq))
        if limit is not None and len(lengths) >= limit:
            break

    # an empty input file is a caller error
    assert len(lengths) > 0
    return sum(lengths) / len(lengths)
475 | ||
476 | ||
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Takes a multi fasta or fastq file and writes a new file that contains just one sequence, with the original sequences catted together, preserving their order'''
    seqs = [copy.copy(seq) for seq in sequences.file_reader(infile)]
    new_seq = ''.join(seq.seq for seq in seqs)

    # output type follows the input type (quality strings are concatenated too)
    if type(seqs[0]) == sequences.Fastq:
        new_qual = ''.join(seq.qual for seq in seqs)
        merged = sequences.Fastq(seqname, new_seq, new_qual)
    else:
        merged = sequences.Fasta(seqname, new_seq)

    f = utils.open_file_write(outfile)
    print(merged, file=f)
    utils.close(f)
498 | ||
499 | ||
def reverse_complement(infile, outfile):
    '''Reverse complements every sequence in infile, writing the results to outfile'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.revcomp()
        print(seq, file=fout)
    utils.close(fout)
509 | ||
510 | ||
def scaffolds_to_contigs(infile, outfile, number_contigs=False):
    '''Makes a file of contigs from scaffolds by splitting at every N.
       Use number_contigs=True to add .1, .2, etc onto end of each
       contig, instead of default to append coordinates.'''
    fout = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        for contig_number, contig in enumerate(seq.contig_coords(), start=1):
            if number_contigs:
                name = seq.id + '.' + str(contig_number)
            else:
                # default naming: id.start.end (1-based, inclusive)
                name = '.'.join([seq.id, str(contig.start + 1), str(contig.end + 1)])
            print(sequences.Fasta(name, seq[contig.start:contig.end+1]), file=fout)

    utils.close(fout)
530 | ||
531 | ||
def search_for_seq(infile, outfile, search_string):
    '''Reports every hit of search_string (both strands) in every sequence,
    as TSV lines: id, 1-based position, strand'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        for position, strand in seq.search(search_string):
            print(seq.id, position + 1, strand, sep='\t', file=fout)
    utils.close(fout)
542 | ||
543 | ||
def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_length=50, check_revcomp=False):
    '''Trims any sequence from to_trim_file off the start of each read in the
    pair of files infile_1/infile_2 (and, if check_revcomp, its reverse
    complement off the end). Pairs where either trimmed read is shorter than
    min_length are dropped. Raises Error if the files have different lengths.'''
    to_trim_seqs = {}
    file_to_dict(to_trim_file, to_trim_seqs)
    trim_seqs = [x.seq for x in to_trim_seqs.values()]
    if check_revcomp:
        for seq in to_trim_seqs.values():
            seq.revcomp()
        trim_seqs_revcomp = [x.seq for x in to_trim_seqs.values()]
    else:
        trim_seqs_revcomp = []

    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out_1 = utils.open_file_write(outfile_1)
    f_out_2 = utils.open_file_write(outfile_2)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            # bug fix: previously closed an undefined name 'f_out' here, which
            # raised NameError and masked the real problem; close both real
            # output handles, and only catch the exhausted-iterator case
            utils.close(f_out_1)
            utils.close(f_out_2)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        for seq in seq_1, seq_2:
            # trim at most one leading adapter...
            for trim_seq in trim_seqs:
                if seq.seq.startswith(trim_seq):
                    seq.trim(len(trim_seq),0)
                    break

            # ...and at most one trailing (reverse complemented) adapter
            for trim_seq in trim_seqs_revcomp:
                if seq.seq.endswith(trim_seq):
                    seq.trim(0,len(trim_seq))
                    break

        if len(seq_1) >= min_length and len(seq_2) >= min_length:
            print(seq_1, file=f_out_1)
            print(seq_2, file=f_out_2)


    utils.close(f_out_1)
    utils.close(f_out_2)
585 | ||
586 | ||
def sort_by_size(infile, outfile, smallest_first=False):
    '''Sorts input sequence file by biggest sequence first, writes sorted output file. Set smallest_first=True to have smallest first'''
    seqs = {}
    file_to_dict(infile, seqs)
    ordered = sorted(seqs.values(), key=len, reverse=not smallest_first)
    fout = utils.open_file_write(outfile)
    for seq in ordered:
        print(seq, file=fout)
    utils.close(fout)
597 | ||
598 | ||
def translate(infile, outfile, frame=0):
    '''Translates every sequence into amino acids (from the given frame) and writes the results to outfile'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        print(seq.translate(frame=frame), file=fout)
    utils.close(fout)
607 | ||
608 | ||
def trim(infile, outfile, start, end):
    '''Trims 'start' bases off the beginning and 'end' bases off the end of
    every sequence; sequences trimmed to nothing are not written'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.trim(start, end)
        if len(seq):
            print(seq, file=fout)
    utils.close(fout)
619 | ||
620 | ||
def trim_Ns_at_end(infile, outfile):
    '''Strips leading/trailing Ns from every sequence; sequences that were
    all Ns are not written'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.trim_Ns()
        if len(seq):
            print(seq, file=fout)
    utils.close(fout)
631 | ||
632 | ||
def lengths_from_fai(fai_file, d):
    '''Reads a samtools .fai index file, filling dict d with id -> sequence length'''
    f = utils.open_file_read(fai_file)
    for line in f:
        # first two columns of a .fai line are the name and the length
        fields = line.rstrip().split()
        d[fields[0]] = int(fields[1])
    utils.close(f)
639 | ||
640 | ||
def split_by_base_count(infile, outfiles_prefix, max_bases, max_seqs=None):
    '''Splits a fasta/q file into separate files, file size determined by number of bases.

    Puts <= max_bases in each split file The exception is a single sequence >=max_bases
    is put in its own file. This does not split sequences.
    '''
    seq_reader = sequences.file_reader(infile)
    base_count = 0   # bases written to the currently open split file
    file_count = 1   # numeric suffix of the next output file
    seq_count = 0    # sequences written to the currently open split file
    fout = None
    # NOTE(review): with an empty input file fout stays None, so the final
    # utils.close(fout) receives None -- confirm utils.close tolerates that
    if max_seqs is None:
        max_seqs = float('inf')

    for seq in seq_reader:
        if base_count == 0:
            # nothing open (first sequence, or previous file was an
            # oversize single-sequence file): open a new split file
            fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
            file_count += 1

        if base_count + len(seq) > max_bases or seq_count >= max_seqs:
            if base_count == 0:
                # single sequence bigger than max_bases gets its own file.
                # NOTE(review): seq_count is not reset in this branch, which
                # can misclassify the next sequence when max_seqs is set --
                # confirm intended
                print(seq, file=fout)
                utils.close(fout)
            else:
                # current file is full: start a new one containing this sequence
                utils.close(fout)
                fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                print(seq, file=fout)
                base_count = len(seq)
                file_count += 1
                seq_count = 1
        else:
            base_count += len(seq)
            seq_count += 1
            print(seq, file=fout)

    utils.close(fout)
677 | ||
678 | ||
def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits fasta/q file into separate files, with up to (chunk_size + tolerance)
    bases in each file. Also writes outfiles_prefix.coords, a TSV mapping each
    chunk id to its original sequence id and the offset of the chunk start.'''
    file_count = 1
    small_sequences = []  # sequences shorter than chunk_size
    seq_reader = sequences.file_reader(infile)
    f_coords = utils.open_file_write(outfiles_prefix + '.coords')

    for seq in seq_reader:
        if skip_if_all_Ns and seq.is_all_Ns():
            continue
        if len(seq) < chunk_size:
            # too short to chunk: batch with the other small sequences at the end
            small_sequences.append(copy.copy(seq))
        elif len(seq) <= chunk_size + tolerance:
            # fits in one file as-is
            f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
            print(seq, file=f)
            utils.close(f)
            file_count += 1
        else:
            # make list of chunk coords
            chunks = [(x,x+chunk_size) for x in range(0, len(seq), chunk_size)]
            # NOTE(review): this only clamps the final chunk when it overruns by
            # more than one base (end - 1 > len), so an end of len(seq)+1 slips
            # through; slicing still works but the reported end coordinate can
            # be one too big -- behaviour kept as-is, confirm against upstream
            if chunks[-1][1] - 1 > len(seq):
                chunks[-1] = (chunks[-1][0], len(seq))
            # merge a short final chunk into the one before it
            if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) <= tolerance:
                chunks[-2] = (chunks[-2][0], chunks[-1][1])
                chunks.pop()

            # write one output file per chunk
            offset = 0
            for chunk in chunks:
                if not(skip_if_all_Ns and seq.is_all_Ns(start=chunk[0], end=chunk[1]-1)):
                    f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                    chunk_id = seq.id + ':' + str(chunk[0]+1) + '-' + str(chunk[1])
                    print(sequences.Fasta(chunk_id, seq[chunk[0]:chunk[1]]), file=f)
                    print(chunk_id, seq.id, offset, sep='\t', file=f_coords)
                    utils.close(f)
                    file_count += 1

                offset += chunk[1] - chunk[0]

    # write files of small sequences
    if len(small_sequences):
        f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
        file_count += 1
        base_count = 0
        for seq in small_sequences:
            if base_count > 0 and base_count + len(seq) > chunk_size + tolerance:
                utils.close(f)
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                file_count += 1
                base_count = 0

            print(seq, file=f)
            base_count += len(seq)

        utils.close(f)

    # bug fix: the .coords handle was never closed, which can lose buffered
    # output; also removed the unused local 'coords'
    utils.close(f_coords)
735 | ||
736 | ||
def split_by_fixed_size_onefile(infile, outfile, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits each sequence in infile into chunks of fixed size, last chunk can be up to
    (chunk_size + tolerance) in length'''
    writer = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        seq_length = len(seq)
        for start in range(0, seq_length, chunk_size):
            # absorb a short remainder into the final chunk
            if start + chunk_size + tolerance >= seq_length:
                end = seq_length
            else:
                end = start + chunk_size

            piece = seq.subseq(start, end)
            if not (skip_if_all_Ns and piece.is_all_Ns()):
                # tag the chunk id with its 1-based start and end coordinates
                piece.id += '.' + str(start + 1) + '_' + str(end)
                print(piece, file=writer)

            if end == seq_length:
                break

    utils.close(writer)
758 | ||
759 | ||
def replace_bases(infile, outfile, old, new):
    '''Writes a copy of infile to outfile, replacing base old with new in every sequence'''
    writer = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        seq.replace_bases(old, new)
        print(seq, file=writer)

    utils.close(writer)
769 | ||
770 | ||
def strip_illumina_suffix(infile, outfile):
    '''Writes a copy of infile to outfile, with the Illumina suffix stripped from each sequence name'''
    writer = utils.open_file_write(outfile)

    for seq in sequences.file_reader(infile):
        seq.strip_illumina_suffix()
        print(seq, file=writer)

    utils.close(writer)
780 | ||
781 | ||
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False):
    '''Converts infile to FASTA format, writing the result to outfile.

    infile  -- input fasta/q file
    outfile -- output FASTA file
    line_length -- number of bases per line of output sequence
    strip_after_first_whitespace -- if True, truncate each name at its first whitespace

    Temporarily overrides the module-global sequences.Fasta.line_length; the
    original value is restored even if an error occurs mid-conversion (the
    original code would have left the global changed on an exception).
    '''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            # isinstance instead of type() ==, so Fastq subclasses convert too
            if isinstance(seq, sequences.Fastq):
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)

        utils.close(f_out)
    finally:
        sequences.Fasta.line_length = original_line_length
799 | ||
800 | ||
def to_fasta_union(infile, outfile, seqname='union'):
    '''Writes a single FASTA sequence to outfile, made by concatenating all sequences of infile'''
    pieces = [seq.seq for seq in sequences.file_reader(infile)]

    writer = utils.open_file_write(outfile)
    print(sequences.Fasta(seqname, ''.join(pieces)), file=writer)
    utils.close(writer)
811 | ||
812 | ||
813 | ||
def to_unique_by_id(infile, outfile):
    '''Writes infile to outfile, keeping one sequence per name (the longest),
    in the order each name is first seen. Zero-length sequences are dropped.'''
    best_by_id = {}       # name -> longest sequence seen so far with that name
    first_seen_order = []

    for seq in sequences.file_reader(infile):
        if len(seq) == 0:
            continue
        if seq.id not in best_by_id:
            first_seen_order.append(seq.id)
            best_by_id[seq.id] = copy.copy(seq)
        elif len(seq) > len(best_by_id[seq.id]):
            best_by_id[seq.id] = copy.copy(seq)

    # write the survivors in first-seen order
    writer = utils.open_file_write(outfile)
    for name in first_seen_order:
        print(best_by_id[name], file=writer)
    utils.close(writer)
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import os | |
3 | import unittest | |
4 | from pyfastaq import caf, utils, sequences | |
5 | ||
# Locate the test data directory relative to the installed caf module,
# so the tests work regardless of the current working directory.
modules_dir = os.path.dirname(os.path.abspath(caf.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
8 | ||
class TestCaf(unittest.TestCase):
    def test_get_next_from_file(self):
        '''Test get_next_from_file()'''

        caf_fh = utils.open_file_read(os.path.join(data_dir, 'caf_test.caf'))
        parsed = caf.Caf()

        # first record: has insert size, ligation, clone and quality clipping
        parsed.get_next_from_file(caf_fh)
        expected = caf.Caf()
        expected.id = 'read1.p1k'
        expected.seq = sequences.Fasta(expected.id, 'NACGTAN').to_Fastq([4, 24, 42, 43, 40, 30, 8])
        expected.insert_min = 2000
        expected.insert_max = 4000
        expected.ligation = '12345'
        expected.clone = 'clone1'
        expected.clip_start = 1
        expected.clip_end = 5
        self.assertEqual(parsed, expected)

        # second record: no quality clipping line, so clip coords are None
        parsed.get_next_from_file(caf_fh)
        expected = caf.Caf()
        expected.id = 'read2.p1k'
        expected.seq = sequences.Fasta(expected.id, 'CGACGTT').to_Fastq([9, 9, 40, 41, 42, 42, 4])
        expected.insert_min = 2000
        expected.insert_max = 4000
        expected.ligation = '23456'
        expected.clone = 'clone2'
        expected.clip_start = None
        expected.clip_end = None
        self.assertEqual(parsed, expected)

        utils.close(caf_fh)
43 | ||
44 | ||
# Allow running this test module directly, e.g. `python3 caf_test.py`.
if __name__ == '__main__':
    unittest.main()
0 | ||
1 | DNA : read1.p1k | |
2 | NACG | |
3 | TAN | |
4 | ||
5 | BaseQuality : read1.p1k | |
6 | 4 24 42 43 40 30 8 | |
7 | ||
8 | Sequence : read1.p1k | |
9 | Is_read | |
10 | SCF_File read1.p1kSCF | |
11 | Template read1 | |
12 | Insert_size 2000 4000 | |
13 | Ligation_no 12345 | |
14 | Primer Universal_primer | |
15 | Strand Forward | |
16 | Dye Dye_terminator | |
17 | Clone clone1 | |
18 | Seq_vec SVEC 1 15 puc19 | |
19 | Sequencing_vector "puc19" | |
20 | Clipping QUAL 2 6 | |
21 | ProcessStatus PASS | |
22 | Asped 2006-7-5 | |
23 | Unpadded | |
24 | Align_to_SCF 1 1272 1 1272 | |
25 | ||
26 | DNA : read2.p1k | |
27 | CG | |
28 | ACGTT | |
29 | ||
30 | BaseQuality : read2.p1k | |
31 | 9 9 40 41 42 42 4 | |
32 | ||
33 | Sequence : read2.p1k | |
34 | Is_read | |
35 | SCF_File read2.p1kSCF | |
36 | Template read2 | |
37 | Insert_size 2000 4000 | |
38 | Ligation_no 23456 | |
39 | Primer Universal_primer | |
40 | Strand Forward | |
41 | Dye Dye_terminator | |
42 | Clone clone2 | |
43 | Seq_vec SVEC 1 32 puc19 | |
44 | Sequencing_vector "puc19" | |
45 | ProcessStatus PASS | |
46 | Unpadded | |
47 | Align_to_SCF 1 1347 1 1347 |
0 | ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
1 | XX | |
2 | AC X56734; S46826; | |
3 | XX | |
4 | DT 12-SEP-1991 (Rel. 29, Created) | |
5 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
6 | XX | |
7 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
8 | XX | |
9 | KW beta-glucosidase. | |
10 | XX | |
11 | OS Trifolium repens (white clover) | |
12 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
13 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
14 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
15 | XX | |
16 | RN [5] | |
17 | RP 1-1859 | |
18 | RX DOI; 10.1007/BF00039495. | |
19 | RX PUBMED; 1907511. | |
20 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
21 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
22 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
23 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
24 | XX | |
25 | RN [6] | |
26 | RP 1-1859 | |
27 | RA Hughes M.A.; | |
28 | RT ; | |
29 | RL Submitted (19-NOV-1990) to the INSDC. | |
30 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
31 | RL Upon Tyne, NE2 4HH, UK | |
32 | XX | |
33 | DR EuropePMC; PMC99098; 11752244. | |
34 | XX | |
35 | FH Key Location/Qualifiers | |
36 | FH | |
37 | FT source 1..1859 | |
38 | FT /organism="Trifolium repens" | |
39 | FT /mol_type="mRNA" | |
40 | FT /clone_lib="lambda gt10" | |
41 | FT /clone="TRE361" | |
42 | FT /tissue_type="leaves" | |
43 | FT /db_xref="taxon:3899" | |
44 | FT mRNA 1..1859 | |
45 | FT /experiment="experimental evidence, no additional details | |
46 | FT recorded" | |
47 | FT CDS 14..1495 | |
48 | FT /product="beta-glucosidase" | |
49 | FT /EC_number="3.2.1.21" | |
50 | FT /note="non-cyanogenic" | |
51 | FT /db_xref="GOA:P26204" | |
52 | FT /db_xref="InterPro:IPR001360" | |
53 | FT /db_xref="InterPro:IPR013781" | |
54 | FT /db_xref="InterPro:IPR017853" | |
55 | FT /db_xref="InterPro:IPR018120" | |
56 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
57 | FT /protein_id="CAA40058.1" | |
58 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
59 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
60 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
61 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
62 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
63 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
64 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
65 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
66 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
67 | XX | |
68 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
69 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
70 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
71 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
72 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
73 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
74 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
75 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
76 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
77 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
78 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
79 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
80 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
81 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
82 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
83 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
84 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
85 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
86 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
87 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
88 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
89 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
90 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
91 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
92 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
93 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
94 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
95 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
96 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
97 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
98 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
99 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 | |
100 | // | |
101 | ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
102 | XX | |
103 | AC X56734; S46826; | |
104 | XX | |
105 | DT 12-SEP-1991 (Rel. 29, Created) | |
106 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
107 | XX | |
108 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
109 | XX | |
110 | KW beta-glucosidase. | |
111 | XX | |
112 | OS Trifolium repens (white clover) | |
113 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
114 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
115 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
116 | XX | |
117 | RN [5] | |
118 | RP 1-1859 | |
119 | RX DOI; 10.1007/BF00039495. | |
120 | RX PUBMED; 1907511. | |
121 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
122 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
123 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
124 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
125 | XX | |
126 | RN [6] | |
127 | RP 1-1859 | |
128 | RA Hughes M.A.; | |
129 | RT ; | |
130 | RL Submitted (19-NOV-1990) to the INSDC. | |
131 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
132 | RL Upon Tyne, NE2 4HH, UK | |
133 | XX | |
134 | DR EuropePMC; PMC99098; 11752244. | |
135 | XX | |
136 | FH Key Location/Qualifiers | |
137 | FH | |
138 | FT source 1..1859 | |
139 | FT /organism="Trifolium repens" | |
140 | FT /mol_type="mRNA" | |
141 | FT /clone_lib="lambda gt10" | |
142 | FT /clone="TRE361" | |
143 | FT /tissue_type="leaves" | |
144 | FT /db_xref="taxon:3899" | |
145 | FT mRNA 1..1859 | |
146 | FT /experiment="experimental evidence, no additional details | |
147 | FT recorded" | |
148 | FT CDS 14..1495 | |
149 | FT /product="beta-glucosidase" | |
150 | FT /EC_number="3.2.1.21" | |
151 | FT /note="non-cyanogenic" | |
152 | FT /db_xref="GOA:P26204" | |
153 | FT /db_xref="InterPro:IPR001360" | |
154 | FT /db_xref="InterPro:IPR013781" | |
155 | FT /db_xref="InterPro:IPR017853" | |
156 | FT /db_xref="InterPro:IPR018120" | |
157 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
158 | FT /protein_id="CAA40058.1" | |
159 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
160 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
161 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
162 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
163 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
164 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
165 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
166 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
167 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
168 | XX | |
169 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
170 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
171 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
172 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
173 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
174 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
175 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
176 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
177 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
178 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
179 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
180 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
181 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
182 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
183 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
184 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
185 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
186 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
187 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
188 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
189 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
190 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
191 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
192 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
193 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
194 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
195 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
196 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
197 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
198 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
199 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
200 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859 | |
201 | // | |
202 |
0 | ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
1 | XX | |
2 | AC X56734; S46826; | |
3 | XX | |
4 | DT 12-SEP-1991 (Rel. 29, Created) | |
5 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
6 | XX | |
7 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
8 | XX | |
9 | KW beta-glucosidase. | |
10 | XX | |
11 | OS Trifolium repens (white clover) | |
12 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
13 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
14 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
15 | XX | |
16 | RN [5] | |
17 | RP 1-1859 | |
18 | RX DOI; 10.1007/BF00039495. | |
19 | RX PUBMED; 1907511. | |
20 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
21 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
22 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
23 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
24 | XX | |
25 | RN [6] | |
26 | RP 1-1859 | |
27 | RA Hughes M.A.; | |
28 | RT ; | |
29 | RL Submitted (19-NOV-1990) to the INSDC. | |
30 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
31 | RL Upon Tyne, NE2 4HH, UK | |
32 | XX | |
33 | DR EuropePMC; PMC99098; 11752244. | |
34 | XX | |
35 | FH Key Location/Qualifiers | |
36 | FH | |
37 | FT source 1..1859 | |
38 | FT /organism="Trifolium repens" | |
39 | FT /mol_type="mRNA" | |
40 | FT /clone_lib="lambda gt10" | |
41 | FT /clone="TRE361" | |
42 | FT /tissue_type="leaves" | |
43 | FT /db_xref="taxon:3899" | |
44 | FT mRNA 1..1859 | |
45 | FT /experiment="experimental evidence, no additional details | |
46 | FT recorded" | |
47 | FT CDS 14..1495 | |
48 | FT /product="beta-glucosidase" | |
49 | FT /EC_number="3.2.1.21" | |
50 | FT /note="non-cyanogenic" | |
51 | FT /db_xref="GOA:P26204" | |
52 | FT /db_xref="InterPro:IPR001360" | |
53 | FT /db_xref="InterPro:IPR013781" | |
54 | FT /db_xref="InterPro:IPR017853" | |
55 | FT /db_xref="InterPro:IPR018120" | |
56 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
57 | FT /protein_id="CAA40058.1" | |
58 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
59 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
60 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
61 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
62 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
63 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
64 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
65 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
66 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
67 | XX | |
68 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
69 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
70 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
71 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
72 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
73 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
74 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
75 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
76 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
77 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
78 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
79 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
80 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
81 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
82 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
83 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
84 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
85 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
86 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
87 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
88 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
89 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
90 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
91 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
92 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
93 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
94 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
95 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
96 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
97 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
98 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
99 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 | |
100 | // | |
101 | ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
102 | XX | |
103 | AC X56734; S46826; | |
104 | XX | |
105 | DT 12-SEP-1991 (Rel. 29, Created) | |
106 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
107 | XX | |
108 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
109 | XX | |
110 | KW beta-glucosidase. | |
111 | XX | |
112 | OS Trifolium repens (white clover) | |
113 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
114 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
115 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
116 | XX | |
117 | RN [5] | |
118 | RP 1-1859 | |
119 | RX DOI; 10.1007/BF00039495. | |
120 | RX PUBMED; 1907511. | |
121 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
122 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
123 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
124 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
125 | XX | |
126 | RN [6] | |
127 | RP 1-1859 | |
128 | RA Hughes M.A.; | |
129 | RT ; | |
130 | RL Submitted (19-NOV-1990) to the INSDC. | |
131 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
132 | RL Upon Tyne, NE2 4HH, UK | |
133 | XX | |
134 | DR EuropePMC; PMC99098; 11752244. | |
135 | XX | |
136 | FH Key Location/Qualifiers | |
137 | FH | |
138 | FT source 1..1859 | |
139 | FT /organism="Trifolium repens" | |
140 | FT /mol_type="mRNA" | |
141 | FT /clone_lib="lambda gt10" | |
142 | FT /clone="TRE361" | |
143 | FT /tissue_type="leaves" | |
144 | FT /db_xref="taxon:3899" | |
145 | FT mRNA 1..1859 | |
146 | FT /experiment="experimental evidence, no additional details | |
147 | FT recorded" | |
148 | FT CDS 14..1495 | |
149 | FT /product="beta-glucosidase" | |
150 | FT /EC_number="3.2.1.21" | |
151 | FT /note="non-cyanogenic" | |
152 | FT /db_xref="GOA:P26204" | |
153 | FT /db_xref="InterPro:IPR001360" | |
154 | FT /db_xref="InterPro:IPR013781" | |
155 | FT /db_xref="InterPro:IPR017853" | |
156 | FT /db_xref="InterPro:IPR018120" | |
157 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
158 | FT /protein_id="CAA40058.1" | |
159 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
160 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
161 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
162 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
163 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
164 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
165 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
166 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
167 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
168 | XX | |
169 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
170 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
171 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
172 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
173 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
174 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
175 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
176 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
177 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
178 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
179 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
180 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
181 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
182 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
183 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
184 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
185 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
186 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
187 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
188 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
189 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
190 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
191 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
192 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
193 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
194 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
195 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
196 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
197 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
198 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
199 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859 | |
200 | // | |
201 |
0 | ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
1 | XX | |
2 | AC X56734; S46826; | |
3 | XX | |
4 | DT 12-SEP-1991 (Rel. 29, Created) | |
5 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
6 | XX | |
7 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
8 | XX | |
9 | KW beta-glucosidase. | |
10 | XX | |
11 | OS Trifolium repens (white clover) | |
12 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
13 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
14 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
15 | XX | |
16 | RN [5] | |
17 | RP 1-1859 | |
18 | RX DOI; 10.1007/BF00039495. | |
19 | RX PUBMED; 1907511. | |
20 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
21 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
22 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
23 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
24 | XX | |
25 | RN [6] | |
26 | RP 1-1859 | |
27 | RA Hughes M.A.; | |
28 | RT ; | |
29 | RL Submitted (19-NOV-1990) to the INSDC. | |
30 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
31 | RL Upon Tyne, NE2 4HH, UK | |
32 | XX | |
33 | DR EuropePMC; PMC99098; 11752244. | |
34 | XX | |
35 | FH Key Location/Qualifiers | |
36 | FH | |
37 | FT source 1..1859 | |
38 | FT /organism="Trifolium repens" | |
39 | FT /mol_type="mRNA" | |
40 | FT /clone_lib="lambda gt10" | |
41 | FT /clone="TRE361" | |
42 | FT /tissue_type="leaves" | |
43 | FT /db_xref="taxon:3899" | |
44 | FT mRNA 1..1859 | |
45 | FT /experiment="experimental evidence, no additional details | |
46 | FT recorded" | |
47 | FT CDS 14..1495 | |
48 | FT /product="beta-glucosidase" | |
49 | FT /EC_number="3.2.1.21" | |
50 | FT /note="non-cyanogenic" | |
51 | FT /db_xref="GOA:P26204" | |
52 | FT /db_xref="InterPro:IPR001360" | |
53 | FT /db_xref="InterPro:IPR013781" | |
54 | FT /db_xref="InterPro:IPR017853" | |
55 | FT /db_xref="InterPro:IPR018120" | |
56 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
57 | FT /protein_id="CAA40058.1" | |
58 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
59 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
60 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
61 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
62 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
63 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
64 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
65 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
66 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
67 | XX | |
68 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
69 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
70 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
71 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
72 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
73 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
74 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
75 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
76 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
77 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
78 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
79 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
80 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
81 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
82 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
83 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
84 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
85 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
86 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
87 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
88 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
89 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
90 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
91 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
92 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
93 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
94 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
95 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
96 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
97 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
98 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
99 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 | |
100 | ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
101 | XX | |
102 | AC X56734; S46826; | |
103 | XX | |
104 | DT 12-SEP-1991 (Rel. 29, Created) | |
105 | DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) | |
106 | XX | |
107 | DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase | |
108 | XX | |
109 | KW beta-glucosidase. | |
110 | XX | |
111 | OS Trifolium repens (white clover) | |
112 | OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; | |
113 | OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; | |
114 | OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. | |
115 | XX | |
116 | RN [5] | |
117 | RP 1-1859 | |
118 | RX DOI; 10.1007/BF00039495. | |
119 | RX PUBMED; 1907511. | |
120 | RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; | |
121 | RT "Nucleotide and derived amino acid sequence of the cyanogenic | |
122 | RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; | |
123 | RL Plant Mol. Biol. 17(2):209-219(1991). | |
124 | XX | |
125 | RN [6] | |
126 | RP 1-1859 | |
127 | RA Hughes M.A.; | |
128 | RT ; | |
129 | RL Submitted (19-NOV-1990) to the INSDC. | |
130 | RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle | |
131 | RL Upon Tyne, NE2 4HH, UK | |
132 | XX | |
133 | DR EuropePMC; PMC99098; 11752244. | |
134 | XX | |
135 | FH Key Location/Qualifiers | |
136 | FH | |
137 | FT source 1..1859 | |
138 | FT /organism="Trifolium repens" | |
139 | FT /mol_type="mRNA" | |
140 | FT /clone_lib="lambda gt10" | |
141 | FT /clone="TRE361" | |
142 | FT /tissue_type="leaves" | |
143 | FT /db_xref="taxon:3899" | |
144 | FT mRNA 1..1859 | |
145 | FT /experiment="experimental evidence, no additional details | |
146 | FT recorded" | |
147 | FT CDS 14..1495 | |
148 | FT /product="beta-glucosidase" | |
149 | FT /EC_number="3.2.1.21" | |
150 | FT /note="non-cyanogenic" | |
151 | FT /db_xref="GOA:P26204" | |
152 | FT /db_xref="InterPro:IPR001360" | |
153 | FT /db_xref="InterPro:IPR013781" | |
154 | FT /db_xref="InterPro:IPR017853" | |
155 | FT /db_xref="InterPro:IPR018120" | |
156 | FT /db_xref="UniProtKB/Swiss-Prot:P26204" | |
157 | FT /protein_id="CAA40058.1" | |
158 | FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI | |
159 | FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK | |
160 | FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ | |
161 | FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR | |
162 | FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD | |
163 | FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF | |
164 | FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ | |
165 | FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA | |
166 | FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" | |
167 | XX | |
168 | SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; | |
169 | aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 | |
170 | cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 | |
171 | tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 | |
172 | aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 | |
173 | tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 | |
174 | caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 | |
175 | ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 | |
176 | atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 | |
177 | ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 | |
178 | tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 | |
179 | gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 | |
180 | aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 | |
181 | aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 | |
182 | taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 | |
183 | gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 | |
184 | cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 | |
185 | gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 | |
186 | ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 | |
187 | acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 | |
188 | acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 | |
189 | gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 | |
190 | gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 | |
191 | agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 | |
192 | ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 | |
193 | taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 | |
194 | tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 | |
195 | ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 | |
196 | tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 | |
197 | aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 | |
198 | agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 | |
199 | tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859 | |
200 | // | |
201 |
0 | >seq1 | |
1 | aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcatt | |
2 | cacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgag | |
3 | tcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttga | |
4 | aggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaata | |
5 | tccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgcta | |
6 | caaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctc | |
7 | ttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaa | |
8 | atattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactct | |
9 | ttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccgg | |
10 | tgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagt | |
11 | gaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactagg | |
12 | aacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaac | |
13 | aggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgta | |
14 | taagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactg | |
15 | gttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttga | |
16 | cttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcg | |
17 | gcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatgg | |
18 | ttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttc | |
19 | acatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaa | |
20 | acatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatat | |
21 | gtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcct | |
22 | gcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtaga | |
23 | agaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcg | |
24 | ttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactg | |
25 | taatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaaga | |
26 | tggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaa | |
27 | ctagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagt | |
28 | tggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtg | |
29 | aagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtacc | |
30 | agaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatac | |
31 | tttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | |
32 | >seq2 | |
33 | aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcatt | |
34 | cacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgag | |
35 | tcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttga | |
36 | aggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaata | |
37 | tccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgcta | |
38 | caaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctc | |
39 | ttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaa | |
40 | atattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactct | |
41 | ttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccgg | |
42 | tgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagt | |
43 | gaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactagg | |
44 | aacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaac | |
45 | aggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgta | |
46 | taagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactg | |
47 | gttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttga | |
48 | cttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcg | |
49 | gcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatgg | |
50 | ttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttc | |
51 | acatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaa | |
52 | acatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatat | |
53 | gtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcct | |
54 | gcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtaga | |
55 | agaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcg | |
56 | ttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactg | |
57 | taatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaaga | |
58 | tggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaa | |
59 | ctagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagt | |
60 | tggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtg | |
61 | aagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtacc | |
62 | agaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatac | |
63 | tttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaccccccccc |
0 | >1 | |
1 | 40 40 40 | |
2 | 40 40 | |
3 | ||
4 | >2 | |
5 | 40 | |
6 | 40 | |
7 | ||
8 | 40 | |
9 | 40 40 | |
10 | >3 | |
11 | ||
12 | 40 40 40 40 40 | |
13 | ||
14 | >4 | |
15 | 40 40 40 40 40 | |
16 |
0 | >1 | |
1 | 40 40 40 | |
2 | 40 40 | |
3 | ||
4 | >3 | |
5 | 40 | |
6 | 40 | |
7 | ||
8 | 40 | |
9 | 40 40 | |
10 | >3 | |
11 | ||
12 | 40 40 40 40 40 | |
13 | ||
14 | >4 | |
15 | 40 40 40 40 40 | |
16 |
0 | @1 | |
1 | ACGTA | |
2 | + | |
3 | IIIII | |
4 | @2 | |
5 | ACGTA | |
6 | + | |
7 | IIIII | |
8 | @3 | |
9 | ACGTA | |
10 | + | |
11 | IIIII | |
12 | @4 | |
13 | ACGTA | |
14 | + | |
15 | IIIII |
0 | LOCUS NAME1 5028 bp DNA PLN 21-JUN-1999 | |
1 | DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p | |
2 | (AXL2) and Rev7p (REV7) genes, complete cds. | |
3 | ACCESSION U49845 | |
4 | VERSION U49845.1 GI:1293613 | |
5 | KEYWORDS . | |
6 | SOURCE Saccharomyces cerevisiae (baker's yeast) | |
7 | ORGANISM Saccharomyces cerevisiae | |
8 | Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; | |
9 | Saccharomycetales; Saccharomycetaceae; Saccharomyces. | |
10 | REFERENCE 1 (bases 1 to 5028) | |
11 | AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. | |
12 | TITLE Cloning and sequence of REV7, a gene whose function is required for | |
13 | DNA damage-induced mutagenesis in Saccharomyces cerevisiae | |
14 | JOURNAL Yeast 10 (11), 1503-1509 (1994) | |
15 | PUBMED 7871890 | |
16 | REFERENCE 2 (bases 1 to 5028) | |
17 | AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. | |
18 | TITLE Selection of axial growth sites in yeast requires Axl2p, a novel | |
19 | plasma membrane glycoprotein | |
20 | JOURNAL Genes Dev. 10 (7), 777-793 (1996) | |
21 | PUBMED 8846915 | |
22 | REFERENCE 3 (bases 1 to 5028) | |
23 | AUTHORS Roemer,T. | |
24 | TITLE Direct Submission | |
25 | JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New | |
26 | Haven, CT, USA | |
27 | FEATURES Location/Qualifiers | |
28 | source 1..5028 | |
29 | /organism="Saccharomyces cerevisiae" | |
30 | /db_xref="taxon:4932" | |
31 | /chromosome="IX" | |
32 | /map="9" | |
33 | CDS <1..206 | |
34 | /codon_start=3 | |
35 | /product="TCP1-beta" | |
36 | /protein_id="AAA98665.1" | |
37 | /db_xref="GI:1293614" | |
38 | /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA | |
39 | AEVLLRVDNIIRARPRTANRQHM" | |
40 | gene 687..3158 | |
41 | /gene="AXL2" | |
42 | CDS 687..3158 | |
43 | /gene="AXL2" | |
44 | /note="plasma membrane glycoprotein" | |
45 | /codon_start=1 | |
46 | /function="required for axial budding pattern of S. | |
47 | cerevisiae" | |
48 | /product="Axl2p" | |
49 | /protein_id="AAA98666.1" | |
50 | /db_xref="GI:1293615" | |
51 | /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF | |
52 | TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN | |
53 | VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE | |
54 | VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE | |
55 | TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV | |
56 | YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG | |
57 | DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ | |
58 | DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA | |
59 | NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA | |
60 | CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN | |
61 | NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ | |
62 | SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS | |
63 | YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK | |
64 | HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL | |
65 | VDFSNKSNVNVGQVKDIHGRIPEML" | |
66 | gene complement(3300..4037) | |
67 | /gene="REV7" | |
68 | CDS complement(3300..4037) | |
69 | /gene="REV7" | |
70 | /codon_start=1 | |
71 | /product="Rev7p" | |
72 | /protein_id="AAA98667.1" | |
73 | /db_xref="GI:1293616" | |
74 | /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ | |
75 | FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD | |
76 | KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR | |
77 | RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK | |
78 | LISGDDKILNGVYSQYEEGESIFGSLF" | |
79 | ORIGIN | |
80 | 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg | |
81 | 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct | |
82 | 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa | |
83 | 181 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc | |
84 | // | |
85 | LOCUS NAME2 5028 bp DNA PLN 21-JUN-1999 | |
86 | DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p | |
87 | (AXL2) and Rev7p (REV7) genes, complete cds. | |
88 | ACCESSION U49845 | |
89 | VERSION U49845.1 GI:1293613 | |
90 | KEYWORDS . | |
91 | SOURCE Saccharomyces cerevisiae (baker's yeast) | |
92 | ORGANISM Saccharomyces cerevisiae | |
93 | Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; | |
94 | Saccharomycetales; Saccharomycetaceae; Saccharomyces. | |
95 | REFERENCE 1 (bases 1 to 5028) | |
96 | AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. | |
97 | TITLE Cloning and sequence of REV7, a gene whose function is required for | |
98 | DNA damage-induced mutagenesis in Saccharomyces cerevisiae | |
99 | JOURNAL Yeast 10 (11), 1503-1509 (1994) | |
100 | PUBMED 7871890 | |
101 | REFERENCE 2 (bases 1 to 5028) | |
102 | AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. | |
103 | TITLE Selection of axial growth sites in yeast requires Axl2p, a novel | |
104 | plasma membrane glycoprotein | |
105 | JOURNAL Genes Dev. 10 (7), 777-793 (1996) | |
106 | PUBMED 8846915 | |
107 | REFERENCE 3 (bases 1 to 5028) | |
108 | AUTHORS Roemer,T. | |
109 | TITLE Direct Submission | |
110 | JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New | |
111 | Haven, CT, USA | |
112 | FEATURES Location/Qualifiers | |
113 | source 1..5028 | |
114 | /organism="Saccharomyces cerevisiae" | |
115 | /db_xref="taxon:4932" | |
116 | /chromosome="IX" | |
117 | /map="9" | |
118 | CDS <1..206 | |
119 | /codon_start=3 | |
120 | /product="TCP1-beta" | |
121 | /protein_id="AAA98665.1" | |
122 | /db_xref="GI:1293614" | |
123 | /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA | |
124 | AEVLLRVDNIIRARPRTANRQHM" | |
125 | gene 687..3158 | |
126 | /gene="AXL2" | |
127 | CDS 687..3158 | |
128 | /gene="AXL2" | |
129 | /note="plasma membrane glycoprotein" | |
130 | /codon_start=1 | |
131 | /function="required for axial budding pattern of S. | |
132 | cerevisiae" | |
133 | /product="Axl2p" | |
134 | /protein_id="AAA98666.1" | |
135 | /db_xref="GI:1293615" | |
136 | /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF | |
137 | TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN | |
138 | VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE | |
139 | VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE | |
140 | TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV | |
141 | YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG | |
142 | DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ | |
143 | DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA | |
144 | NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA | |
145 | CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN | |
146 | NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ | |
147 | SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS | |
148 | YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK | |
149 | HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL | |
150 | VDFSNKSNVNVGQVKDIHGRIPEML" | |
151 | gene complement(3300..4037) | |
152 | /gene="REV7" | |
153 | CDS complement(3300..4037) | |
154 | /gene="REV7" | |
155 | /codon_start=1 | |
156 | /product="Rev7p" | |
157 | /protein_id="AAA98667.1" | |
158 | /db_xref="GI:1293616" | |
159 | /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ | |
160 | FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD | |
161 | KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR | |
162 | RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK | |
163 | LISGDDKILNGVYSQYEEGESIFGSLF" | |
164 | ORIGIN | |
165 | 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg | |
166 | 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct | |
167 | 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa | |
168 | 181 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgaaa | |
169 | // |
0 | >NAME1 | |
1 | gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattg | |
2 | ccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagct | |
3 | ctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaa | |
4 | tgccatgactcagattctaattttaagctattcaatttctctttgatc | |
5 | >NAME2 | |
6 | gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattg | |
7 | ccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagct | |
8 | ctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaa | |
9 | tgccatgactcagattctaattttaagctattcaatttctctttgaaa |
0 | >ID | |
1 | A | |
2 | >ID | |
3 | AA | |
4 | >ID | |
5 | AAA | |
6 | >ID | |
7 | AAA | |
8 | A | |
9 | >ID | |
10 | AAA | |
11 | AA | |
12 | >ID | |
13 | AAA | |
14 | AAA | |
15 | >ID | |
16 | AAA | |
17 | AAA | |
18 | A |
0 | >one.p1k | |
1 | ACGT | |
2 | >one.q1k | |
3 | CCCC | |
4 | >two.p1k | |
5 | A | |
6 | >two.q1k | |
7 | C | |
8 | >one.p1k | |
9 | TTTTTTTTTT | |
10 | >three.q1k | |
11 | A | |
12 | >four.x | |
13 | T | |
14 | >five.p1k | |
15 | G |
Binary diff not shown
Binary diff not shown
0 | ##gff-version 3 | |
1 | # comment | |
2 | ##sequence-region seq1 1 10 | |
3 | seq1 . gene 3 7 . + . ID=gene1;name=name1 | |
4 | ##FASTA | |
5 | >seq1 | |
6 | ACGTACGTAC | |
7 | >seq2 | |
8 | ACGTACGTAC |
0 | ##gff-version 3 | |
1 | # comment | |
2 | ##sequence-region seq1 1 10 | |
3 | seq1 . gene 3 7 . + . ID=gene1;name=name1 | |
4 | >seq1 | |
5 | ACGTACGTAC | |
6 | >seq2 | |
7 | ACGTACGTAC |
0 | ##gff-version 3 | |
1 | # comment | |
2 | ##sequence-region seq1 1 10 | |
3 | seq1 . gene 3 7 . + . ID=gene1;name=name1 | |
4 | ##FASTA | |
5 | oops |
0 | ##gff-version 3 | |
1 | # comment | |
2 | ##sequence-region seq1 1 10 | |
3 | seq1 . gene 3 7 . + . ID=gene1;name=name1 |
0 | <?xml version="1.0"?> | |
1 | <trace_volume> | |
2 | <trace> | |
3 | <trace_name>ID</trace_name> | |
4 | <clip_quality_right>5</clip_quality_right> | |
5 | <clip_vector_left>1</clip_vector_left> | |
6 | </trace> | |
7 | <trace> | |
8 | <trace_name>ID</trace_name> | |
9 | <clip_quality_right>5</clip_quality_right> | |
10 | <clip_vector_left>1</clip_vector_left> | |
11 | </trace> | |
12 | </trace_volume> |
0 | >A | |
1 | ACG | |
2 | >B | |
3 | ACG | |
4 | >C | |
5 | ACG | |
6 | >D | |
7 | ACG | |
8 | >E | |
9 | ACG | |
10 | >F | |
11 | ACG | |
12 | >G | |
13 | ACG | |
14 | >H | |
15 | ACG | |
16 | >I | |
17 | ACG | |
18 | >J | |
19 | ACG | |
20 | >K | |
21 | ACG | |
22 | >L | |
23 | ACG | |
24 | >M | |
25 | ACG | |
26 | >N | |
27 | ACG | |
28 | >O | |
29 | ACG | |
30 | >P | |
31 | ACG | |
32 | >Q | |
33 | ACG | |
34 | >R | |
35 | ACG | |
36 | >S | |
37 | ACG | |
38 | >T | |
39 | ACG | |
40 | >U | |
41 | ACG | |
42 | >V | |
43 | ACG | |
44 | >W | |
45 | ACG | |
46 | >X | |
47 | ACG | |
48 | >Y | |
49 | ACG | |
50 | >Z | |
51 | ACG | |
52 | >A | |
53 | ACG | |
54 | >B | |
55 | ACG |
0 | i am not a fasta or fastq file |
0 | >1 | |
1 | GTATGACGACTTCTCGGTCAAAGGTAAGGTGAACAAGGGATTGAATGCTTAAATCCCGTG | |
2 | CCTACACTCAGTACCGGTGCTTGGCTGAAGCGTTCCTATGCAAGAATGAGAACTGGCAAC | |
3 | ACGTCGCGGCCAGCCCGGGACCATCAGGACCCGAACGTGTACCGCGAATGTTTACATTTC | |
4 | ACCCAGTTACCCGGATTCGGGCCAAAGCAGGAGAGCCTCTGAATTAGATGGTGCCACGTA | |
5 | AGTCTATTTTCGCACGTTTTATTGATTCAAGTGAGTGTCAACGTAGATTTATTGGTGCTT | |
6 | GGCTAAAGACGTATGGATCACGGGATGGAACATCTGGATCCCCCATGTACGTAAGTGTGT | |
7 | CGTCAAACAAAATTCTGTATCCCGTCGCTCCTGCCAGGGCAATCGCGGAGCTACGGACAT | |
8 | AGTCCTTAGTGAACTAATGATGATGAACATCTCGAACCAGGTTAACACGATACGATGAAG | |
9 | CGGGTTACTGAACACACTTAACAGGAGCCTGAGCAAATGTCATTTACAAAAGGTTTCTAG | |
10 | ACCCCCTTGGTAAGTCACTTGACACGTCTCATGCGGGGCCTACGGTAAACCAGATGCTAG | |
11 | AGTAGCGAACGGTGGGTGCGCAGGCATGTCCGGTCTCTCGATGGTGCACTTACGGACATC | |
12 | TCCCTATACAGATCTATTCAGTCACGAAGGTCAGCGAACATAACCCACGGGAGTTATCTC | |
13 | AACGAGTACGGGAGCGAACGGTGCACGGATCTGTCTTAGCTCAGAGGCGTCACGCGGTCC | |
14 | TATCTAACGCAAGAGCATGTGCCATTCCGGCCCTCTGATGTGCCTATGTACATAGAGCCG | |
15 | ACCCCGGCGGATTGGAGTCCCTAGCTACCGTCGACAGAGACGCAAAGACTCAATTGCTAT | |
16 | GTATATTGTTACTCTTCAACCACTGGAAAGACAAATAATTGCGGGCAAGTGCGTTACCCA | |
17 | TCACTCTGTTCTGTACACGAAAGGCTGAATAGCAAGTGGC |
0 | 1 fastaq CDS 28 222 . + . | |
1 | 1 fastaq CDS 45 227 . + . | |
2 | 1 fastaq CDS 49 171 . - . | |
3 | 1 fastaq CDS 110 241 . + . | |
4 | 1 fastaq CDS 144 266 . - . | |
5 | 1 fastaq CDS 228 422 . + . | |
6 | 1 fastaq CDS 278 433 . - . | |
7 | 1 fastaq CDS 287 478 . + . | |
8 | 1 fastaq CDS 289 519 . - . | |
9 | 1 fastaq CDS 563 703 . + . | |
10 | 1 fastaq CDS 601 759 . + . | |
11 | 1 fastaq CDS 606 818 . + . | |
12 | 1 fastaq CDS 819 938 . + . | |
13 | 1 fastaq CDS 836 988 . + . | |
14 | 1 fastaq CDS 865 999 . + . |
0 | 3 42 | |
1 | Turkey AA-CTNGGGC ATTTCAGGGT | |
2 | Salmo_gairAAGCCTTGGC AGTGCAGGGT | |
3 | H. SapiensACCGGTTGGC CGTTCAGGGT | |
4 | ||
5 | GAGCCCGGGC AATACAGGGT AT | |
6 | GAGCCGTGGC CGGGCACGGT AT | |
7 | ACAGGTTGGC CGTTCAGGGT AA |
0 | >Turkey | |
1 | AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT | |
2 | >Salmo_gair | |
3 | AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT | |
4 | >H. Sapiens | |
5 | ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA |
0 | 3 42 | |
1 | Turkey AA-CTNGGGC ATTTCAGGGT | |
2 | Salmo_gairAAGCCTTGGC AGTGCAGGGT | |
3 | H. SapiensACCGGTTGGC CGTTCAGGGT | |
4 | GAGCCCGGGC AATACAGGGT AT | |
5 | GAGCCGTGGC CGGGCACGGT AT | |
6 | ACAGGTTGGC CGTTCAGGGT AA |
0 | >Turkey | |
1 | AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT | |
2 | >Salmo_gair | |
3 | AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT | |
4 | >H. Sapiens | |
5 | ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA |
0 | 2 97 | |
1 | seq1 GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG | |
2 | seq2 AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA | |
3 | ||
4 | GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGT | |
5 | AAAAAAAAAA AAAAAAAAAA AAAAAAAAA- -AAAAAG |
0 | >seq1 | |
1 | GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG | |
2 | GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGT | |
3 | >seq2 | |
4 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
5 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG |
0 | 3 42 | |
1 | Turkey AA-CTNGGGC ATTTCAGGGT | |
2 | GAGCCCGGGC AATACAGGGT AT | |
3 | Salmo_gairAAGCCTTGGC AGTGCAGGGT | |
4 | GAGCCGTGGC CGGGCACGGT AT | |
5 | H. SapiensACCGGTTGGC CGTTCAGGGT | |
6 | ACAGGTTGGC CGTTCAGGGT AA |
0 | >Turkey | |
1 | AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT | |
2 | >Salmo_gair | |
3 | AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT | |
4 | >H. Sapiens | |
5 | ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA |
0 | seq1:1-4 seq1 0 |
0 | >seq1 | |
1 | A | |
2 | >seq2 | |
3 | NC | |
4 | >seq3 | |
5 | ACG | |
6 | >seq4 | |
7 | ACGT | |
8 | >seq5 | |
9 | NNNTA | |
10 | >seq6 | |
11 | ACGTAC | |
12 | >seq7 | |
13 | ACGTACG | |
14 | >seq8 | |
15 | ACGTACGT | |
16 | >seq9 | |
17 | ACGTACGTA |
0 | >seq1.1_1 | |
1 | A | |
2 | >seq2.1_2 | |
3 | NC | |
4 | >seq3.1_3 | |
5 | ACG | |
6 | >seq4.1_4 | |
7 | ACGT | |
8 | >seq5.1_3 | |
9 | NNN | |
10 | >seq5.4_5 | |
11 | TA | |
12 | >seq6.1_3 | |
13 | ACG | |
14 | >seq6.4_6 | |
15 | TAC | |
16 | >seq7.1_3 | |
17 | ACG | |
18 | >seq7.4_7 | |
19 | TACG | |
20 | >seq8.1_3 | |
21 | ACG | |
22 | >seq8.4_6 | |
23 | TAC | |
24 | >seq8.7_8 | |
25 | GT | |
26 | >seq9.1_3 | |
27 | ACG | |
28 | >seq9.4_6 | |
29 | TAC | |
30 | >seq9.7_9 | |
31 | GTA |
0 | >seq1.1_1 | |
1 | A | |
2 | >seq2.1_2 | |
3 | NC | |
4 | >seq3.1_3 | |
5 | ACG | |
6 | >seq4.1_4 | |
7 | ACGT | |
8 | >seq5.4_5 | |
9 | TA | |
10 | >seq6.1_3 | |
11 | ACG | |
12 | >seq6.4_6 | |
13 | TAC | |
14 | >seq7.1_3 | |
15 | ACG | |
16 | >seq7.4_7 | |
17 | TACG | |
18 | >seq8.1_3 | |
19 | ACG | |
20 | >seq8.4_6 | |
21 | TAC | |
22 | >seq8.7_8 | |
23 | GT | |
24 | >seq9.1_3 | |
25 | ACG | |
26 | >seq9.4_6 | |
27 | TAC | |
28 | >seq9.7_9 | |
29 | GTA |
0 | >seq | |
1 | GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA |
0 | >1 | |
1 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 | |
2 | 40 40 | |
3 | >2 | |
4 | 40 40 40 40 40 |
0 | >1 | |
1 | 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 | |
2 | 42 42 | |
3 | >2 | |
4 | 42 42 42 42 42 |
0 | >1_1_10 | |
1 | ACGCTCTCGA | |
2 | >1_6_15 | |
3 | CTCGAGCGCG | |
4 | >1_11_20 | |
5 | GCGCGAGCGC | |
6 | >1_16_25 | |
7 | AGCGCGAGCG | |
8 | >1_21_27 | |
9 | GAGCGAC |
0 | >1/1 | |
1 | 1234567890 | |
2 | >2/1 | |
3 | AACG123456789 | |
4 | >3/1 | |
5 | 1234567890 | |
6 | >4/1 | |
7 | AACG1234567890 | |
8 | >5/1 | |
9 | 1234567890 | |
10 | >6/1 | |
11 | AACG1234567890 | |
12 | >7/1 | |
13 | 123456789AGGC | |
14 | >8/1 | |
15 | 123456789 | |
16 | >9/1 | |
17 | 1234567890AGGC | |
18 | >10/1 | |
19 | AACG123456789CGTT | |
20 | >11/1 | |
21 | AACG1234567890CGTT | |
22 | >12/1 | |
23 | AACG1234567890CGTT |
0 | >1/1 | |
1 | 1234567890 | |
2 | >4/1 | |
3 | 1234567890 | |
4 | >5/1 | |
5 | 1234567890 | |
6 | >6/1 | |
7 | 1234567890 | |
8 | >9/1 | |
9 | 1234567890 | |
10 | >12/1 | |
11 | 1234567890 |
0 | >1/2 | |
1 | 1234567890 | |
2 | >2/2 | |
3 | 1234567890 | |
4 | >3/2 | |
5 | AACG123456789 | |
6 | >4/2 | |
7 | 1234567890 | |
8 | >5/2 | |
9 | AACG1234567890 | |
10 | >6/2 | |
11 | GCCT1234567890 | |
12 | >7/2 | |
13 | 1234567890 | |
14 | >8/2 | |
15 | 123456789AGGC | |
16 | >9/2 | |
17 | 1234567890CGTT | |
18 | >10/2 | |
19 | AACG1234567890CGTT | |
20 | >11/2 | |
21 | AACG123456789CGTT | |
22 | >12/2 | |
23 | AACG1234567890CGTT |
0 | >1/2 | |
1 | 1234567890 | |
2 | >4/2 | |
3 | 1234567890 | |
4 | >5/2 | |
5 | 1234567890 | |
6 | >6/2 | |
7 | 1234567890 | |
8 | >9/2 | |
9 | 1234567890 | |
10 | >12/2 | |
11 | 1234567890 |
0 | this is the contents of system call test file |
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import unittest | |
3 | from pyfastaq import intervals | |
4 | ||
5 | class TestIntervals(unittest.TestCase): | |
6 | def test_init(self): | |
7 | '''Throw error if try to construct genome_interval from a non-int, or end<start''' | |
8 | with self.assertRaises(intervals.Error): | |
9 | intervals.Interval('a', 1) | |
10 | with self.assertRaises(intervals.Error): | |
11 | intervals.Interval(1, 'a') | |
12 | with self.assertRaises(intervals.Error): | |
13 | intervals.Interval('a', 'a') | |
14 | with self.assertRaises(intervals.Error): | |
15 | intervals.Interval(3, 2) | |
16 | ||
17 | def test_comparisons(self): | |
18 | '''<, <=, == should work as expected''' | |
19 | self.assertTrue(intervals.Interval(1,2) < intervals.Interval(2,2)) | |
20 | self.assertTrue(intervals.Interval(1,2) <= intervals.Interval(2,2)) | |
21 | self.assertFalse(intervals.Interval(2,2) <= intervals.Interval(1,2)) | |
22 | self.assertFalse(intervals.Interval(2,2) < intervals.Interval(1,2)) | |
23 | self.assertFalse(intervals.Interval(2,2) < intervals.Interval(2,2)) | |
24 | self.assertTrue(intervals.Interval(1,2) == intervals.Interval(1,2)) | |
25 | self.assertFalse(intervals.Interval(1,2) == intervals.Interval(1,3)) | |
26 | self.assertTrue(intervals.Interval(1,2) != intervals.Interval(1,3)) | |
27 | self.assertFalse(intervals.Interval(1,2) != intervals.Interval(1,2)) | |
28 | ||
29 | def test_len(self): | |
30 | self.assertEqual(len(intervals.Interval(1,2)), 2) | |
31 | self.assertEqual(len(intervals.Interval(1,1)), 1) | |
32 | self.assertEqual(len(intervals.Interval(10,20)), 11) | |
33 | ||
34 | def test_intersects(self): | |
35 | '''Intersection of two intervals should do the right thing''' | |
36 | a = intervals.Interval(5, 10) | |
37 | no_intersect = [intervals.Interval(3, 4), | |
38 | intervals.Interval(11,20)] | |
39 | intersect = [intervals.Interval(3,5), | |
40 | intervals.Interval(3,6), | |
41 | intervals.Interval(9,12), | |
42 | intervals.Interval(10,12), | |
43 | intervals.Interval(6,7), | |
44 | intervals.Interval(1,20)] | |
45 | ||
46 | for i in no_intersect: | |
47 | self.assertFalse(a.intersects(i), 'shouldn\'t intersect: ' + str(a) + ', ' + str(i)) | |
48 | ||
49 | for i in intersect: | |
50 | self.assertTrue(a.intersects(i), 'should intersect: ' + str(a) + ', ' + str(i)) | |
51 | ||
52 | def test_contains(self): | |
53 | '''Check that contains() works as expected''' | |
54 | a = intervals.Interval(5, 10) | |
55 | not_contained = [intervals.Interval(1,2), | |
56 | intervals.Interval(4,5), | |
57 | intervals.Interval(4,10), | |
58 | intervals.Interval(4,11), | |
59 | intervals.Interval(5,11), | |
60 | intervals.Interval(1,2), | |
61 | intervals.Interval(9,11), | |
62 | intervals.Interval(10,11), | |
63 | intervals.Interval(11,20)] | |
64 | ||
65 | ||
66 | contained = [intervals.Interval(5,5), | |
67 | intervals.Interval(5,10), | |
68 | intervals.Interval(6,7), | |
69 | intervals.Interval(6,10), | |
70 | intervals.Interval(10,10)] | |
71 | ||
72 | for i in not_contained: | |
73 | self.assertFalse(a.contains(i), 'shouldn\'t contain: ' + str(a) + ', ' + str(i)) | |
74 | ||
75 | for i in contained: | |
76 | self.assertTrue(a.contains(i), 'should contain: ' + str(a) + ', ' + str(i)) | |
77 | ||
78 | def test_union(self): | |
79 | '''Union should either return None or the correct union''' | |
80 | a = intervals.Interval(5, 10) | |
81 | b = intervals.Interval(8, 15) | |
82 | c = intervals.Interval(12, 20) | |
83 | d = intervals.Interval(21,22) | |
84 | self.assertEqual(a.union(c), None) | |
85 | self.assertEqual(c.union(a), None) | |
86 | self.assertEqual(a.union(b), intervals.Interval(5,15)) | |
87 | self.assertEqual(b.union(a), intervals.Interval(5,15)) | |
88 | self.assertEqual(c.union(d), intervals.Interval(12,22)) | |
89 | self.assertEqual(d.union(c), intervals.Interval(12,22)) | |
90 | ||
91 | def test_union_flll_gap(self): | |
92 | '''union_fill_gap() should ignore intersections and return the maximum range of coords''' | |
93 | a = intervals.Interval(5, 10) | |
94 | b = intervals.Interval(8, 15) | |
95 | c = intervals.Interval(12, 20) | |
96 | d = intervals.Interval(21,22) | |
97 | self.assertEqual(a.union_fill_gap(c), intervals.Interval(5,20)) | |
98 | self.assertEqual(c.union_fill_gap(a), intervals.Interval(5,20)) | |
99 | self.assertEqual(a.union_fill_gap(b), intervals.Interval(5,15)) | |
100 | self.assertEqual(b.union_fill_gap(a), intervals.Interval(5,15)) | |
101 | self.assertEqual(c.union_fill_gap(d), intervals.Interval(12,22)) | |
102 | self.assertEqual(d.union_fill_gap(c), intervals.Interval(12,22)) | |
103 | ||
104 | ||
105 | def test_intersection(self): | |
106 | '''Intersection should either return None or the correct intersection''' | |
107 | a = intervals.Interval(5, 10) | |
108 | b = intervals.Interval(8, 15) | |
109 | c = intervals.Interval(12, 20) | |
110 | self.assertEqual(a.intersection(c), None) | |
111 | self.assertEqual(a.intersection(b), intervals.Interval(8,10)) | |
112 | ||
class Test_intersection(unittest.TestCase):
    def test_intersection(self):
        '''intersection() should correctly intersect two lists of intervals'''
        # build interval lists from (start, end) pairs for brevity
        def make(pairs):
            return [intervals.Interval(start, end) for start, end in pairs]

        a = make([(1, 2), (10, 20), (51, 52), (54, 55), (57, 58)])
        b = make([(5, 6), (9, 11), (13, 14), (17, 18), (20, 25), (50, 60)])
        c = make([(100, 200)])
        expected = make([(10, 11), (13, 14), (17, 18), (20, 20),
                         (51, 52), (54, 55), (57, 58)])

        # intersection is symmetric in its arguments
        self.assertSequenceEqual(intervals.intersection(a, b), expected)
        self.assertSequenceEqual(intervals.intersection(b, a), expected)
        # no overlap at all, or an empty input, gives an empty list
        self.assertSequenceEqual(intervals.intersection(c, a), [])
        self.assertEqual(intervals.intersection([], a), [])
        self.assertEqual(intervals.intersection(a, []), [])
144 | ||
class Test_merge_overlapping_in_list(unittest.TestCase):
    def test_merge_overlapping_in_list(self):
        '''merge_overlapping_in_list() merges correctly'''
        # unsorted input with duplicates and the overlapping chain 10-20, 20-30, 29-50
        to_merge = [intervals.Interval(start, end) for start, end in
                    [(1, 2), (51, 60), (10, 20), (20, 30), (20, 30), (29, 50), (65, 70)]]
        expected = [intervals.Interval(start, end) for start, end in
                    [(1, 2), (10, 60), (65, 70)]]

        # merging happens in place
        intervals.merge_overlapping_in_list(to_merge)
        self.assertSequenceEqual(to_merge, expected)
162 | ||
class Test_remove_contained_in_list(unittest.TestCase):
    def test_remove_contained_in_list(self):
        '''test_remove_contained_in_list removes the right elements of list'''
        # (start, end) pairs before removal; several are nested inside neighbours
        before = [(1, 2), (4, 4), (4, 5), (5, 6), (7, 9), (8, 10), (9, 11),
                  (20, 25), (20, 24), (20, 26),
                  (30, 38), (30, 37), (30, 36), (30, 35), (30, 35), (32, 33),
                  (38, 50), (65, 70), (67, 70)]
        # pairs expected to survive: anything contained in another interval is dropped
        after = [(1, 2), (4, 5), (5, 6), (7, 9), (8, 10), (9, 11),
                 (20, 26), (30, 38), (38, 50), (65, 70)]

        a = [intervals.Interval(start, end) for start, end in before]
        b = [intervals.Interval(start, end) for start, end in after]

        # removal happens in place
        intervals.remove_contained_in_list(a)
        self.assertSequenceEqual(a, b)
199 | ||
class Test_length_sum_from_list(unittest.TestCase):
    def test_length_sum_from_list(self):
        '''Test that total length of intervals is summed correctly'''
        # inclusive lengths are 2 + 2 + 10 = 14
        test_list = [intervals.Interval(start, end)
                     for start, end in [(1, 2), (4, 5), (10, 19)]]
        self.assertEqual(14, intervals.length_sum_from_list(test_list))
208 | ||
209 | ||
# Allow the test module to be run directly as a script
if __name__ == '__main__':
    unittest.main()
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import sys | |
3 | import filecmp | |
4 | import os | |
5 | import unittest | |
6 | from pyfastaq import sequences, utils, intervals, tasks | |
7 | ||
# Test data files live inside the installed pyfastaq package, next to the modules
modules_dir = os.path.dirname(os.path.abspath(sequences.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')

# Module-local exception class (not raised by the tests visible in this file)
class Error (Exception): pass

# Full-length sequences expected when parsing tests/data/sequences_test.embl,
# indexed by record number - 1; used by TestEmbl.test_get_next_from_embl_file
expected_embl = [
    'aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcattcacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgagtcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttgaaggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaatatccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgctacaaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctcttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaaatattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactctttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccggtgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagtgaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactaggaacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaacaggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgtataagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactggttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttgacttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcggcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatggttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttcacatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaaacatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatatgtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcctgcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtagaagaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcgttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactgtaatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaagatggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaactagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagttggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtgaagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtaccagaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatactttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    'aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcattcacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgagtcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttgaaggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaatatccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgctacaaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctcttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaaatattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactctttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccggtgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagtgaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactaggaacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaacaggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgtataagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactggttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttgacttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcggcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatggttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttcacatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaaacatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatatgtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcctgcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtagaagaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcgttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactgtaatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaagatggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaactagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagttggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtgaagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtaccagaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatactttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaccccccccc',
]
class TestFasta(unittest.TestCase):
    '''Tests for sequences.Fasta: construction, file parsing, and sequence
    manipulation (revcomp, gaps, ORFs, translation, etc.).

    Fix over the original: two leftover debug print() calls (in test_all_orfs
    and test_translate) have been removed so test runs stay quiet.'''

    def setUp(self):
        # Shared 5-base sequence used by several tests below
        self.fasta = sequences.Fasta('ID', 'ACGTA')

    def test_equality(self):
        '''== and != must compare both the ID and the sequence'''
        self.assertTrue(self.fasta == sequences.Fasta('ID', 'ACGTA'))
        self.assertFalse(self.fasta == sequences.Fasta('I', 'ACGTA'))
        self.assertFalse(self.fasta == sequences.Fasta('ID', 'ACGT'))
        self.assertFalse(self.fasta != sequences.Fasta('ID', 'ACGTA'))
        self.assertTrue(self.fasta != sequences.Fasta('I', 'ACGTA'))
        self.assertTrue(self.fasta != sequences.Fasta('ID', 'ACGT'))

    def test_init(self):
        '''__init__ should get the ID and sequence correctly'''
        self.assertEqual(self.fasta.id, 'ID')
        self.assertEqual(self.fasta.seq, 'ACGTA')

    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from OK, including weirdness in file'''
        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.fa'))
        fa = sequences.Fasta()
        counter = 1

        # the test file contains records named '1', '2', ... each with sequence ACGTA
        while fa.get_next_from_file(f_in):
            self.assertEqual(fa, sequences.Fasta(str(counter), 'ACGTA'))
            counter += 1

        utils.close(f_in)

    def test_get_id_from_header_line(self):
        '''Check that can get ID from header line or die properly'''
        self.assertEqual(sequences.Fasta._get_id_from_header_line(self.fasta, '>X'), 'X')
        # a header without the leading '>' must raise
        with self.assertRaises(sequences.Error):
            self.assertEqual(sequences.Fasta._get_id_from_header_line(self.fasta, 'X'), 'X')

    def test_getitem(self):
        '''getitem() should return the right subsequence'''
        seq = 'AACGTGTCA'
        fa = sequences.Fasta('x', seq)
        # indexing and slicing must behave exactly like on a plain string
        self.assertEqual(seq[1], fa[1])
        self.assertEqual(seq[0:2], fa[0:2])
        self.assertEqual(seq[1:], fa[1:])

    def test_len(self):
        '''len() should return the length of the sequence'''
        self.assertEqual(5, len(self.fasta))

    def test_subseq(self):
        '''Test subseq'''
        fa = sequences.Fasta('name', 'ACGTA')
        # None means "to the boundary of the sequence"
        self.assertEqual(fa.subseq(1, 4), sequences.Fasta('name', 'CGT'))
        self.assertEqual(fa.subseq(None, 4), sequences.Fasta('name', 'ACGT'))
        self.assertEqual(fa.subseq(1, None), sequences.Fasta('name', 'CGTA'))

    def test_print_line_length(self):
        '''__str__ should be formatted correctly with the right number of chars per line of sequence'''
        # line_length 0 means "whole sequence on one line"
        line_lengths = [0, 3]
        correct_files = [os.path.join(data_dir, x) for x in ['sequences_test_one-per-line.fa', 'sequences_test_3-per-line.fa']]

        for i in range(len(line_lengths)):
            seq_reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_one-per-line.fa'))
            sequences.Fasta.line_length = line_lengths[i]
            tmp_out = 'tmp.line_length_test.fa'
            f = utils.open_file_write(tmp_out)
            for s in seq_reader:
                print(s, file=f)
            utils.close(f)
            self.assertTrue(filecmp.cmp(correct_files[i], tmp_out))
            os.unlink(tmp_out)

        # restore the class-level default so later tests are unaffected
        sequences.Fasta.line_length = 60

    def test_strip_after_first_whitespace(self):
        '''Test strip_after_first_whitespace()'''
        seqs = [
            sequences.Fasta('name', 'A'),
            sequences.Fasta('name foo', 'A'),
            sequences.Fasta('name foo bar', 'A'),
            sequences.Fasta('name\tfoo', 'A'),
        ]

        for seq in seqs:
            seq.strip_after_first_whitespace()

        # everything after the first space or tab is dropped from the id
        for seq in seqs:
            self.assertEqual(seq.id, 'name')

    def test_strip_illumina_suffix(self):
        '''Check that /1 and /2 removed correctly from IDs'''
        seqs = [sequences.Fasta('name/1', 'A'),
                sequences.Fasta('name/2', 'A'),
                sequences.Fasta('name', 'A'),
                sequences.Fasta('name/1/2', 'A'),
                sequences.Fasta('name/2/1', 'A'),
                sequences.Fasta('name/3', 'A')]

        # only a single trailing /1 or /2 is stripped; /3 is untouched
        correct_names = ['name', 'name', 'name', 'name/1', 'name/2', 'name/3']

        for seq in seqs:
            seq.strip_illumina_suffix()

        for i in range(len(seqs)):
            self.assertEqual(seqs[i].id, correct_names[i])

    def test_revcomp(self):
        '''revcomp() should correctly reverse complement a sequence'''
        # case is preserved and N stays N
        fa = sequences.Fasta('ID', 'ACGTNacgtn')
        fa.revcomp()
        self.assertEqual(fa, sequences.Fasta('ID', 'nacgtNACGT'))

    def test_gaps(self):
        '''gaps() should find the gaps in a sequence correctly'''
        test_seqs = [sequences.Fasta('ID', 'ACGT'),
                     sequences.Fasta('ID', 'NACGT'),
                     sequences.Fasta('ID', 'NACGTN'),
                     sequences.Fasta('ID', 'ANNCGT'),
                     sequences.Fasta('ID', 'NANNCGTNN')]

        # gaps are runs of N, reported as 0-based inclusive intervals
        correct_gaps = [[],
                        [intervals.Interval(0, 0)],
                        [intervals.Interval(0, 0), intervals.Interval(5, 5)],
                        [intervals.Interval(1, 2)],
                        [intervals.Interval(0, 0), intervals.Interval(2, 3), intervals.Interval(7, 8)]]

        for i in range(len(test_seqs)):
            gaps = test_seqs[i].gaps()
            self.assertListEqual(correct_gaps[i], gaps)

    def test_contig_coords(self):
        '''contig_coords() should get the coords of all contigs in a sequence correctly'''
        test_seqs = [sequences.Fasta('ID', 'ACGT'),
                     sequences.Fasta('ID', 'NACGT'),
                     sequences.Fasta('ID', 'NNACGT'),
                     sequences.Fasta('ID', 'ACGTN'),
                     sequences.Fasta('ID', 'ACGTNN'),
                     sequences.Fasta('ID', 'NANNCGT'),
                     sequences.Fasta('ID', 'ACNNNGTNA'),
                     sequences.Fasta('ID', 'ANNCGTNNAAAAA')]

        # contigs are the runs of non-N bases between (and around) the gaps
        correct_coords = [[intervals.Interval(0, 3)],
                          [intervals.Interval(1, 4)],
                          [intervals.Interval(2, 5)],
                          [intervals.Interval(0, 3)],
                          [intervals.Interval(0, 3)],
                          [intervals.Interval(1, 1), intervals.Interval(4, 6)],
                          [intervals.Interval(0, 1), intervals.Interval(5, 6), intervals.Interval(8, 8)],
                          [intervals.Interval(0, 0), intervals.Interval(3, 5), intervals.Interval(8, 12)]]

        for i in range(len(test_seqs)):
            gaps = test_seqs[i].contig_coords()
            self.assertListEqual(correct_coords[i], gaps)

    def test_orfs(self):
        '''Test orfs()'''
        # each case: (sequence, frame, revcomp?, expected ORF intervals)
        test_seqs = [(sequences.Fasta('ID', 'AAACCCGG'), 0, False, [intervals.Interval(0, 5)]),
                     (sequences.Fasta('ID', 'AAAACCCGG'), 1, False, [intervals.Interval(1, 6)]),
                     (sequences.Fasta('ID', 'AAAAACCCGG'), 2, False, [intervals.Interval(2, 7)]),
                     (sequences.Fasta('ID', 'CCGGGTTT'), 0, True, [intervals.Interval(2, 7)]),
                     (sequences.Fasta('ID', 'CCGGGTTTT'), 1, True, [intervals.Interval(2, 7)]),
                     (sequences.Fasta('ID', 'CCGGGTTTTT'), 2, True, [intervals.Interval(2, 7)]),
                     (sequences.Fasta('ID', 'AAACCCTGA'), 0, False, [intervals.Interval(0, 8)]),
                     (sequences.Fasta('ID', 'AAACCCTGATAG'), 0, False, [intervals.Interval(0, 8)]),
                     (sequences.Fasta('ID', 'AAACCCTGA'), 1, False, [intervals.Interval(1, 6)]),
                     (sequences.Fasta('ID', ''), 0, False, []),
                     (sequences.Fasta('ID', 'A'), 0, False, []),
                     (sequences.Fasta('ID', 'AA'), 0, False, []),
                     (sequences.Fasta('ID', 'AAA'), 0, False, [intervals.Interval(0, 2)]),
                     (sequences.Fasta('ID', 'AAAAAA'), 0, False, [intervals.Interval(0, 5)]),
                     (sequences.Fasta('ID', 'AAA'), 1, False, []),
                     (sequences.Fasta('ID', 'AAA'), 2, False, []),
                     (sequences.Fasta('ID', 'AAA'), 0, True, [intervals.Interval(0, 2)]),
                     (sequences.Fasta('ID', 'AAA'), 1, True, []),
                     (sequences.Fasta('ID', 'AAA'), 2, True, []),
                     (sequences.Fasta('ID', 'TAA'), 0, False, []),
                     (sequences.Fasta('ID', 'CTA'), 0, True, [])]

        for t in test_seqs:
            orfs = t[0].orfs(frame=t[1], revcomp=t[2])
            self.assertListEqual(orfs, t[3])

    def test_all_orfs(self):
        '''Test all_orfs()'''
        d = {}
        tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_orfs.fa'), d)
        seq = d['1']
        orfs = seq.all_orfs(min_length=120)
        # expected (coords, on_reverse_strand) pairs for the test sequence
        expected = [
            (intervals.Interval(27, 221), False),
            (intervals.Interval(44, 226), False),
            (intervals.Interval(48, 170), True),
            (intervals.Interval(109, 240), False),
            (intervals.Interval(143, 265), True),
            (intervals.Interval(227, 421), False),
            (intervals.Interval(277, 432), True),
            (intervals.Interval(286, 477), False),
            (intervals.Interval(288, 518), True),
            (intervals.Interval(562, 702), False),
            (intervals.Interval(600, 758), False),
            (intervals.Interval(605, 817), False),
            (intervals.Interval(818, 937), False),
            (intervals.Interval(835, 987), False),
            (intervals.Interval(864, 998), False)
        ]

        self.assertEqual(len(orfs), len(expected))

        for i in range(len(orfs)):
            self.assertEqual(orfs[i][0], expected[i][0])
            self.assertEqual(orfs[i][1], expected[i][1])

    def test_is_all_Ns(self):
        '''Test is_all_Ns()'''
        self.assertTrue(sequences.Fasta('ID', 'n').is_all_Ns())
        self.assertTrue(sequences.Fasta('ID', 'N').is_all_Ns())
        self.assertTrue(sequences.Fasta('ID', 'nNn').is_all_Ns())
        self.assertFalse(sequences.Fasta('ID', 'a').is_all_Ns())
        # an empty sequence is not "all Ns"
        self.assertFalse(sequences.Fasta('ID', '').is_all_Ns())
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns())
        self.assertFalse(sequences.Fasta('ID', 'naN').is_all_Ns())
        # start/end restrict the check to an inclusive sub-range
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=0, end=0))
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=0, end=1))
        self.assertTrue(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=1))
        self.assertTrue(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=2))
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1))
        self.assertTrue(sequences.Fasta('ID', 'anN').is_all_Ns(start=1))
        self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(end=1))
        self.assertTrue(sequences.Fasta('ID', 'nNA').is_all_Ns(end=1))

        # start > end is an error
        with self.assertRaises(sequences.Error):
            sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=0)

    def test_trim_Ns(self):
        '''trim_Ns() should do the right trimming of a sequence'''
        fa = sequences.Fasta('ID', 'ANNANA')
        # leading/trailing Ns (any case) go; internal Ns stay
        test_seqs = [sequences.Fasta('ID', 'ANNANA'),
                     sequences.Fasta('ID', 'NANNANA'),
                     sequences.Fasta('ID', 'NANNANAN'),
                     sequences.Fasta('ID', 'ANNANAN'),
                     sequences.Fasta('ID', 'NNNNNNANNANAN'),
                     sequences.Fasta('ID', 'NNANNANANn')]

        for s in test_seqs:
            s.trim_Ns()
            self.assertEqual(fa, s)

    def test_add_insertions(self):
        '''Test add_insertions'''
        fa = sequences.Fasta('X', 'acgtacgtacgt')
        # test=True inserts a deterministic 'N' instead of random bases
        fa.add_insertions(skip=4, window=0, test=True)
        self.assertEqual(fa, sequences.Fasta('X', 'acgtNacgtNacgt'))

    def test_replace_bases(self):
        '''Check that bases get replaced correctly'''
        fa = sequences.Fasta('X', 'AUCGTUUACT')
        fa.replace_bases('U', 'T')
        self.assertEqual(fa, sequences.Fasta('X', 'ATCGTTTACT'))

    def test_replace_interval(self):
        '''Test replace_interval()'''
        # replacement at the start, end, and middle of a Fasta
        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(0, 0, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', 'NEWCGTA'))

        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(4, 4, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', 'ACGTNEW'))

        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(2, 3, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', 'ACNEWA'))

        # bad coordinates (reversed, or out of range) must raise
        fa = sequences.Fasta('ID', 'ACGTA')
        with self.assertRaises(sequences.Error):
            fa.replace_interval(3, 2, 'x')
        with self.assertRaises(sequences.Error):
            fa.replace_interval(1, 5, 'x')
        with self.assertRaises(sequences.Error):
            fa.replace_interval(5, 10, 'x')

        # Fastq version must replace qualities in step with the bases
        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(0, 0, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', 'NEWCGTA', 'IIIBCDE'))

        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(4, 4, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', 'ACGTNEW', 'ABCDIII'))

        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(2, 3, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', 'ACNEWA', 'ABIIIE'))

        # replacement seq and quality of different lengths must raise
        with self.assertRaises(sequences.Error):
            fq.replace_interval(1, 1, 'x', 'xx')

    def test_search_string(self):
        '''Check that search_string() finds all the hits'''
        fa = sequences.Fasta('X', 'AAA')
        hits = fa.search('G')
        self.assertTrue(len(hits) == 0)
        hits = fa.search('AAA')
        self.assertListEqual(hits, [(0, '+')])
        # overlapping matches are all reported
        hits = fa.search('AA')
        self.assertListEqual(hits, [(0, '+'), (1, '+')])
        # reverse-strand hits are reported with '-'
        hits = fa.search('TTT')
        self.assertListEqual(hits, [(0, '-')])

    def test_to_Fastq(self):
        '''Check to_Fastq converts OK, including out of range quality scores'''
        fa = sequences.Fasta('X', 'AAAAA')
        # scores below 0 clip to '!' and above 93 clip to '~'
        quals = [-1, 0, 40, 93, 94]
        self.assertEqual(sequences.Fastq('X', 'AAAAA', '!!I~~'), fa.to_Fastq(quals))
        # quality list length must match the sequence length
        with self.assertRaises(sequences.Error):
            fa.to_Fastq('AAAAAAAAAAAAA')

    def test_translate(self):
        '''Test nucleotide -> amino acid conversion works on Fasta'''
        fa = sequences.Fasta('ID', 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA')
        self.assertEqual(sequences.Fasta('ID', 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'), fa.translate())
        self.assertEqual(sequences.Fasta('ID', 'QPRLEGDAGVTMTIAVKSNREAGVTI*SFYCYSCLKRCSFHPRLAVHPRLQPRLGTM*SWFNS'), fa.translate(frame=1))
        self.assertEqual(sequences.Fasta('ID', 'SRG*KATPA*Q*RLL*RATGRRGSPYNHFIATPA*KDVLSTPA*QFILVYNHDLVLCSRGLIV'), fa.translate(frame=2))

    def test_expand_nucleotides(self):
        '''Test expand_nucleotides'''
        # each ambiguity code expands to every matching unambiguous sequence,
        # with '.1', '.2', ... appended to the original id
        tests = [
            (sequences.Fasta('1', 'A'), [sequences.Fasta('1.1', 'A')]),
            (sequences.Fasta('2', 'C'), [sequences.Fasta('2.1', 'C')]),
            (sequences.Fasta('3', 'G'), [sequences.Fasta('3.1', 'G')]),
            (sequences.Fasta('4', 'T'), [sequences.Fasta('4.1', 'T')]),
            (sequences.Fasta('6', 'R'), [sequences.Fasta('6.1', 'A'), sequences.Fasta('6.2', 'G')]),
            (sequences.Fasta('7', 'Y'), [sequences.Fasta('7.1', 'C'), sequences.Fasta('7.2', 'T')]),
            (sequences.Fasta('8', 'S'), [sequences.Fasta('8.1', 'C'), sequences.Fasta('8.2', 'G')]),
            (sequences.Fasta('9', 'W'), [sequences.Fasta('9.1', 'A'), sequences.Fasta('9.2', 'T')]),
            (sequences.Fasta('10', 'K'), [sequences.Fasta('10.1', 'G'), sequences.Fasta('10.2', 'T')]),
            (sequences.Fasta('11', 'M'), [sequences.Fasta('11.1', 'A'), sequences.Fasta('11.2', 'C')]),
            (sequences.Fasta('12', 'B'), [sequences.Fasta('12.1', 'C'), sequences.Fasta('12.2', 'G'), sequences.Fasta('12.3', 'T')]),
            (sequences.Fasta('13', 'D'), [sequences.Fasta('13.1', 'A'), sequences.Fasta('13.2', 'G'), sequences.Fasta('13.3', 'T')]),
            (sequences.Fasta('14', 'H'), [sequences.Fasta('14.1', 'A'), sequences.Fasta('14.2', 'C'), sequences.Fasta('14.3', 'T')]),
            (sequences.Fasta('15', 'V'), [sequences.Fasta('15.1', 'A'), sequences.Fasta('15.2', 'C'), sequences.Fasta('15.3', 'G')]),
            (sequences.Fasta('16', 'N'), [sequences.Fasta('16.1', 'A'), sequences.Fasta('16.2', 'C'), sequences.Fasta('16.3', 'G'), sequences.Fasta('16.4', 'T')]),
            (sequences.Fasta('17', 'ART'), [sequences.Fasta('17.1', 'AAT'), sequences.Fasta('17.2', 'AGT')]),
            (sequences.Fasta('18', 'ARRT'), [sequences.Fasta('18.1', 'AAAT'), sequences.Fasta('18.2', 'AAGT'), sequences.Fasta('18.3', 'AGAT'), sequences.Fasta('18.4', 'AGGT')]),
            (sequences.Fasta('19', 'ARTR'), [sequences.Fasta('19.1', 'AATA'), sequences.Fasta('19.2', 'AATG'), sequences.Fasta('19.3', 'AGTA'), sequences.Fasta('19.4', 'AGTG')]),
            (sequences.Fastq('20', 'ART', 'GHI'), [sequences.Fastq('20.1', 'AAT', 'GHI'), sequences.Fastq('20.2', 'AGT', 'GHI')]),
        ]

        for t in tests:
            self.assertListEqual(t[0].expand_nucleotides(), t[1])

    def test_split_capillary_id(self):
        '''Tests that we get information from a sanger capillary read name OK'''
        ids = ['abcde.p1k', 'abcde.x.p1k', 'abcde.p1ka', 'abcde.q1k', 'abcde.w2k']
        # p* suffixes are forward reads, q* reverse, anything else unknown
        expected = [{'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1k'},
                    {'prefix': 'abcde.x', 'dir': 'fwd', 'suffix': 'p1k'},
                    {'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1ka'},
                    {'prefix': 'abcde', 'dir': 'rev', 'suffix': 'q1k'},
                    {'prefix': 'abcde', 'dir': 'unk', 'suffix': 'w2k'}]

        for i in range(len(ids)):
            fa = sequences.Fasta(ids[i], 'A')
            self.assertEqual(fa.split_capillary_id(), expected[i])

        # an id with no capillary suffix must raise
        with self.assertRaises(sequences.Error):
            fa = sequences.Fasta('name', 'A')
            fa.split_capillary_id()
390 | ||
class TestEmbl(unittest.TestCase):
    '''Tests for reading EMBL and GenBank format files via sequences.Embl'''

    def test_get_id_from_header_line(self):
        '''Test get id from header line of EMBL'''
        embl = sequences.Embl('ID', 'ACGT')
        # EMBL 'ID' lines and GenBank 'LOCUS' lines both yield the record name
        self.assertEqual(embl._get_id_from_header_line('ID X; blah'), 'X')
        self.assertEqual(embl._get_id_from_header_line('LOCUS X foo'), 'X')
        # malformed header lines must raise
        for bad_line in ['ID X;', 'XX X;']:
            with self.assertRaises(sequences.Error):
                self.assertEqual(embl._get_id_from_header_line(bad_line), 'X')

    def test_get_next_from_embl_file(self):
        '''Records read from an EMBL test file should match expected_embl'''
        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.embl'))
        embl = sequences.Embl()
        record_number = 0

        # records are named seq1, seq2, ... in the test file
        while embl.get_next_from_file(f_in):
            record_number += 1
            self.assertEqual(embl, sequences.Fasta('seq' + str(record_number), expected_embl[record_number - 1]))

        utils.close(f_in)

    def test_get_next_from_gbk_file(self):
        '''Records read from a GenBank test file should match the expected sequences'''
        expected = [
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa']

        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.gbk'))
        embl = sequences.Embl()
        record_number = 0

        # records are named NAME1, NAME2, ... in the test file
        while embl.get_next_from_file(f_in):
            record_number += 1
            self.assertEqual(embl, sequences.Fasta('NAME' + str(record_number), expected[record_number - 1]))

        utils.close(f_in)
429 | ||
430 | class TestFastq(unittest.TestCase): | |
    def setUp(self):
        # Shared 5-base read with uniform quality, used by the tests below
        self.fastq = sequences.Fastq('ID', 'ACGTA', 'IIIII')
433 | ||
434 | def test_init(self): | |
435 | '''__init__ should get the ID, sequence and quality correctly''' | |
436 | self.assertEqual(self.fastq.id, 'ID') | |
437 | self.assertEqual(self.fastq.seq, 'ACGTA') | |
438 | self.assertEqual(self.fastq.qual, 'IIIII') | |
439 | ||
440 | def test_init_length_mismatch(self): | |
441 | '''__init__ should raise an error when length of seq and quality not the same''' | |
442 | with self.assertRaises(sequences.Error): | |
443 | sequences.Fastq('X', 'A', 'II') | |
444 | ||
445 | def test_get_next_from_file(self): | |
446 | '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file''' | |
447 | bad_files = ['sequences_test_fail_no_AT.fq', | |
448 | 'sequences_test_fail_no_seq.fq', | |
449 | 'sequences_test_fail_no_plus.fq', | |
450 | 'sequences_test_fail_no_qual.fq'] | |
451 | ||
452 | bad_files = [os.path.join(data_dir, x) for x in bad_files] | |
453 | ||
454 | for fname in bad_files: | |
455 | f_in = utils.open_file_read(fname) | |
456 | fq = sequences.Fastq() | |
457 | with self.assertRaises(sequences.Error): | |
458 | while fq.get_next_from_file(f_in): | |
459 | pass | |
460 | ||
461 | utils.close(f_in) | |
462 | ||
463 | fname = os.path.join(data_dir, 'sequences_test_good_file.fq') | |
464 | try: | |
465 | f_in = open(fname) | |
466 | except IOError: | |
467 | print("Error opening '" + fname + "'", file=sys.stderr) | |
468 | sys.exit(1) | |
469 | ||
470 | fq = sequences.Fastq() | |
471 | while fq.get_next_from_file(f_in): | |
472 | self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII')) | |
473 | utils.close(f_in) | |
474 | ||
475 | def test_subseq(self): | |
476 | '''Test subseq''' | |
477 | fq = sequences.Fastq('name', 'ACGTA', 'FGHIJ') | |
478 | self.assertEqual(fq.subseq(1,4), sequences.Fastq('name', 'CGT', 'GHI')) | |
479 | self.assertEqual(fq.subseq(None,4), sequences.Fastq('name', 'ACGT', 'FGHI')) | |
480 | self.assertEqual(fq.subseq(1,None), sequences.Fastq('name', 'CGTA', 'GHIJ')) | |
481 | ||
482 | def test_revcomp(self): | |
483 | '''revcomp() should correctly reverse complement a sequence''' | |
484 | fq = sequences.Fastq('ID', 'ACGTNacgtn', '1234567890') | |
485 | fq.revcomp() | |
486 | self.assertEqual(fq, sequences.Fastq('ID', 'nacgtNACGT', '0987654321')) | |
487 | ||
488 | def test_trim_Ns(self): | |
489 | '''trim_Ns() should do the right trimming of a fastq sequence''' | |
490 | fq = sequences.Fastq('ID', 'ANNANA', '111111') | |
491 | test_seqs = [sequences.Fastq('ID', 'ANNANA', '111111'), | |
492 | sequences.Fastq('ID', 'NANNANA', '1111111'), | |
493 | sequences.Fastq('ID', 'NANNANAN', '11111111'), | |
494 | sequences.Fastq('ID', 'ANNANAN', '1111111'), | |
495 | sequences.Fastq('ID', 'NNNNNNANNANAN', '1111111111111'), | |
496 | sequences.Fastq('ID', 'NNANNANANn', '1111111111')] | |
497 | ||
498 | for s in test_seqs: | |
499 | s.trim_Ns() | |
500 | self.assertEqual(fq, s) | |
501 | ||
502 | def test_trim(self): | |
503 | '''trim() should trim the right number of bases off start and end''' | |
504 | fq = sequences.Fastq('ID', '1234567890', '1234567890') | |
505 | fq.trim(0, 0) | |
506 | self.assertEqual(fq, sequences.Fastq('ID', '1234567890', '1234567890')) | |
507 | ||
508 | fq = sequences.Fastq('ID', '1234567890', '1234567890') | |
509 | fq.trim(1, 0) | |
510 | self.assertEqual(fq, sequences.Fastq('ID', '234567890', '234567890')) | |
511 | ||
512 | fq = sequences.Fastq('ID', '1234567890', '1234567890') | |
513 | fq.trim(0, 1) | |
514 | self.assertEqual(fq, sequences.Fastq('ID', '123456789', '123456789')) | |
515 | ||
516 | fq = sequences.Fastq('ID', '1234567890', '1234567890') | |
517 | fq.trim(2, 2) | |
518 | self.assertEqual(fq, sequences.Fastq('ID', '345678', '345678')) | |
519 | ||
520 | def test_to_Fasta_and_qual(self): | |
521 | '''Check to_Fasta_and_qual converts quality scores correctly''' | |
522 | fq = sequences.Fastq('ID', 'ACGT', '>ADI') | |
523 | (fa, qual) = fq.to_Fasta_and_qual() | |
524 | self.assertEqual(fa, sequences.Fasta('ID', 'ACGT')) | |
525 | self.assertListEqual(qual, [29, 32, 35, 40]) | |
526 | ||
527 | ||
528 | def test_translate(self): | |
529 | '''Test nucleatide -> amino acid conversion works on Fasta''' | |
530 | fq = sequences.Fastq('ID', 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII') | |
531 | ||
532 | self.assertEqual(sequences.Fastq('ID', 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII'), fq.translate()) | |
533 | ||
class TestFileReader(unittest.TestCase):
    '''Tests for sequences.file_reader over every supported input format.'''

    def test_file_reader_fasta(self):
        '''file_reader should iterate through a fasta file correctly'''
        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test.fa'))
        counter = 1
        for seq in reader:
            self.assertEqual(seq, sequences.Fasta(str(counter), 'ACGTA'))
            counter += 1

    def test_file_reader_fastq(self):
        '''file_reader should iterate through a fastq file correctly'''
        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_good_file.fq'))
        for seq in reader:
            self.assertEqual(seq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))

    def test_file_reader_bad_format(self):
        '''file_reader should die properly when not given fasta or fastq file'''
        with self.assertRaises(sequences.Error):
            reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_not_a_fastaq_file'))
            for seq in reader:
                pass

    def test_file_reader_gff(self):
        '''Test read gff file'''
        good_files = [
            'sequences_test_gffv3.gff',
            'sequences_test_gffv3.no_FASTA_line.gff'
        ]
        good_files = [os.path.join(data_dir, x) for x in good_files]

        for f in good_files:
            reader = sequences.file_reader(f)
            counter = 1
            for seq in reader:
                self.assertEqual(seq, sequences.Fasta('seq' + str(counter), 'ACGTACGTAC'))
                counter += 1

        # Files with no sequence section must raise, not yield nothing.
        bad_files = [
            'sequences_test_gffv3.no_seq.gff',
            'sequences_test_gffv3.no_seq.2.gff'
        ]
        bad_files = [os.path.join(data_dir, x) for x in bad_files]

        for filename in bad_files:
            with self.assertRaises(sequences.Error):
                reader = sequences.file_reader(filename)
                for seq in reader:
                    pass

    def test_file_reader_embl(self):
        '''Test read embl file'''
        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test.embl'))

        counter = 1
        for seq in reader:
            self.assertEqual(seq, sequences.Fasta('seq' + str(counter), expected_embl[counter-1]))
            counter += 1

        bad_files = [
            'sequences_test.embl.bad',
            'sequences_test.embl.bad2',
        ]
        bad_files = [os.path.join(data_dir, x) for x in bad_files]

        for filename in bad_files:
            with self.assertRaises(sequences.Error):
                reader = sequences.file_reader(filename)
                for seq in reader:
                    pass

    def test_file_reader_phylip(self):
        '''Test read phylip file'''
        test_files = [
            'sequences_test_phylip.interleaved',
            'sequences_test_phylip.interleaved2',
            'sequences_test_phylip.sequential'
        ]

        test_files = [os.path.join(data_dir, f) for f in test_files]

        expected_seqs = [
            sequences.Fasta('Turkey', 'AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT'),
            sequences.Fasta('Salmo_gair', 'AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT'),
            sequences.Fasta('H. Sapiens', 'ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA')
        ]

        for fname in test_files:
            reader = sequences.file_reader(fname)
            i = 0
            for seq in reader:
                self.assertEqual(expected_seqs[i], seq)
                i += 1

        # files made by seaview are a little different in the first line.
        # Test one of these
        expected_seqs = [
            sequences.Fasta('seq1', 96 * 'G' + 'T'),
            sequences.Fasta('seq2', 94 * 'A' + 'G')
        ]

        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_phylip.made_by_seaview'))
        i = 0
        for seq in reader:
            # Removed a stray debug print(seq) that polluted test output.
            self.assertEqual(expected_seqs[i], seq)
            i += 1
641 | ||
class TestOther(unittest.TestCase):
    def test_orfs_from_aa_seq(self):
        '''Test _orfs_from_aa_seq()'''
        # Each case pairs an amino acid string with the ORF coordinates
        # (inclusive intervals between stop codons) expected from it.
        cases = [
            ('', []),
            ('*', []),
            ('**', []),
            ('A', [intervals.Interval(0, 0)]),
            ('A*A*A', [intervals.Interval(0, 1), intervals.Interval(2, 3), intervals.Interval(4, 4)]),
            ('AB**CDE*AB', [intervals.Interval(0, 2), intervals.Interval(4, 7), intervals.Interval(8, 9)]),
            ('*ABCDE*', [intervals.Interval(1, 6)]),
            ('**ABCDE**', [intervals.Interval(2, 7)]),
        ]

        for aa_seq, expected_orfs in cases:
            self.assertListEqual(expected_orfs, sequences._orfs_from_aa_seq(aa_seq))
666 | ||
667 | ||
if __name__ == '__main__':
    # Discover and run every TestCase in this module when executed directly.
    unittest.main()
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import sys | |
3 | import filecmp | |
4 | import os | |
5 | import unittest | |
6 | from pyfastaq import tasks, sequences | |
7 | ||
8 | modules_dir = os.path.dirname(os.path.abspath(sequences.__file__)) | |
9 | data_dir = os.path.join(modules_dir, 'tests', 'data') | |
10 | ||
class Error(Exception):
    '''Base exception type for errors raised in this test module.'''
    pass
12 | ||
class TestCafToFastq(unittest.TestCase):
    def test_caf_to_fastq_default(self):
        '''Test caf_to_fastq with no filtering'''
        outfile = 'tmp.fq'
        tasks.caf_to_fastq(os.path.join(data_dir, 'caf_test.caf'), outfile)
        expected = os.path.join(data_dir, 'caf_test.to_fastq.no_trim.min_length_0.fq')
        self.assertTrue(filecmp.cmp(expected, outfile, shallow=False))
        os.unlink(outfile)

    def test_caf_to_fastq_trim_and_min_length(self):
        '''Test caf_to_fastq with trimming and min_length'''
        outfile = 'tmp.fq'
        tasks.caf_to_fastq(os.path.join(data_dir, 'caf_test.caf'), outfile, trim=True, min_length=6)
        expected = os.path.join(data_dir, 'caf_test.to_fastq.trim.min_length_6.fq')
        self.assertTrue(filecmp.cmp(expected, outfile, shallow=False))
        os.unlink(outfile)
27 | ||
28 | ||
class TestCapillaryToPairs(unittest.TestCase):
    def test_capillary_to_pairs(self):
        '''Check that capillary reads file converted to paired and unpaired'''
        tmp_prefix = 'tmp.cap_to_pairs'
        tasks.capillary_to_pairs(os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa'), tmp_prefix)
        # Output order is hash-dependent, so compare file contents as
        # dictionaries instead of byte-for-byte.
        expected_paired = {}
        expected_unpaired = {}
        tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa.paired.gz'), expected_paired)
        tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa.unpaired.gz'), expected_unpaired)
        got_paired = {}
        got_unpaired = {}
        tasks.file_to_dict(tmp_prefix + '.paired.gz', got_paired)
        tasks.file_to_dict(tmp_prefix + '.unpaired.gz', got_unpaired)
        self.assertDictEqual(got_paired, expected_paired)
        self.assertDictEqual(got_unpaired, expected_unpaired)
        os.unlink(tmp_prefix + '.paired.gz')
        os.unlink(tmp_prefix + '.unpaired.gz')
48 | ||
49 | ||
class TestDeinterleave(unittest.TestCase):
    def test_deinterleave(self):
        '''deinterleave should deal with an interleaved file correctly'''
        tmp_1 = 'tmp.deinterleaved_1.fa'
        tmp_2 = 'tmp.deinterleaved_2.fa'
        expected_1 = os.path.join(data_dir, 'sequences_test_deinterleaved_1.fa')
        expected_2 = os.path.join(data_dir, 'sequences_test_deinterleaved_2.fa')

        tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved.fa'), tmp_1, tmp_2)
        self.assertTrue(filecmp.cmp(expected_1, tmp_1))
        self.assertTrue(filecmp.cmp(expected_2, tmp_2))

        # fastq input with fasta_out=True should give the same fasta output
        tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved.fq'), tmp_1, tmp_2, fasta_out=True)
        self.assertTrue(filecmp.cmp(expected_1, tmp_1))
        self.assertTrue(filecmp.cmp(expected_2, tmp_2))

        with self.assertRaises(tasks.Error):
            tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved_bad.fa'), tmp_1, tmp_2)
        os.unlink(tmp_1)
        os.unlink(tmp_2)
67 | ||
68 | ||
class TestEnumerateNames(unittest.TestCase):
    def test_enumerate_names(self):
        '''Test enumerate_names works with all options'''
        infile = os.path.join(data_dir, 'sequences_test_enumerate_names.fa')
        outfile = 'tmp.enumerate_seqs.fa'
        rename_out = outfile + '.rename'

        tasks.enumerate_names(infile, outfile)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1'), outfile))

        tasks.enumerate_names(infile, outfile, rename_file=rename_out)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1'), outfile))
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1.rename_file'), rename_out))

        tasks.enumerate_names(infile, outfile, start_index=2)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.2'), outfile))

        tasks.enumerate_names(infile, outfile, keep_illumina_suffix=True)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.keep_suffix'), outfile))

        tasks.enumerate_names(infile, outfile, suffix='.SUFFIX')
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.add_suffix'), outfile, shallow=False))
        os.unlink(outfile)
        os.unlink(rename_out)
88 | ||
89 | ||
class TestExpandNucleotides(unittest.TestCase):
    def test_expand_nucleoties(self):
        '''Test expand_nucleotides'''
        tmp = 'tmp.expanded'
        # NOTE: 'expend' in the data file names is a typo in the test data.
        for extension in ('fq', 'fa'):
            infile = os.path.join(data_dir, 'tasks_test_expend_nucleotides.in.' + extension)
            expected = os.path.join(data_dir, 'tasks_test_expend_nucleotides.out.' + extension)
            tasks.expand_nucleotides(infile, tmp)
            self.assertTrue(filecmp.cmp(expected, tmp, shallow=False))
            os.unlink(tmp)
104 | ||
105 | ||
class TestExtendGaps(unittest.TestCase):
    def test_trim_contigs(self):
        '''Test that gap extension works'''
        # trim_contigs extends gaps by trimming contig ends next to each gap
        tmp_out = 'tmp.gap_extend.fa'
        tasks.trim_contigs(os.path.join(data_dir, 'sequences_test_trim_contigs.fa'), tmp_out, trim=2)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_trim_contigs.fa.out'), tmp_out))
        os.unlink(tmp_out)
113 | ||
114 | ||
class TestFastqToMiraXml(unittest.TestCase):
    def test_fastaq_to_mira_xml(self):
        '''check that fastaq_to_mira_xml makes the correct xml file from a fastq file'''
        tmp_out = 'tmp.mira.xml'
        tasks.fastaq_to_mira_xml(os.path.join(data_dir, 'sequences_test_good_file.fq'), tmp_out)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_good_file_mira.xml'), tmp_out))
        os.unlink(tmp_out)
122 | ||
123 | ||
class TestFastaqToOrfsGFF(unittest.TestCase):
    def test_fastaq_to_orfs_gff(self):
        '''Test fastaq_to_orfs_gff'''
        tmp_out = 'tmp.orfs.gff'
        tasks.fastaq_to_orfs_gff(os.path.join(data_dir, 'sequences_test_orfs.fa'), tmp_out, min_length=120)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_orfs.gff'), tmp_out, shallow=False))
        os.unlink(tmp_out)
131 | ||
132 | ||
class TestFilter(unittest.TestCase):
    def test_length_filter(self):
        '''Check that filtering by length works as expected'''
        infile = os.path.join(data_dir, 'sequences_test_length_filter.fa')
        cases = [
            ((0, 1), os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-1.fa')),
            ((0, float('inf')), os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-inf.fa')),
            ((4, 4), os.path.join(data_dir, 'sequences_test_length_filter.min-4.max-4.fa')),
        ]

        for (min_len, max_len), expected in cases:
            outfile = 'tmp.length_filter.fa'
            tasks.filter(infile, outfile, minlength=min_len, maxlength=max_len)
            self.assertTrue(filecmp.cmp(expected, outfile))
            os.unlink(outfile)

    def test_regex_filter(self):
        '''Check that filtering by name regex works as expected'''
        infile = os.path.join(data_dir, 'sequences_test_filter_by_regex.fa')
        cases = [
            ('^[0-9]+$', os.path.join(data_dir, 'sequences_test_filter_by_regex.numeric.fa')),
            ('/1$', os.path.join(data_dir, 'sequences_test_filter_by_regex.first-of-pair.fa')),
            ('^a', os.path.join(data_dir, 'sequences_test_filter_by_regex.first-char-a.fa')),
        ]

        for regex, expected in cases:
            outfile = 'tmp.regex_filter.fa'
            tasks.filter(infile, outfile, regex=regex)
            self.assertTrue(filecmp.cmp(expected, outfile))
            os.unlink(outfile)

    def test_ids_from_file_filter(self):
        '''Test that can extract reads from a file of read names'''
        infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
        outfile = 'tmp.ids_file_filter.fa'
        tasks.filter(infile, outfile, ids_file=infile + '.ids')
        self.assertTrue(filecmp.cmp(infile + '.filtered', outfile))
        os.unlink(outfile)

    def test_invert_filter(self):
        '''Test that inverting filtering works'''
        infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
        outfile = 'tmp.ids_file_filter.fa'
        tasks.filter(infile, outfile, ids_file=infile + '.ids', invert=True)
        self.assertTrue(filecmp.cmp(infile + '.filtered.invert', outfile))
        os.unlink(outfile)

    def _check_paired_filter(self, stem, out1, out2, **filter_kwargs):
        # Run tasks.filter on a read pair and compare both outputs to expected
        in1 = os.path.join(data_dir, stem + '.in_1.fa')
        in2 = os.path.join(data_dir, stem + '.in_2.fa')
        expected1 = os.path.join(data_dir, stem + '.out_1.fa')
        expected2 = os.path.join(data_dir, stem + '.out_2.fa')
        tasks.filter(in1, out1, mate_in=in2, mate_out=out2, minlength=3, **filter_kwargs)
        self.assertTrue(filecmp.cmp(out1, expected1, shallow=False))
        self.assertTrue(filecmp.cmp(out2, expected2, shallow=False))
        os.unlink(out1)
        os.unlink(out2)

    def test_paired_both_pass(self):
        '''Test filter with paired file both pass'''
        self._check_paired_filter(
            'tasks_test_filter_paired_both_pass',
            'tmp.filter_both_pass_1.fa',
            'tmp.filter_both_pass_2.fa')

    def test_paired_one_pass(self):
        '''Test filter with paired file one pass'''
        self._check_paired_filter(
            'tasks_test_filter_paired_one_pass',
            'tmp.filter_one_pass_1.fa',
            'tmp.filter_one_pass_2.fa',
            both_mates_pass=False)
207 | ||
208 | ||
class TestGetSeqsFlankingGaps(unittest.TestCase):
    def test_get_seqs_flanking_gaps(self):
        '''Check sequences flanking gaps are reported correctly'''
        tmp_out = 'tmp.seqs_flanking_gaps'
        tasks.get_seqs_flanking_gaps(os.path.join(data_dir, 'sequences_test_get_seqs_flanking_gaps.fa'), tmp_out, 3, 3)
        self.assertTrue(filecmp.cmp(tmp_out, os.path.join(data_dir, 'sequences_test_get_seqs_flanking_gaps.fa.out')))
        os.unlink(tmp_out)
215 | ||
216 | ||
class TestInterleave(unittest.TestCase):
    def test_interleave(self):
        '''Check that interleave works as expected'''
        outfile = 'tmp.interleaved.fa'
        tasks.interleave(
            os.path.join(data_dir, 'sequences_test_deinterleaved_1.fa'),
            os.path.join(data_dir, 'sequences_test_deinterleaved_2.fa'),
            outfile)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_interleaved.fa'), outfile))

        # Mismatched mate file pairs must raise an error
        bad_pairs = [
            ('sequences_test_deinterleaved_bad_1.fa', 'sequences_test_deinterleaved_bad_2.fa'),
            ('sequences_test_deinterleaved_bad2_1.fa', 'sequences_test_deinterleaved_bad2_2.fa'),
        ]
        for bad_1, bad_2 in bad_pairs:
            with self.assertRaises(tasks.Error):
                tasks.interleave(
                    os.path.join(data_dir, bad_1),
                    os.path.join(data_dir, bad_2),
                    outfile)
        os.unlink(outfile)
236 | ||
237 | ||
class TestMakeRandomContigs(unittest.TestCase):
    def test_make_random_contigs(self):
        '''Test make_random_contigs()'''
        # Random sequence content is not reproducible (even with a fixed
        # seed), so only compare sequence names and lengths.
        def same_names_and_lengths(file1, file2):
            seqs1 = {}
            seqs2 = {}
            tasks.file_to_dict(file1, seqs1)
            tasks.file_to_dict(file2, seqs2)
            if len(seqs1) != len(seqs2):
                return False

            for name, seq1 in seqs1.items():
                seq2 = seqs2[name]
                if seq1.id != seq2.id or len(seq1) != len(seq2):
                    return False

            return True

        tmp_out = 'tmp.random_contigs.fa'
        checks = [
            (2, 3, {}, 'sequences_test_make_random_contigs.default.fa'),
            (2, 3, {'prefix': 'p'}, 'sequences_test_make_random_contigs.prefix-p.fa'),
            (2, 3, {'first_number': 42}, 'sequences_test_make_random_contigs.first-42.fa'),
            (28, 3, {'name_by_letters': True}, 'sequences_test_make_random_contigs.name-by-letters.fa'),
        ]
        for n_contigs, contig_length, kwargs, expected in checks:
            tasks.make_random_contigs(n_contigs, contig_length, tmp_out, **kwargs)
            self.assertTrue(same_names_and_lengths(os.path.join(data_dir, expected), tmp_out))
        os.unlink(tmp_out)
271 | ||
272 | ||
class TestMakeLongReads(unittest.TestCase):
    def test_tiling_reads(self):
        '''Check tiling long reads are made correctly'''
        tmp_out = 'tmp.out.fa'
        infile = os.path.join(data_dir, 'tasks_test_make_long_reads.input.fa')
        tasks.make_long_reads(infile, tmp_out, method='tiling', fixed_read_length=10, tile_step=5)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_make_long_reads.output.fa'), tmp_out, shallow=False))
        os.unlink(tmp_out)
280 | ||
281 | ||
class TestMeanLength(unittest.TestCase):
    def test_mean_length(self):
        '''Test mean_length'''
        infile = os.path.join(data_dir, 'tasks_test_mean_length.fa')
        # Each limit caps how many sequences contribute to the mean
        for limit, expected_mean in zip([1, 2, 3, 4, None], [3, 2, 3, 4, 4]):
            self.assertEqual(expected_mean, tasks.mean_length(infile, limit=limit))
291 | ||
292 | ||
class TestMergeToOneSeq(unittest.TestCase):
    def _check_merge(self, extension):
        # Merge the test input with the given extension, compare to expected
        tmp_out = 'tmp.merged.' + extension
        infile = os.path.join(data_dir, 'sequences_test_merge_to_one_seq.' + extension)
        expected = os.path.join(data_dir, 'sequences_test_merge_to_one_seq.merged.' + extension)
        tasks.merge_to_one_seq(infile, tmp_out)
        self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
        os.unlink(tmp_out)

    def test_merge_to_one_seq_fa(self):
        '''Test merge_to_one_seq with fasta'''
        self._check_merge('fa')

    def test_merge_to_one_seq_fq(self):
        '''Test merge_to_one_seq with fastq'''
        self._check_merge('fq')
307 | ||
class TestReverseComplement(unittest.TestCase):
    def test_reverse_complement(self):
        '''reverse_complement should correctly reverse complement each seq in a file'''
        tmp_out = 'tmp.revcomp.fa'
        tasks.reverse_complement(os.path.join(data_dir, 'sequences_test.fa'), tmp_out)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_revcomp.fa'), tmp_out))
        os.unlink(tmp_out)
315 | ||
316 | ||
class TestScaffoldsToContigs(unittest.TestCase):
    def test_scaffolds_to_contigs(self):
        '''Test scaffolds_to_contigs'''
        tmp_out = 'tmp.contigs.fa'
        tasks.scaffolds_to_contigs(os.path.join(data_dir, 'utils_test_scaffolds.fa'), tmp_out)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'utils_test_scaffolds.fa.to_contigs.fa'), tmp_out))
        os.unlink(tmp_out)

    def test_scaffolds_to_contigs_number_contigs(self):
        '''Test scaffolds_to_contigs with contig numbering'''
        tmp_out = 'tmp.contigs.fa'
        tasks.scaffolds_to_contigs(os.path.join(data_dir, 'utils_test_scaffolds.fa'), tmp_out, number_contigs=True)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'utils_test_scaffolds.fa.to_contigs.number_contigs.fa'), tmp_out))
        os.unlink(tmp_out)
331 | ||
332 | ||
class TestSearchForSeq(unittest.TestCase):
    def test_search_for_seq(self):
        '''Test that sequence search finds all hits'''
        tmp_out = 'tmp.search.fa'
        tasks.search_for_seq(os.path.join(data_dir, 'sequences_test_search_string.fa'), tmp_out, 'AGA')
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_search_string.fa.hits'), tmp_out))
        os.unlink(tmp_out)
340 | ||
341 | ||
class TestSequenceTrim(unittest.TestCase):
    def test_sequence_trim(self):
        '''Test sequence_trim'''
        out_1 = 'tmp.trimmed_1.fa'
        out_2 = 'tmp.trimmed_2.fa'
        reads_1 = os.path.join(data_dir, 'tasks_test_sequence_trim_1.fa')
        reads_2 = os.path.join(data_dir, 'tasks_test_sequence_trim_2.fa')
        trim_seqs = os.path.join(data_dir, 'tasks_test_sequences_to_trim.fa')
        tasks.sequence_trim(reads_1, reads_2, out_1, out_2, trim_seqs, min_length=10, check_revcomp=True)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_sequence_trim_1.trimmed.fa'), out_1))
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_sequence_trim_2.trimmed.fa'), out_2))
        os.unlink(out_1)
        os.unlink(out_2)
357 | ||
358 | ||
class TestTranslate(unittest.TestCase):
    def test_translate(self):
        '''Test translate works in each frame'''
        tmp_out = 'tmp.translated.fa'
        infile = os.path.join(data_dir, 'sequences_test_translate.fa')
        for frame in range(3):
            tasks.translate(infile, tmp_out, frame=frame)
            self.assertTrue(filecmp.cmp(infile + '.frame' + str(frame), tmp_out))

        os.unlink(tmp_out)
368 | ||
369 | ||
class TestTrim(unittest.TestCase):
    def test_trim(self):
        '''trim should correctly trim each seq in a file'''
        tmp_out = 'tmp.trim.fq'
        tasks.trim(os.path.join(data_dir, 'sequences_test_untrimmed.fq'), tmp_out, 2, 1)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_trimmed.fq'), tmp_out))
        os.unlink(tmp_out)

    def test_trim_Ns_at_end(self):
        '''Test Ns at ends of sequences trimmed OK'''
        tmp_out = 'tmp.trim.fa'
        tasks.trim_Ns_at_end(os.path.join(data_dir, 'sequences_test_trim_Ns_at_end.fa'), tmp_out)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_trim_Ns_at_end.fa.trimmed'), tmp_out))
        os.unlink(tmp_out)
385 | ||
386 | ||
class TestFileToDict(unittest.TestCase):
    def test_file_to_dict(self):
        '''check file_to_dict fills dictionary correctly'''
        got = {}
        tasks.file_to_dict(os.path.join(data_dir, 'sequences_test.fa'), got)
        expected = {str(i): sequences.Fasta(str(i), 'ACGTA') for i in range(1, 5)}

        self.assertSequenceEqual(got.keys(), expected.keys())
        for key in expected:
            self.assertEqual(got[key].id, expected[key].id)
            self.assertEqual(got[key].seq, expected[key].seq)
401 | ||
402 | ||
class TestLengthsFromFai(unittest.TestCase):
    def test_lengths_from_fai(self):
        '''Check lengths_from_fai gets the length of each seq OK'''
        d = {}
        # Expected mapping: in the test fai the name of each sequence
        # equals its length (names '1'..'4', lengths 1..4).
        lengths = {str(x): x for x in range(1, 5)}
        tasks.lengths_from_fai(os.path.join(data_dir, 'sequences_test_fai_test.fa.fai'), d)
        self.assertSequenceEqual(d.keys(), lengths.keys())
        # Compare against the expected table; the previous check re-derived
        # the expectation from the key names (int(i) == d[i]) and never
        # actually used the values in `lengths`.
        for name, length in lengths.items():
            self.assertEqual(length, d[name])
412 | ||
413 | ||
class TestSplit(unittest.TestCase):
    def test_split_by_base_count(self):
        '''split_by_base_count() should make the expected set of output files'''
        infile = os.path.join(data_dir, 'sequences_test_split_test.fa')
        outprefix = 'tmp.sequences_test_split_test.fa.test'
        expected_suffixes = {
            2: ['1', '2', '3', '4'],
            3: ['1', '2', '3'],
            4: ['1', '2', '3'],
            6: ['1', '2'],
        }
        for max_bases, suffixes in expected_suffixes.items():
            tasks.split_by_base_count(infile, outprefix, max_bases)
            for suffix in suffixes:
                made_file = outprefix + '.' + suffix
                wanted_file = infile + '.' + str(max_bases) + '.' + suffix
                self.assertTrue(filecmp.cmp(made_file, wanted_file))
                os.unlink(made_file)

        # check that limiting the number of files works
        tasks.split_by_base_count(infile, outprefix, 6, 2)
        for suffix in ('1', '2', '3'):
            made_file = outprefix + '.' + suffix
            wanted_file = os.path.join(data_dir, 'sequences_test_split_test.fa.6.limit2.') + suffix
            self.assertTrue(filecmp.cmp(made_file, wanted_file))
            os.unlink(made_file)

        # check big sequence not broken
        tasks.split_by_base_count(os.path.join(data_dir, 'sequences_test_split_test.long.fa'), outprefix, 2)
        for suffix in ('1', '2'):
            made_file = outprefix + '.' + suffix
            wanted_file = os.path.join(data_dir, 'sequences_test_split_test.long.fa.2.' + suffix)
            self.assertTrue(filecmp.cmp(made_file, wanted_file))
            os.unlink(made_file)

    def test_split_by_fixed_size(self):
        '''split_by_fixed_size() should write the expected chunk files plus a coords file'''
        infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
        outprefix = 'tmp.sequences_test_split'
        tasks.split_by_fixed_size(infile, outprefix, 4, 1)

        for i in range(1, 7):
            wanted_file = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.' + str(i))
            made_file = outprefix + '.' + str(i)
            self.assertTrue(filecmp.cmp(made_file, wanted_file))
            os.unlink(made_file)

        made_coords = outprefix + '.coords'
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.coords'), made_coords))
        os.unlink(made_coords)

    def test_split_by_fixed_size_exclude_Ns(self):
        '''split_by_fixed_size() with skip_if_all_Ns=True should drop chunks that are all Ns'''
        infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
        outprefix = 'tmp.sequences_test_split'
        tasks.split_by_fixed_size(infile, outprefix, 4, 1, skip_if_all_Ns=True)

        for i in range(1, 5):
            wanted_file = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.' + str(i))
            made_file = outprefix + '.' + str(i)
            self.assertTrue(filecmp.cmp(made_file, wanted_file))
            os.unlink(made_file)

        made_coords = outprefix + '.coords'
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords'), made_coords))
        os.unlink(made_coords)

    def test_split_by_fixed_size_onefile(self):
        '''split_by_fixed_size_onefile() should chunk all sequences into one output file'''
        infile = os.path.join(data_dir, 'sequences_test_split_fixed_size_onefile.fa')
        made_file = 'tmp.sequences_test_split_fixed_size_onefile.fa'
        wanted_file = os.path.join(data_dir, 'sequences_test_split_fixed_size_onefile.out.fa')
        tasks.split_by_fixed_size_onefile(infile, made_file, chunk_size=3, tolerance=1)
        self.assertTrue(filecmp.cmp(wanted_file, made_file))
        os.unlink(made_file)

    def test_split_by_fixed_size_onefile_exclude_Ns(self):
        '''split_by_fixed_size_onefile() with skip_if_all_Ns=True should drop all-N chunks'''
        infile = os.path.join(data_dir, 'sequences_test_split_fixed_size_onefile.fa')
        made_file = 'tmp.sequences_test_split_fixed_size_onefile.skip_Ns.fa'
        wanted_file = os.path.join(data_dir, 'sequences_test_split_fixed_size_onefile.skip_Ns.out.fa')
        tasks.split_by_fixed_size_onefile(infile, made_file, chunk_size=3, tolerance=1, skip_if_all_Ns=True)
        self.assertTrue(filecmp.cmp(wanted_file, made_file))
        os.unlink(made_file)
491 | ||
class TestCountSequences(unittest.TestCase):
    def test_count_sequences(self):
        '''count_sequences() should return the number of sequences in a file'''
        expected_counts = [
            ('sequences_test_good_file.fq', 2),
            ('sequences_test.fa', 4),
            ('sequences_test_empty_file', 0),
        ]
        for basename, count in expected_counts:
            self.assertEqual(count, tasks.count_sequences(os.path.join(data_dir, basename)))
498 | ||
class TestGetIds(unittest.TestCase):
    def test_get_ids(self):
        '''get_ids() should write one sequence ID per line of the output file'''
        outfile = 'tmp.ids'
        tasks.get_ids(os.path.join(data_dir, 'sequences_test.fa'), outfile)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.fa.ids'), outfile))
        os.unlink(outfile)
506 | ||
507 | ||
class TestFastaToFakeQual(unittest.TestCase):
    def test_fasta_to_fake_qual(self):
        '''fastaq_to_fake_qual() should write fake quality files, with default and custom scores'''
        outfile = 'tmp.qual'
        infile = os.path.join(data_dir, 'tasks_test_fasta_to_fake_qual.in.fa')
        cases = [
            ('tasks_test_fasta_to_fake_qual.out.default.qual', {}),
            ('tasks_test_fasta_to_fake_qual.out.q42.qual', {'q': 42}),
        ]
        for expected_basename, kwargs in cases:
            tasks.fastaq_to_fake_qual(infile, outfile, **kwargs)
            self.assertTrue(filecmp.cmp(os.path.join(data_dir, expected_basename), outfile, shallow=False))
            os.unlink(outfile)
519 | ||
520 | ||
class TestFastaToFastq(unittest.TestCase):
    def test_fasta_to_fastq(self):
        '''fasta_to_fastq() should combine fasta + qual into fastq, and raise on a bad qual file'''
        outfile = 'tmp.fq'
        fasta_in = os.path.join(data_dir, 'sequences_test.fa')
        tasks.fasta_to_fastq(fasta_in,
                             os.path.join(data_dir, 'sequences_test.fa.qual'),
                             outfile)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.fasta_to_fastq.fq'), outfile))

        with self.assertRaises(tasks.Error):
            tasks.fasta_to_fastq(fasta_in,
                                 os.path.join(data_dir, 'sequences_test.fa.qual.bad'),
                                 outfile)

        os.unlink(outfile)
535 | ||
536 | ||
class TestReplaceBases(unittest.TestCase):
    def test_sequences_replace_bases(self):
        '''replace_bases() should swap every occurrence of one base for another'''
        outfile = 'tmp.replace_bases.fa'
        tasks.replace_bases(os.path.join(data_dir, 'sequences_test_fastaq_replace_bases.fa'), outfile, 'T', 'X')
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_fastaq_replace_bases.expected.fa'), outfile))
        os.unlink(outfile)
544 | ||
545 | ||
class TestSortBySize(unittest.TestCase):
    def test_sort_by_size(self):
        '''sort_by_size() should order sequences by length, longest or shortest first'''
        infile = os.path.join(data_dir, 'tasks_test_sort_by_size.in.fa')
        outfile = 'tmp.sorted.fa'
        cases = [
            ('tasks_test_sort_by_size.out.fa', {}),
            ('tasks_test_sort_by_size.out.rev.fa', {'smallest_first': True}),
        ]
        for expected_basename, kwargs in cases:
            tasks.sort_by_size(infile, outfile, **kwargs)
            self.assertTrue(filecmp.cmp(os.path.join(data_dir, expected_basename), outfile, shallow=False))
        os.unlink(outfile)
556 | ||
557 | ||
class TestStripIlluminaSuffix(unittest.TestCase):
    def test_strip_illumina_suffix(self):
        '''strip_illumina_suffix() should remove the /1 or /2 suffix from read names'''
        outfile = 'tmp.stripped.fa'
        tasks.strip_illumina_suffix(os.path.join(data_dir, 'sequences_test_strip_illumina_suffix.fq'), outfile)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_illumina_suffix.fq.stripped'), outfile))
        os.unlink(outfile)
565 | ||
566 | ||
class TestToFasta(unittest.TestCase):
    def test_to_fasta(self):
        '''to_fasta() should convert every supported input format to FASTA'''
        tmpfile = 'tmp.to_fasta'
        basenames = [
            'sequences_test_good_file.fq',
            'sequences_test_gffv3.gff',
            'sequences_test_gffv3.no_FASTA_line.gff',
            'sequences_test.embl',
            'sequences_test.gbk',
            'sequences_test_phylip.interleaved',
            'sequences_test_phylip.interleaved2',
            'sequences_test_phylip.sequential',
        ]
        for basename in basenames:
            infile = os.path.join(data_dir, basename)
            tasks.to_fasta(infile, tmpfile)
            # each input has a pre-made '<input>.to_fasta' expected file
            self.assertTrue(filecmp.cmp(infile + '.to_fasta', tmpfile))

        tasks.to_fasta(os.path.join(data_dir, 'sequences_test.fa'), tmpfile, line_length=3)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.line_length3.fa'), tmpfile))
        tasks.to_fasta(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa'), tmpfile, strip_after_first_whitespace=True)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa.to_fasta'), tmpfile))
        os.unlink(tmpfile)
593 | ||
594 | ||
class TestToUniqueByID(unittest.TestCase):
    def test_to_unique_by_id(self):
        '''to_unique_by_id() should drop duplicate IDs, keeping the longest sequence'''
        outfile = 'tmp.unique_by_id.fa'
        tasks.to_unique_by_id(os.path.join(data_dir, 'sequences_test_to_unique_by_id.fa'), outfile)
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_to_unique_by_id.fa.out'), outfile))
        os.unlink(outfile)
602 | ||
603 | ||
class TestToFastaUnion(unittest.TestCase):
    def test_to_fasta_union(self):
        '''to_fasta_union() should merge all input sequences into one sequence with the given name'''
        outfile = 'tmp.to_fasta_union'
        tasks.to_fasta_union(os.path.join(data_dir, 'sequences_test_to_fasta_union.in.fa'), outfile, seqname='testname')
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_to_fasta_union.out.fa'), outfile, shallow=False))
        os.unlink(outfile)
611 | ||
612 | ||
# Allow this test module to be run directly, as well as via setup.py test / pytest.
if __name__ == '__main__':
    unittest.main()
615 |
#!/usr/bin/env python3

import sys
import os
import filecmp
import unittest
from pyfastaq import utils

# Locate the test data directory relative to the installed pyfastaq package,
# so the tests work regardless of where the package was installed.
modules_dir = os.path.dirname(os.path.abspath(utils.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
10 | ||
class TestUtils(unittest.TestCase):
    def test_write_and_read(self):
        '''open_file_write() and open_file_read() should handle plain and gzipped files, and - for stdin/stdout'''
        for filename in ('utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz'):
            f_out = utils.open_file_write(filename)
            for n in range(3):
                print(n, file=f_out)
            utils.close(f_out)

            f_in = utils.open_file_read(filename)
            for expected, line in enumerate(f_in):
                self.assertEqual(expected, int(line.strip()))
            utils.close(f_in)

            os.unlink(filename)

        self.assertEqual(sys.stdin, utils.open_file_read('-'))
        self.assertEqual(sys.stdout, utils.open_file_write('-'))

    def test_raise_exception(self):
        '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
        unreadable = [
            'this_file_is_not_here_so_throw_error',
            'this_file_is_not_here_so_throw_error.gz',
            os.path.join(data_dir, 'utils_test_not_really_zipped.gz'),
        ]
        for filename in unreadable:
            with self.assertRaises(utils.Error):
                utils.open_file_read(filename)

        unwritable = [
            os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
            os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'),
        ]
        for filename in unwritable:
            with self.assertRaises(utils.Error):
                utils.open_file_write(filename)

    def test_file_transpose(self):
        '''file_transpose() output should match the pre-computed transposed file'''
        infile = os.path.join(data_dir, 'utils_test_file_transpose.txt')
        correct_file = os.path.join(data_dir, 'utils_test_file_transposed.txt')
        tmp_out = 'utils_test_file_transpose.tmp'
        utils.file_transpose(infile, tmp_out)
        self.assertTrue(filecmp.cmp(tmp_out, correct_file))
        os.unlink(tmp_out)

    def test_system_call(self):
        '''syscall() and syscall_get_stdout() should run commands and raise on failure'''
        test_file = os.path.join(data_dir, 'utils_test_system_call.txt')
        tmp_out = 'utils_test_syscall.tmp'
        utils.syscall('cat ' + test_file + ' > ' + tmp_out)
        self.assertTrue(filecmp.cmp(tmp_out, test_file))
        os.unlink(tmp_out)

        with self.assertRaises(utils.Error):
            utils.syscall('thisisveryunlikelytoebarealcommandandshouldthrowerror')

        utils.syscall('echo "this is not the right string" > ' + tmp_out)
        self.assertFalse(filecmp.cmp(tmp_out, test_file))
        os.unlink(tmp_out)

        self.assertListEqual(["bingo"], utils.syscall_get_stdout('echo bingo'))
        with self.assertRaises(utils.Error):
            utils.syscall_get_stdout('thisisveryunlikelytoebarealcommandandshouldthrowerror')
77 | ||
# Allow this test module to be run directly, as well as via setup.py test / pytest.
if __name__ == '__main__':
    unittest.main()
0 | import os | |
1 | import sys | |
2 | import subprocess | |
3 | import shlex | |
4 | ||
# Module-specific exception raised by all helper functions below.
class Error (Exception): pass
6 | ||
def open_file_read(filename):
    '''Open filename for reading and return the file handle.

    '-' means read from stdin. A name ending in .gz is first verified with
    'gunzip -t', then streamed through 'gunzip -c'.
    Raises Error if the file cannot be opened (or is not really gzipped).'''
    if filename == '-':
        f = sys.stdin
    elif filename.endswith('.gz'):
        # quote the filename so names containing spaces or shell
        # metacharacters cannot break (or inject into) the shell command
        quoted_name = shlex.quote(filename)

        # first check that the file is OK according to gunzip
        retcode = subprocess.call('gunzip -t ' + quoted_name, shell=True)
        if retcode != 0:
            raise Error("Error opening for reading gzipped file '" + filename + "'")

        # now open the file
        f = os.popen('gunzip -c ' + quoted_name)
    else:
        try:
            f = open(filename)
        except Exception:
            # was a bare 'except:', which also caught KeyboardInterrupt/SystemExit
            raise Error("Error opening for reading file '" + filename + "'")

    return f
25 | ||
26 | ||
def open_file_write(filename):
    '''Open filename for writing and return the file handle.

    '-' means write to stdout. A name ending in .gz is piped through
    'gzip -9 -c'. Raises Error if the file cannot be opened.'''
    if filename == '-':
        f = sys.stdout
    elif filename.endswith('.gz'):
        # os.popen starts the pipeline lazily, so check the target directory
        # exists up front to fail early with a clear error
        if not os.path.exists(os.path.abspath(os.path.dirname(filename))):
            raise Error("Error opening for writing gzipped file '" + filename + "'")

        try:
            # quote the filename so names with spaces/metacharacters cannot
            # break (or inject into) the shell command
            f = os.popen('gzip -9 -c > ' + shlex.quote(filename), 'w')
        except Exception:
            raise Error("Error opening for writing gzipped file '" + filename + "'")
    else:
        try:
            f = open(filename, 'w')
        except Exception:
            # was a bare 'except:', which also caught KeyboardInterrupt/SystemExit
            raise Error("Error opening for writing file '" + filename + "'")

    return f
45 | ||
46 | ||
def close(filehandle):
    '''Close filehandle, unless it is one of the standard streams.

    Fix: sys.stdin is now also protected. open_file_read('-') returns
    sys.stdin, and closing it would break every later read from stdin.'''
    if filehandle not in [sys.stdin, sys.stdout, sys.stderr]:
        filehandle.close()
50 | ||
51 | ||
def file_transpose(f_in, f_out, sep_in=None, sep_out='\t'):
    '''Write f_out as the transpose of f_in: column i of the input becomes
    row i of the output. Short rows are padded with "." so the output is
    rectangular. sep_in/sep_out are the input/output field separators.'''
    fh = open_file_read(f_in)
    rows = [line.rstrip().split(sep_in) for line in fh]
    close(fh)

    n_columns = max(len(row) for row in rows)

    # pad every row to the same width so the transpose is well-defined
    for row in rows:
        row += ['.'] * (n_columns - len(row))

    fh = open_file_write(f_out)
    for col in range(n_columns):
        print(sep_out.join(str(row[col]) for row in rows), file=fh)

    close(fh)
69 | ||
70 | ||
def syscall(cmd):
    '''Run cmd through the shell; raise Error if the exit code is non-zero.'''
    if subprocess.call(cmd, shell=True) != 0:
        raise Error("Error in system call. Command was:\n" + cmd)
76 | ||
77 | ||
def syscall_get_stdout(cmd):
    '''Run cmd (no shell) and return its stdout as a list of lines, with
    trailing whitespace removed.

    Raises Error if the command cannot be run. NB the command's exit code is
    deliberately not checked, to preserve existing behaviour for callers.'''
    try:
        out = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE).communicate()[0].decode('utf-8').rstrip()
        return out.split('\n')
    except Exception:
        # was a bare 'except:', which also swallowed KeyboardInterrupt/SystemExit
        raise Error('Error in system call. I tried to run:\n' + str(cmd))
84 | ||
85 |
0 | #!/usr/bin/env python3 | |
1 | ||
2 | import argparse | |
3 | import sys | |
4 | ||
# Maps each sub-command name (fastaq <command>) to the one-line description
# shown by the usage message.
tasks = {
    'add_indels': 'Deletes or inserts bases at given position(s)',
    'caf_to_fastq': 'Converts a CAF file to FASTQ format',
    'capillary_to_pairs': 'Converts file of capillary reads to paired and unpaired files',
    'chunker': 'Splits sequences into equal sized chunks',
    'count_sequences': 'Counts the sequences in input file',
    'deinterleave': 'Splits interleaved paired file into two separate files',
    'enumerate_names': 'Renames sequences in a file, calling them 1,2,3... etc',
    'expand_nucleotides': 'Makes every combination of degenerate nucleotides',
    'fasta_to_fastq': 'Convert FASTA and .qual to FASTQ',
    'filter': 'Filter sequences to get a subset of them',
    'get_ids': 'Get the ID of each sequence',
    'get_seq_flanking_gaps': 'Gets the sequences flanking gaps',
    'interleave': 'Interleaves two files, output is alternating between fwd/rev reads',
    'long_read_simulate': 'Simulates long reads from reference',
    'make_random_contigs': 'Make contigs of random sequence',
    'merge': 'Converts multi sequence file to a single sequence',
    # typo fix: was 'occurences'
    'replace_bases': 'Replaces all occurrences of one letter with another',
    'reverse_complement': 'Reverse complement all sequences',
    'scaffolds_to_contigs': 'Creates a file of contigs from a file of scaffolds',
    'search_for_seq': 'Find all exact matches to a string (and its reverse complement)',
    'sequence_trim': 'Trim exact matches to a given string off the start of every sequence',
    'split_by_base_count': 'Split multi sequence file into separate files',
    'sort_by_size': 'Sorts sequences in length order',
    'strip_illumina_suffix': 'Strips /1 or /2 off the end of every read name',
    'to_fasta': 'Converts a variety of input formats to nicely formatted FASTA format',
    'to_fake_qual': 'Make fake quality scores file',
    'to_mira_xml': 'Create an xml file from a file of reads, for use with Mira assembler',
    'to_orfs_gff': 'Writes a GFF file of open reading frames',
    'to_perfect_reads': 'Make perfect paired reads from reference',
    'to_random_subset': 'Make a random sample of sequences (and optionally mates as well)',
    'to_tiling_bam': 'Make a BAM file of reads uniformly spread across the input reference',
    'to_unique_by_id': 'Remove duplicate sequences, based on their names. Keep longest seqs',
    'translate': 'Translate all sequences in input nucleotide sequences',
    'trim_contigs': 'Trims a set number of bases off the end of every contig',
    'trim_ends': 'Trim fixed number of bases of start and/or end of every sequence',
    'trim_Ns_at_end': 'Trims all Ns at the start/end of all sequences',
    'version': 'Print version number and exit',
}
44 | ||
45 | ||
def print_usage_and_exit():
    '''Print overall usage and the table of available commands to stderr, then exit with status 1.'''
    print('Usage: fastaq <command> [options]', file=sys.stderr)
    print('\nTo get minimal usage for a command use:\nfastaq command', file=sys.stderr)
    print('\nTo get full help for a command use one of:\nfastaq command -h\nfastaq command --help\n', file=sys.stderr)
    print('\nAvailable commands:\n', file=sys.stderr)
    # pad each command name to the longest name so the descriptions line up
    max_task_length = max([len(x) for x in list(tasks.keys())])
    for task in sorted(tasks):
        print('{{0: <{}}}'.format(max_task_length).format(task), tasks[task], sep='  ', file=sys.stderr)
    sys.exit(1)
55 | ||
56 | ||
# Dispatch: first CLI argument selects the runner module; the rest of
# sys.argv is left for that runner's own argparse to consume.
if len(sys.argv) == 1 or sys.argv[1] in ['-h', '-help', '--help']:
    print_usage_and_exit()

task = sys.argv.pop(1)

if task not in tasks:
    print('Task "' + task + '" not recognised. Cannot continue.\n', file=sys.stderr)
    print_usage_and_exit()


# importlib + getattr instead of exec() on built source strings: exec would
# break if a task description ever contained a quote character, and building
# code from strings is needlessly fragile.
import importlib
runner = importlib.import_module('pyfastaq.runners.' + task)
runner.run(tasks[task])
69 |
#!/usr/bin/env python3
# Command line script: converts a file of capillary reads into an interleaved
# file of read pairs plus a file of unpaired reads.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Given a fasta/q file of capillary reads, makes an interleaved file of read pairs (where more than read from same ligation, takes the longest read) and a file of unpaired reads. Replaces the .p1k/.q1k part of read names to denote fwd/rev reads with /1 and /2',
    usage = '%(prog)s <infile> <outfiles prefix>')
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outprefix', help='Prefix of output files', metavar='outfiles prefix')
options = parser.parse_args()
tasks.capillary_to_pairs(options.infile, options.outprefix)
#!/usr/bin/env python3
# Command line script: splits sequences into fixed-size chunks spread over
# numbered output files.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Splits a multi fasta/q file into separate files. Splits sequences into chunks of a fixed size. Aims for chunk_size chunks in each file, but allows a little extra, so chunk can be up to (chunk_size + tolerance), to prevent tiny chunks made from the ends of sequences',
    usage = '%(prog)s [options] <fasta/q in> <prefix of output files> <chunk size> <tolerance>')
parser.add_argument('infile', help='Name of input fasta/q file to be split')
parser.add_argument('outprefix', help='Name of output fasta/q file')
parser.add_argument('chunk_size', type=int, help='Size of each chunk')
parser.add_argument('tolerance', type=int, help='Tolerance allowed in chunk size')
parser.add_argument('--skip_all_Ns', action='store_true', help='Do not output any sequence that consists of all Ns')
options = parser.parse_args()
tasks.split_by_fixed_size(
    options.infile,
    options.outprefix,
    options.chunk_size,
    options.tolerance,
    skip_if_all_Ns=options.skip_all_Ns
    )
#!/usr/bin/env python3
# Command line script: prints the number of sequences in a fasta/q file.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Counts the number of sequences in a fasta/q file',
    usage = '%(prog)s <fasta/q in>')
parser.add_argument('infile', help='Name of input fasta/q file')
options = parser.parse_args()
print(tasks.count_sequences(options.infile))
#!/usr/bin/env python3
# Command line script: splits an interleaved paired-read file into separate
# forward and reverse files.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Deinterleaves fasta/q file, so that reads are written alternately between two output files',
    usage = '%(prog)s [options] <fasta/q in> <out_fwd> <out_rev>')
parser.add_argument('--fasta_out', action='store_true', help='Use this to write output as fasta (default is same as input)', default=False)
parser.add_argument('infile', help='Name of fasta/q file to be deinterleaved')
parser.add_argument('out_fwd', help='Name of output fasta/q file of forwards reads')
parser.add_argument('out_rev', help='Name of output fasta/q file of reverse reads')
options = parser.parse_args()
tasks.deinterleave(options.infile, options.out_fwd, options.out_rev, fasta_out=options.fasta_out)
#!/usr/bin/env python3
# Command line script: renames all sequences in a file to 1,2,3... optionally
# writing an old-name -> new-name mapping file.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Renames sequences in a file, calling them 1,2,3... etc',
    usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
parser.add_argument('--start_index', type=int, help='Starting number [%(default)s]', default=1)
parser.add_argument('--rename_file', help='If used, will write a file of old name to new name')
parser.add_argument('--keep_suffix', action='store_true', help='Use this to keep a /1 or /2 suffix at the end of each name')
parser.add_argument('infile', help='Name of fasta/q file to be read')
parser.add_argument('outfile', help='Name of output fasta/q file')
options = parser.parse_args()
tasks.enumerate_names(options.infile,
                      options.outfile,
                      start_index=options.start_index,
                      keep_illumina_suffix=options.keep_suffix,
                      rename_file=options.rename_file)
#!/usr/bin/env python3
# Command line script: expands degenerate (IUPAC) nucleotides into every
# possible concrete sequence.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Makes all combinations of sequences in input file by using all possibilities of redundant bases. e.g. ART could be AAT or AGT. Assumes input is nucleotides, not amino acids',
    usage = '%(prog)s <infile> <outfile>')
parser.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
parser.add_argument('outfile', help='Name of output file')
options = parser.parse_args()
tasks.expand_nucleotides(
    options.infile,
    options.outfile,
)
#!/usr/bin/env python3
# Command line script: widens every gap (and trims sequence ends) by replacing
# flanking bases with Ns.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Extends the length of all gaps (and trims the start/end of sequences) in a fasta/q file. Does this by replacing a set number of bases either side of each gap with Ns. Any sequence that ends up as all Ns is lost',
    usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
parser.add_argument('--trim_number', type=int, help='Number of bases to trim around each gap, and off ends of each sequence [%(default)s]', default=100)
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outfile', help='Name of output fasta/q file')
options = parser.parse_args()
tasks.extend_gaps(options.infile, options.outfile, options.trim_number)
#!/usr/bin/env python3
# Command line script: combines a fasta file and its .qual file into a fastq file.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Given a fasta and qual file, makes a fastq file',
    usage = '%(prog)s <fasta in> <qual in> <fastq out>')
parser.add_argument('fasta', help='Name of input fasta file', metavar='fasta in')
parser.add_argument('qual', help='Name of input quality scores file', metavar='qual in')
parser.add_argument('outfile', help='Name of output fastq file', metavar='fastq out')
options = parser.parse_args()
tasks.fasta_to_fastq(options.fasta, options.qual, options.outfile)
#!/usr/bin/env python3
# Command line script: filters sequences by length, name regex, or an IDs file.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Filters a fasta/q file by sequence length and/or by name matching a regular expression',
    usage = '%(prog)s [options] <infile> <outfile>')
parser.add_argument('--min_length', type=int, help='Minimum length of sequence to keep [%(default)s]', default=0, metavar='INT')
parser.add_argument('--max_length', type=float, help='Maximum length of sequence to keep [%(default)s]', default=float('inf'), metavar='INT')
parser.add_argument('--regex', help='If given, only reads with a name matching the regular expression will be kept')
# typo fix in help text: was 'in th given file'
parser.add_argument('--ids_file', help='If given, only reads whose ID is in the given file will be used. One ID per line of file.')
parser.add_argument('-v', '--invert', action='store_true', help='Keep sequences that do not match the filters')
parser.add_argument('infile', help='Name of fasta/q file to be filtered')
parser.add_argument('outfile', help='Name of output fasta/q file')
options = parser.parse_args()
tasks.filter(options.infile,
             options.outfile,
             minlength=options.min_length,
             maxlength=options.max_length,
             regex=options.regex,
             ids_file=options.ids_file,
             invert=options.invert
             )
#!/usr/bin/env python3
# Command line script: writes the ID of every sequence in the input to a file.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Gets IDs from each sequence in a fasta or fastq file',
    usage = '%(prog)s <infile> <outfile>')
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outfile', help='Name of output file')
options = parser.parse_args()
tasks.get_ids(options.infile, options.outfile)
#!/usr/bin/env python3
# Command line script: extracts the sequence on either side of every gap.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
    description = 'Gets the sequences either side of gaps in a fasta/q file',
    usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
parser.add_argument('--left', type=int, help='Number of bases to get to left of gap [%(default)s]', default=25, metavar='INT')
parser.add_argument('--right', type=int, help='Number of bases to get to right of gap [%(default)s]', default=25, metavar='INT')
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outfile', help='Name of output fasta/q file')
options = parser.parse_args()
tasks.get_seqs_flanking_gaps(options.infile, options.outfile, options.left, options.right)
#!/usr/bin/env python3
# Command line script: deletes bases from, or inserts random bases into,
# sequences of a fasta/q file at given positions.
# NOTE(review): imports the old 'fastaq' package name; elsewhere this tree
# uses 'pyfastaq' - confirm this legacy script is still meant to be shipped.

import argparse
import sys
import random
from fastaq import sequences, utils, intervals

parser = argparse.ArgumentParser(
    description = 'Deletes or inserts bases at given position(s) from a fasta/q file',
    usage = '%(prog)s <fasta/q in> <outfile>')
parser.add_argument('infile', help='Name of fasta/q file to be read')
parser.add_argument('outfile', help='Name of output file')
parser.add_argument('-d','--delete', action='append', help='Delete the given bases from the given sequence. Format same as samtools view: name:start-end. This option can be used multiple times (once for each region to delete). Overlapping coords will be merged before deleting', metavar='Name:start:bases')
parser.add_argument('--delete_range', help='Deletes bases starting at position P in each sequence of the input file. Deletes start + (n-1)*step bases from sequence n.', metavar='P,start,step')
parser.add_argument('-i','--insert', action='append', help='Insert a random string of bases at the given position. Format is name:position:number_to_add. Bases are added after the position. This option can be used multiple times', metavar='Name:start:bases')
parser.add_argument('--insert_range', help='Inserts random bases starting after position P in each sequence of the input file. Inserts start + (n-1)*step bases into sequence n.', metavar='P,start,step')
options = parser.parse_args()

# exactly one of the four mutually exclusive operations must be chosen
test_ops = [int(x is not None) for x in [options.delete, options.insert, options.delete_range, options.insert_range]]

if sum(test_ops) != 1:
    print('Must use one of --delete, --insert, --delete_range, --insert_range. Cannot continue', file=sys.stderr)
    sys.exit(1)
23 | ||
24 | ||
def range2dic(range_in):
    '''Parse a "P,start,step" option string into a dict with keys pos
    (converted to 0-based), bases and step. Returns {} when range_in is None.'''
    if range_in is None:
        return {}
    pos, bases, step = range_in.split(',')
    return {
        'pos': int(pos) - 1,
        'bases': int(bases),
        'step': int(step),
    }
34 | ||
delete_range = range2dic(options.delete_range)
insert_range = range2dic(options.insert_range)


# convert the -d regions into sequence name, start and end coords
to_delete = {}
if options.delete:
    for s in options.delete:
        # NOTE(review): rsplit(':') with no maxsplit behaves exactly like
        # split(':'), so sequence names containing ':' break the unpack here.
        # Presumably rsplit(':', 1) was intended - confirm before changing.
        id, coords = s.rsplit(':')
        start, end = [int(x)-1 for x in coords.split('-')]   # 1-based input -> 0-based inclusive
        if id not in to_delete:
            to_delete[id] = []
        to_delete[id].append(intervals.Interval(start, end))


# convert the -i insertions into (0-based position, number of bases) per sequence
to_insert = {}
if options.insert:
    for s in options.insert:
        id, pos, bases = s.rsplit(':',2)
        pos = int(pos) - 1
        bases = int(bases)
        if id not in to_insert:
            to_insert[id] = []
        to_insert[id].append((pos, bases))


# deletions and insertions are mutually exclusive (already enforced on the options)
assert len(to_delete) * len(to_insert) == 0

# merge overlapping regions to be deleted
for l in to_delete.values():
    intervals.merge_overlapping_in_list(l)

# sort positions to be inserted
for l in to_insert.values():
    l.sort()

# read in the fasta/q file and print outfile with deleted sequences
seq_reader = sequences.file_reader(options.infile)
f = utils.open_file_write(options.outfile)

for seq in seq_reader:
    if seq.id in to_delete:
        # delete regions for this sequence, but start at the end so the
        # coords don't get messed up after the first deletion
        for inter in reversed(to_delete[seq.id]):
            seq.seq = seq.seq[:inter.start] + seq.seq[inter.end + 1:]
    elif options.delete_range:
        # per-sequence deletion size grows by 'step' each sequence
        seq.seq = seq.seq[:delete_range['pos']] + seq.seq[delete_range['pos'] + delete_range['bases']:]
        delete_range['bases'] += delete_range['step']
    elif seq.id in to_insert:
        # insert from the last position backwards so earlier coords stay valid
        for pos, bases in reversed(to_insert[seq.id]):
            seq.seq = seq.seq[:pos + 1] + ''.join([random.choice('ACGT') for x in range(bases)]) + seq.seq[pos + 1:]
    elif options.insert_range:
        # per-sequence insertion size grows by 'step' each sequence
        seq.seq = seq.seq[:insert_range['pos'] + 1] + ''.join([random.choice('ACGT') for x in range(insert_range['bases'])]) + seq.seq[insert_range['pos'] + 1:]
        insert_range['bases'] += insert_range['step']

    print(seq, file=f)

utils.close(f)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Command-line wrapper around tasks.interleave: writes reads from the two
# input files alternately (first, second, first, ...) into one output file.
parser = argparse.ArgumentParser(
    description="Interleaves two fasta/q files, so that reads are written alternately first/second in output file",
    usage="%(prog)s [options] <fasta/q 1> <fasta/q 2> <outfile>",
)
parser.add_argument("infile_1", help="Name of first input fasta/q file")
parser.add_argument("infile_2", help="Name of second input fasta/q file")
parser.add_argument("outfile", help="Name of output fasta/q file of interleaved reads")
args = parser.parse_args()
tasks.interleave(args.infile_1, args.infile_2, args.outfile)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Command-line wrapper around tasks.make_long_reads: simulates long reads
# (optionally with pacbio-style random insertions) from a reference file.
parser = argparse.ArgumentParser(
    description = 'Simulates long reads from a fasta/q file. Can optionally make insertions into the reads, like pacbio does. If insertions made, coverage calculation is done before the insertions (so total read length may appear longer than expected).',
    usage = '%(prog)s [options] <infile> <outfile>')

parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outfile', help='Name of output fasta file')

parser.add_argument('--method', help='How to sample the read positions and lengths. Choose from 1) "tiling", where reads of fixed length are taken at equal intervals from the reference. 2) "uniform", where reads of fixed length taken at positions sampled uniformly. 3) "gamma", where reads lengths are taken from a gamma distribution, and positions sampled uniformly. [%(default)s]', default='tiling', choices=['tiling', 'uniform', 'gamma'], metavar='tiling|uniform|gamma')
parser.add_argument('--seed', type=int, help='Seed for random number generator [default: use python\'s default]', metavar='INT')
# type=int added: the value is a quality score forwarded to
# tasks.fastaq_to_fake_qual below, which expects a number, not the raw
# command-line string (the sibling fastaq_to_fake_qual script uses type=int).
parser.add_argument('--qual', type=int, help='Write a file of fake quality scores called outfile.qual, all bases same quality [%(default)s]', metavar='INT')
parser.add_argument('--fixed_read_length', type=int, help='Length of each read. Only applies if method is tile or uniform. [%(default)s]', default=20000, metavar='INT')
parser.add_argument('--coverage', type=float, help='Read coverage. Only applies if method is gamma or uniform. [%(default)s]', default=2, metavar='FLOAT')


tiling_group = parser.add_argument_group('tiling options')
tiling_group.add_argument('--tile_step', type=int, help='Distance between start of each read [%(default)s]', default=10000, metavar='INT')

gamma_group = parser.add_argument_group('gamma options')
gamma_group.add_argument('--gamma_shape', type=float, help='Shape parameter of gamma distribution [%(default)s]', default=1.2, metavar='FLOAT')
gamma_group.add_argument('--gamma_scale', type=float, help='Scale parameter of gamma distribution [%(default)s]', default=6000, metavar='FLOAT')
gamma_group.add_argument('--gamma_min_length', type=int, help='Minimum read length [%(default)s]', default=20000, metavar='INT')

ins_group = parser.add_argument_group('options to add insertions to reads')
ins_group.add_argument('--ins_skip', type=int, help='Insert a random base every --skip bases plus or minus --ins_window. If this option is used, must also use --ins_window.', metavar='INT')
ins_group.add_argument('--ins_window', type=int, help='See --ins_skip. If this option is used, must also use --ins_skip.', metavar='INT')


options = parser.parse_args()
tasks.make_long_reads(
    options.infile,
    options.outfile,
    method=options.method,
    fixed_read_length=options.fixed_read_length,
    coverage=options.coverage,
    tile_step=options.tile_step,
    gamma_shape=options.gamma_shape,
    gamma_scale=options.gamma_scale,
    gamma_min_length=options.gamma_min_length,
    seed=options.seed,
    ins_skip=options.ins_skip,
    ins_window=options.ins_window
)

# optionally write a matching fake-quality file next to the reads
if options.qual:
    tasks.fastaq_to_fake_qual(options.outfile, options.outfile + '.qual', q=options.qual)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Command-line wrapper around tasks.make_random_contigs: writes a
# multi-fasta of random same-length sequences.
parser = argparse.ArgumentParser(
    description = 'Makes a multi-fasta file of random sequences, all of the same length. Each base has equal chance of being A,C,G or T',
    usage = '%(prog)s [options] <number of sequences> <length of each sequence> <fasta out>')
parser.add_argument('--first_number', type=int, help='If numbering the sequences, the first sequence gets this number [%(default)s]', default=1)
parser.add_argument('--name_by_letters', action='store_true', help='Name the contigs A,B,C,... will start at A again if you get to Z')
parser.add_argument('--prefix', help='Prefix to add to start of every sequence name', default='')
parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None)
# typo fixed in help text: 'Nunber' -> 'Number'
parser.add_argument('contigs', type=int, help='Number of contigs to make')
parser.add_argument('length', type=int, help='Length of each contig')
parser.add_argument('outfile', help='Name of output file')
options = parser.parse_args()
tasks.make_random_contigs(
    options.contigs,
    options.length,
    options.outfile,
    name_by_letters=options.name_by_letters,
    prefix=options.prefix,
    seed=options.seed,
    first_number=options.first_number
)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Concatenate every sequence of the input into one output sequence,
# keeping the original order.
parser = argparse.ArgumentParser(
    description="Converts multi fasta/q file to single sequence file, preserving original order of sequences",
    usage="%(prog)s <infile> <outfile>",
)
parser.add_argument("infile", help="Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip")
parser.add_argument("outfile", help="Name of output file")
parser.add_argument("-n", "--name", help="Name of sequence in output file [%(default)s]", default="union")
args = parser.parse_args()
tasks.merge_to_one_seq(args.infile, args.outfile, seqname=args.name)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Command-line wrapper around tasks.replace_bases.
parser = argparse.ArgumentParser(
    # typo fixed in description: 'occurences' -> 'occurrences'
    description = 'Replaces all occurrences of one letter with another in a fasta/q file',
    usage = '%(prog)s <fasta/q in> <outfile> <old> <new>')
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outfile', help='Name of output file')
parser.add_argument('old', help='Base to be replaced')
parser.add_argument('new', help='Replace with this letter')
options = parser.parse_args()
tasks.replace_bases(options.infile, options.outfile, options.old, options.new)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Reverse complement every sequence in the input file.
parser = argparse.ArgumentParser(
    description="Reverse complements all sequences in a fasta/q file",
    usage="%(prog)s [options] <fasta/q in> <fasta/q out>",
)
parser.add_argument("infile", help="Name of input fasta/q file")
parser.add_argument("outfile", help="Name of output fasta/q file")
args = parser.parse_args()
tasks.reverse_complement(args.infile, args.outfile)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Break scaffolds into contigs at every gap in the input.
parser = argparse.ArgumentParser(
    description="Creates a file of contigs from a file of scaffolds - i.e. breaks at every gap in the input",
    usage="%(prog)s [options] <infile> <outfile>",
)
parser.add_argument("--number_contigs", action="store_true", help="Use this to enumerate contig names 1,2,3,... within each scaffold")
parser.add_argument("infile", help="Name of input fasta/q file")
parser.add_argument("outfile", help="Name of output contigs file")
args = parser.parse_args()
tasks.scaffolds_to_contigs(args.infile, args.outfile, number_contigs=args.number_contigs)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Report every exact (case-insensitive) hit of a search string, or its
# reverse complement, in each sequence of the input file.
parser = argparse.ArgumentParser(
    # typos fixed in description: 'reverese' -> 'reverse', 'every sequences' -> 'every sequence'
    description = 'Searches for an exact match on a given string and its reverse complement, in every sequence of a fasta/q file. Case insensitive. Guaranteed to find all hits',
    usage = '%(prog)s [options] <fasta/q in> <outfile> <search_string>')
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outfile', help='Name of output file. Tab-delimited output: sequence name, position, strand')
parser.add_argument('search_string', help='String to search for in the sequences')
options = parser.parse_args()
tasks.search_for_seq(options.infile, options.outfile, options.search_string)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Trim perfectly-matching sequences (e.g. adapters/primers) off the start
# of each read in a pair of files; with --revcomp also trim matching
# reverse complements off the ends. Pairs where either read ends up
# shorter than --min_length are dropped.
parser = argparse.ArgumentParser(
    description = 'Trims sequences off the start of all sequences in a pair of fasta/q files, whenever there is a perfect match. Only keeps a read pair if both reads of the pair are at least a minimum length after any trimming',
    # usage typo fixed: '<fastaq/2 in>' -> '<fasta/q 2 in>'
    usage = '%(prog)s [options] <fasta/q 1 in> <fasta/q 2 in> <out 1> <out 2> <trim_seqs>')
parser.add_argument('--min_length', type=int, help='Minimum length of output sequences [%(default)s]', default=50, metavar='INT')
parser.add_argument('--revcomp', action='store_true', help='Trim the end of each sequence if it matches the reverse complement. This option is intended for PCR primer trimming')
parser.add_argument('infile_1', help='Name of forward fasta/q file to be trimmed', metavar='fasta/q 1 in')
parser.add_argument('infile_2', help='Name of reverse fasta/q file to be trimmed', metavar='fasta/q 2 in')
parser.add_argument('outfile_1', help='Name of output forward fasta/q file', metavar='out_1')
parser.add_argument('outfile_2', help='Name of output reverse fasta/q file', metavar='out_2')
parser.add_argument('trim_seqs', help='Name of fasta/q file of sequences to search for at the start of each input sequence', metavar='trim_seqs')
options = parser.parse_args()
tasks.sequence_trim(
    options.infile_1,
    options.infile_2,
    options.outfile_1,
    options.outfile_2,
    options.trim_seqs,
    min_length=options.min_length,
    check_revcomp=options.revcomp
)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Split a multi fasta/q file into chunks capped by total base count
# (and, optionally, by sequence count). Sequences are never split.
parser = argparse.ArgumentParser(
    description="Splits a multi fasta/q file into separate files. Does not split sequences. Puts up to max_bases into each split file. The exception is that any sequence longer than max_bases is put into its own file.",
    usage="%(prog)s [options] <fasta/q in> <prefix of output files> <max_bases>",
)
parser.add_argument("infile", help="Name of input fasta/q file to be split")
parser.add_argument("outprefix", help="Name of output fasta/q file")
parser.add_argument("max_bases", type=int, help="Max bases in each output split file", metavar="max_bases")
parser.add_argument("--max_seqs", type=int, help="Max number of sequences in each output split file [no limit]", metavar="INT")
args = parser.parse_args()
tasks.split_by_base_count(args.infile, args.outprefix, args.max_bases, args.max_seqs)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Remove a trailing /1 or /2 from every read name.
parser = argparse.ArgumentParser(
    description="Strips /1 or /2 off the end of every read name in a fasta/q file",
    usage="%(prog)s [options] <fasta/q in> <fasta/q out>",
)
parser.add_argument("infile", help="Name of input fasta/q file")
parser.add_argument("outfile", help="Name of output fasta/q file")
args = parser.parse_args()
tasks.strip_illumina_suffix(args.infile, args.outfile)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Write a fake quality file: the same fixed score for every base of the input.
parser = argparse.ArgumentParser(
    description="Makes fake quality scores file from a fasta/q file",
    usage="%(prog)s <infile> <outfile>",
)
parser.add_argument("infile", help="Name of input file")
parser.add_argument("outfile", help="Name of output file")
parser.add_argument("-q", "--qual", type=int, help="Quality score to assign to all bases [%(default)s]", default=40)
args = parser.parse_args()
tasks.fastaq_to_fake_qual(args.infile, args.outfile, q=args.qual)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Convert any supported sequence format to FASTA.
parser = argparse.ArgumentParser(
    description = 'Converts sequence file to FASTA format',
    usage = '%(prog)s <infile> <outfile>')
parser.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
parser.add_argument('outfile', help='Name of output file')
parser.add_argument('-l', '--line_length', type=int, help='Number of bases on each sequence line of output file [%(default)s]', default=60)
# typo fixed in help text: 'whitesapce' -> 'whitespace'
parser.add_argument('-s', '--strip_after_whitespace', action='store_true', help='Remove everything after first whitespace in every sequence name')
options = parser.parse_args()
tasks.to_fasta(
    options.infile,
    options.outfile,
    line_length=options.line_length,
    strip_after_first_whitespace=options.strip_after_whitespace
)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Make a Mira assembler xml file describing the input reads.
parser = argparse.ArgumentParser(
    description="Creates an xml file from a fasta/q file of reads, for use with Mira assembler",
    usage="%(prog)s [options] <fastq_in> <xml_out>",
)
parser.add_argument("infile", help="Name of input fasta/q file")
parser.add_argument("xml_out", help="Name of output xml file")
args = parser.parse_args()
tasks.fastaq_to_mira_xml(args.infile, args.xml_out)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Find open reading frames in the input and write them as GFF.
parser = argparse.ArgumentParser(
    description="Writes a GFF file of open reading frames from a fasta/q file",
    usage="%(prog)s [options] <fasta/q in> <gff_out>",
)
parser.add_argument("--min_length", type=int, help="Minimum length of ORF, in nucleotides [%(default)s]", default=300, metavar="INT")
parser.add_argument("infile", help="Name of input fasta/q file")
parser.add_argument("gff_out", help="Name of output gff file")
args = parser.parse_args()
tasks.fastaq_to_orfs_gff(args.infile, args.gff_out, min_length=args.min_length)
#!/usr/bin/env python3

# Simulate perfect (error-free) paired-end fastq reads from a reference
# fasta/q file. Insert sizes are drawn from a normal distribution, read
# positions uniformly; pairs are written interleaved to one fastq file.

import argparse
import random
from math import floor, ceil
from fastaq import sequences, utils
import sys

parser = argparse.ArgumentParser(
    description = 'Makes perfect paired end fastq reads from a fasta/q file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved fastq file.',
    usage = '%(prog)s <fasta/q in> <out.fastq> <mean insert size> <insert std deviation> <mean coverage> <read length>')
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outfile', help='Name of output fastq file')
parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
parser.add_argument('insert_std', type=float, help='Standard devation of insert size', metavar='insert std deviation')
parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
options = parser.parse_args()

random.seed(a=options.seed)

seq_reader = sequences.file_reader(options.infile)
fout = utils.open_file_write(options.outfile)
# global counter so read names are unique across all reference sequences
pair_counter = 1

if options.fragments:
    fout_frags = utils.open_file_write(options.fragments)

for ref in seq_reader:
    # check if current seq is long enough (need room for an insert up to
    # 4 standard deviations above the mean)
    if len(ref) < options.mean_insert + 4 * options.insert_std:
        print('Warning, sequence ', ref.id, ' too short. Skipping it...', file=sys.stderr)
        continue

    # work out how many reads to simulate
    read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)

    # it's possible that we pick the same fragment twice, in which case the
    # reads would get the same name. So remember the frag coords
    used_fragments = {}  # (middle_position, length) => count

    # do the simulation: pick insert size from normal distribution, and
    # position in genome from uniform distribution
    x = 0
    while x < read_pairs:
        # redraw until the insert fits in the reference and covers a read
        isize = int(random.normalvariate(options.mean_insert, options.insert_std))
        while isize > len(ref) or isize < options.readlength:
            isize = int(random.normalvariate(options.mean_insert, options.insert_std))
        middle_pos = random.randint(ceil(0.5 *isize), floor(len(ref) - 0.5 * isize))
        read_start1 = int(middle_pos - ceil(0.5 * isize))
        read_start2 = read_start1 + isize - options.readlength

        # read name encodes origin: ref:pair_number:start1:start2 (1-based)
        readname = ':'.join([ref.id, str(pair_counter), str(read_start1+1), str(read_start2+1)])

        fragment = (middle_pos, isize)
        if fragment in used_fragments:
            used_fragments[fragment] += 1
            readname += '.dup.' + str(used_fragments[fragment])
        else:
            used_fragments[fragment] = 1

        # fake fixed quality string of 'I' for every base
        read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
        read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)

        # NOTE(review): this continue does not increment x, so a reference
        # consisting almost entirely of Ns could loop for a very long time
        # when --no_n is used — confirm this is acceptable
        if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
            continue

        # innies: the second read points back towards the first
        read2.revcomp()

        print(read1, file=fout)
        print(read2, file=fout)

        if options.fragments:
            frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
            print(frag, file=fout_frags)

        pair_counter += 1
        x += 1

utils.close(fout)
if options.fragments:
    utils.close(fout_frags)
#!/usr/bin/env python3

import sys
import argparse
import random
from fastaq import sequences, utils

# Keep each read (or read pair, if a mates file is given) with the given
# percentage probability. Output is interleaved when a mates file is used.
parser = argparse.ArgumentParser(
    # typo fixed: 'Ouptut' -> 'Output'
    description = 'Takes a random subset of reads from a fasta/q file and optionally the corresponding read ' +
        'from a mates file. Output is interleaved if mates file given',
    # typo fixed: 'probablilty' -> 'probability'
    usage = '%(prog)s [options] <fasta/q in> <outfile> <probability of keeping read (pair) in [0,100]>')
parser.add_argument('--mate_file', help='Name of fasta/q mates file')
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('outfile', help='Name of fasta/q output file')
parser.add_argument('probability', type=int, help='Probability of keeping any given read (pair) in [0,100]', metavar='INT')
options = parser.parse_args()

seq_reader = sequences.file_reader(options.infile)
fout = utils.open_file_write(options.outfile)

if options.mate_file:
    mate_seq_reader = sequences.file_reader(options.mate_file)

for seq in seq_reader:
    if options.mate_file:
        # always advance the mates reader, even when this pair is not
        # written, so the two files stay in sync
        try:
            mate_seq = next(mate_seq_reader)
        except StopIteration:
            print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
            sys.exit(1)
    # randint(1, 100) <= p keeps a read with probability exactly p/100.
    # The previous randint(0, 100) spanned 101 values, so it kept reads
    # with probability (p+1)/101 — about 1% of reads even when p == 0.
    if random.randint(1, 100) <= options.probability:
        print(seq, file=fout)
        if options.mate_file:
            print(mate_seq, file=fout)

utils.close(fout)
#!/usr/bin/env python3

# Make a BAM of perfect unpaired reads tiling the whole input genome.
# Relies on an external `samtools` binary being on PATH: once to build a
# header from the reference, and once to compress the SAM we generate.

import argparse
import sys
import os
from fastaq import sequences, utils

parser = argparse.ArgumentParser(
    description = 'Takes a fasta/q file. Makes a BAM file containing perfect (unpaired) reads tiling the whole genome',
    usage = '%(prog)s [options] <fasta/q in> <read length> <read step> <read prefix> <out.bam>',
    epilog = 'Important: assumes that samtools is in your path')
parser.add_argument('infile', help='Name of input fasta/q file')
parser.add_argument('read_length', type=int, help='Length of reads')
parser.add_argument('read_step', type=int, help='Distance between start of each read')
parser.add_argument('read_prefix', help='Prefix of read names')
parser.add_argument('outfile', help='Name of output BAM file')
parser.add_argument('--read_group', help='Add the given read group ID to all reads [%(default)s]' ,default='42')
options = parser.parse_args()

# make a header first - we need to add the @RG line to the default header made by samtools
# (the empty temp file makes `samtools view` emit just the @SQ lines for the reference)
tmp_empty_file = options.outfile + '.tmp.empty'
f = utils.open_file_write(tmp_empty_file)
utils.close(f)
try:
    f = os.popen('samtools view -H -T ' + options.infile + ' ' + tmp_empty_file)
except IOError:
    # NOTE(review): os.popen rarely raises IOError itself — a missing
    # samtools typically surfaces later, when the pipe is read/closed.
    # Confirm this error path is reachable.
    print('Error making tmp header file', file=sys.stderr)
    sys.exit(1)

header_lines = f.readlines()
header_lines.append('@RG\tID:' + options.read_group + '\tSM:FAKE')
f.close()
os.unlink(tmp_empty_file)

seq_reader = sequences.file_reader(options.infile)
# stream SAM text into samtools on stdin; it writes the compressed BAM
try:
    f = os.popen('samtools view -hbS - > ' + options.outfile, 'w')
except IOError:
    print("Error opening for writing BAM file '" + options.outfile + "'", file=sys.stderr)
    sys.exit(1)

# header_lines already end in newlines, except the appended @RG line,
# which gets its newline from print()
print(''.join(header_lines), file=f)

for seq in seq_reader:
    end_range = len(seq)
    # sequences shorter than one read still get exactly one read
    if len(seq) < options.read_length:
        end_range = 1
    for i in range(0, end_range, options.read_step):
        if len(seq) <= options.read_length:
            start = 0
            end = len(seq) - 1
        else:
            start = i
            end = start + options.read_length - 1

        # clamp the final window so the last read ends exactly at the
        # sequence end (it may overlap the previous read)
        if end > len(seq) - 1:
            end = len(seq) - 1
            start = end - options.read_length + 1

        # fake fixed quality of 'I' for every base
        read = sequences.Fastq(options.read_prefix + ':' + seq.id + ':' + str(start + 1) + ':' + str(end + 1), seq[start:end+1], 'I' * (end - start + 1))

        # one SAM line per read: flag 0, mapq 60, full-length match CIGAR,
        # no mate/insert info, plus the RG tag
        print ('\t'.join([read.id,
            '0',
            seq.id,
            str(start + 1),
            '60',
            str(len(read)) + 'M',
            '*',
            '*',
            '*',
            read.seq,
            read.qual,
            'RG:Z:' + options.read_group]), file=f)

        if end == len(seq) - 1:
            break

f.close()
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Deduplicate sequences by name, keeping the longest one for each name.
parser = argparse.ArgumentParser(
    description="Removes duplicate sequences from a fasta/q file, based on their names. If the same name is found more than once, then the longest sequence is kept. Order of sequences is preserved in output",
    usage="%(prog)s <infile> <outfile>",
)
parser.add_argument("infile", help="Name of input fasta/q file")
parser.add_argument("outfile", help="Name of output fasta/q file")
args = parser.parse_args()
tasks.to_unique_by_id(args.infile, args.outfile)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Translate nucleotide sequences to amino acids; output is always FASTA.
parser = argparse.ArgumentParser(
    description="Translates all sequences in a fasta or fastq file. Output is always fasta format",
    usage="%(prog)s <in.fasta/q> <out.fasta>",
)
parser.add_argument("--frame", type=int, choices=[0, 1, 2], help="Frame to translate [%(default)s]", default=0)
parser.add_argument("infile", help="Name of fasta/q file to be translated", metavar="in.fasta/q")
parser.add_argument("outfile", help="Name of output fasta file", metavar="out.fasta")
args = parser.parse_args()
tasks.translate(args.infile, args.outfile, frame=args.frame)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Trim leading/trailing Ns from every sequence; internal gaps untouched.
parser = argparse.ArgumentParser(
    description="Trims any Ns off each sequence in a fasta/q file. Does nothing to gaps in the middle, just trims the ends",
    usage="%(prog)s [options] <fasta/q in> <fasta/q out>",
)
parser.add_argument("infile", help="Name of input fasta/q file")
parser.add_argument("outfile", help="Name of output fasta/q file")
args = parser.parse_args()
tasks.trim_Ns_at_end(args.infile, args.outfile)
#!/usr/bin/env python3

import argparse
from fastaq import tasks

# Trim a fixed number of bases off the start and end of every sequence.
parser = argparse.ArgumentParser(
    description="Trims set number of bases off each sequence in a fasta/q file",
    usage="%(prog)s [options] <fasta/q in> <bases off start> <bases off end> <fasta/q out>",
)
parser.add_argument("infile", help="Name of input fasta/q file")
parser.add_argument("start_trim", type=int, help="Number of bases to trim off start")
parser.add_argument("end_trim", type=int, help="Number of bases to trim off end")
parser.add_argument("outfile", help="Name of output fasta/q file")
args = parser.parse_args()
tasks.trim(args.infile, args.outfile, args.start_trim, args.end_trim)
0 | 0 | import os |
1 | 1 | import glob |
2 | import sys | |
2 | 3 | from setuptools import setup, find_packages |
3 | 4 | |
4 | def read(fname): | |
5 | return open(os.path.join(os.path.dirname(__file__), fname)).read() | |
5 | ||
6 | try: | |
7 | import numpy | |
8 | except ImportError: | |
9 | print("Error! numpy for Python3 not found.\nPlease install it (e.g. apt-get install python3-numpy)", file=sys.stderr) | |
10 | sys.exit(1) | |
6 | 11 | |
7 | 12 | setup( |
8 | name='Fastaq', | |
9 | version='1.6.0', | |
10 | description='Scripts to manipulate FASTA and FASTQ files, plus API for developers', | |
11 | long_description=read('README.md'), | |
13 | name='pyfastaq', | |
14 | version='3.2.0', | |
15 | description='Script to manipulate FASTA and FASTQ files, plus API for developers', | |
12 | 16 | packages = find_packages(), |
13 | 17 | author='Martin Hunt', |
14 | 18 | author_email='mh12@sanger.ac.uk', |
17 | 21 | test_suite='nose.collector', |
18 | 22 | install_requires=['nose >= 1.3'], |
19 | 23 | license='GPLv3', |
24 | classifiers=[ | |
25 | 'Development Status :: 4 - Beta', | |
26 | 'Topic :: Scientific/Engineering :: Bio-Informatics', | |
27 | 'Programming Language :: Python :: 3 :: Only', | |
28 | 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', | |
29 | ], | |
20 | 30 | ) |