New upstream version 1.3
Andreas Tille
5 years ago
0 | 0 | CC=gcc |
1 | 1 | CFLAGS=-g -Wall -O2 -Wno-unused-function |
2 | BINDIR=/usr/local/bin | |
2 | 3 | |
3 | 4 | all:seqtk |
4 | 5 | |
5 | 6 | seqtk:seqtk.c khash.h kseq.h |
6 | 7 | $(CC) $(CFLAGS) seqtk.c -o $@ -lz -lm |
7 | 8 | |
9 | install:all | |
10 | install seqtk $(BINDIR) | |
11 | ||
8 | 12 | clean: |
9 | 13 | rm -fr gmon.out *.o ext/*.o a.out seqtk trimadap *~ *.a *.dSYM session* |
22 | 22 | SOFTWARE. |
23 | 23 | */ |
24 | 24 | |
25 | /* Last Modified: 05MAR2012 */ | |
25 | /* Last Modified: 2017-02-11 */ | |
26 | 26 | |
27 | 27 | #ifndef AC_KSEQ_H |
28 | 28 | #define AC_KSEQ_H |
36 | 36 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) |
37 | 37 | #define KS_SEP_MAX 2 |
38 | 38 | |
39 | #define __KS_TYPE(type_t) \ | |
40 | typedef struct __kstream_t { \ | |
41 | unsigned char *buf; \ | |
42 | int begin, end, is_eof; \ | |
43 | type_t f; \ | |
39 | #define __KS_TYPE(type_t) \ | |
40 | typedef struct __kstream_t { \ | |
41 | unsigned char *buf; \ | |
42 | int begin, end, is_eof; \ | |
43 | type_t f; \ | |
44 | 44 | } kstream_t; |
45 | 45 | |
46 | #define ks_err(ks) ((ks)->end < 0) | |
46 | 47 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) |
47 | 48 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) |
48 | 49 | |
49 | #define __KS_BASIC(type_t, __bufsize) \ | |
50 | static inline kstream_t *ks_init(type_t f) \ | |
51 | { \ | |
52 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ | |
53 | ks->f = f; \ | |
54 | ks->buf = (unsigned char*)malloc(__bufsize); \ | |
55 | return ks; \ | |
56 | } \ | |
57 | static inline void ks_destroy(kstream_t *ks) \ | |
58 | { \ | |
59 | if (ks) { \ | |
60 | free(ks->buf); \ | |
61 | free(ks); \ | |
62 | } \ | |
63 | } | |
64 | ||
65 | #define __KS_GETC(__read, __bufsize) \ | |
66 | static inline int ks_getc(kstream_t *ks) \ | |
67 | { \ | |
68 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ | |
69 | if (ks->begin >= ks->end) { \ | |
70 | ks->begin = 0; \ | |
71 | ks->end = __read(ks->f, ks->buf, __bufsize); \ | |
72 | if (ks->end == 0) { ks->is_eof = 1; return -1;} \ | |
73 | } \ | |
74 | return (int)ks->buf[ks->begin++]; \ | |
50 | #define __KS_BASIC(type_t, __bufsize) \ | |
51 | static inline kstream_t *ks_init(type_t f) \ | |
52 | { \ | |
53 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ | |
54 | ks->f = f; \ | |
55 | ks->buf = (unsigned char*)malloc(__bufsize); \ | |
56 | return ks; \ | |
57 | } \ | |
58 | static inline void ks_destroy(kstream_t *ks) \ | |
59 | { \ | |
60 | if (ks) { \ | |
61 | free(ks->buf); \ | |
62 | free(ks); \ | |
63 | } \ | |
64 | } | |
65 | ||
66 | #define __KS_GETC(__read, __bufsize) \ | |
67 | static inline int ks_getc(kstream_t *ks) \ | |
68 | { \ | |
69 | if (ks_err(ks)) return -3; \ | |
70 | if (ks_eof(ks)) return -1; \ | |
71 | if (ks->begin >= ks->end) { \ | |
72 | ks->begin = 0; \ | |
73 | ks->end = __read(ks->f, ks->buf, __bufsize); \ | |
74 | if (ks->end == 0) { ks->is_eof = 1; return -1; } \ | |
75 | else if (ks->end < 0) { ks->is_eof = 1; return -3; } \ | |
76 | } \ | |
77 | return (int)ks->buf[ks->begin++]; \ | |
75 | 78 | } |
76 | 79 | |
77 | 80 | #ifndef KSTRING_T |
86 | 89 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) |
87 | 90 | #endif |
88 | 91 | |
89 | #define __KS_GETUNTIL(__read, __bufsize) \ | |
92 | #define __KS_GETUNTIL(__read, __bufsize) \ | |
90 | 93 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ |
91 | { \ | |
92 | int gotany = 0; \ | |
93 | if (dret) *dret = 0; \ | |
94 | str->l = append? str->l : 0; \ | |
95 | for (;;) { \ | |
96 | int i; \ | |
97 | if (ks->begin >= ks->end) { \ | |
98 | if (!ks->is_eof) { \ | |
99 | ks->begin = 0; \ | |
100 | ks->end = __read(ks->f, ks->buf, __bufsize); \ | |
101 | if (ks->end == 0) { ks->is_eof = 1; break; } \ | |
102 | } else break; \ | |
103 | } \ | |
94 | { \ | |
95 | int gotany = 0; \ | |
96 | if (dret) *dret = 0; \ | |
97 | str->l = append? str->l : 0; \ | |
98 | for (;;) { \ | |
99 | int i; \ | |
100 | if (ks_err(ks)) return -3; \ | |
101 | if (ks->begin >= ks->end) { \ | |
102 | if (!ks->is_eof) { \ | |
103 | ks->begin = 0; \ | |
104 | ks->end = __read(ks->f, ks->buf, __bufsize); \ | |
105 | if (ks->end == 0) { ks->is_eof = 1; break; } \ | |
106 | if (ks->end == -1) { ks->is_eof = 1; return -3; } \ | |
107 | } else break; \ | |
108 | } \ | |
104 | 109 | if (delimiter == KS_SEP_LINE) { \ |
105 | 110 | for (i = ks->begin; i < ks->end; ++i) \ |
106 | 111 | if (ks->buf[i] == '\n') break; \ |
107 | } else if (delimiter > KS_SEP_MAX) { \ | |
108 | for (i = ks->begin; i < ks->end; ++i) \ | |
109 | if (ks->buf[i] == delimiter) break; \ | |
110 | } else if (delimiter == KS_SEP_SPACE) { \ | |
111 | for (i = ks->begin; i < ks->end; ++i) \ | |
112 | if (isspace(ks->buf[i])) break; \ | |
113 | } else if (delimiter == KS_SEP_TAB) { \ | |
114 | for (i = ks->begin; i < ks->end; ++i) \ | |
112 | } else if (delimiter > KS_SEP_MAX) { \ | |
113 | for (i = ks->begin; i < ks->end; ++i) \ | |
114 | if (ks->buf[i] == delimiter) break; \ | |
115 | } else if (delimiter == KS_SEP_SPACE) { \ | |
116 | for (i = ks->begin; i < ks->end; ++i) \ | |
117 | if (isspace(ks->buf[i])) break; \ | |
118 | } else if (delimiter == KS_SEP_TAB) { \ | |
119 | for (i = ks->begin; i < ks->end; ++i) \ | |
115 | 120 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ |
116 | } else i = 0; /* never come to here! */ \ | |
117 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ | |
118 | str->m = str->l + (i - ks->begin) + 1; \ | |
119 | kroundup32(str->m); \ | |
120 | str->s = (char*)realloc(str->s, str->m); \ | |
121 | } \ | |
122 | gotany = 1; \ | |
121 | } else i = 0; /* never come to here! */ \ | |
122 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ | |
123 | str->m = str->l + (i - ks->begin) + 1; \ | |
124 | kroundup32(str->m); \ | |
125 | str->s = (char*)realloc(str->s, str->m); \ | |
126 | } \ | |
127 | gotany = 1; \ | |
123 | 128 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ |
124 | str->l = str->l + (i - ks->begin); \ | |
125 | ks->begin = i + 1; \ | |
126 | if (i < ks->end) { \ | |
127 | if (dret) *dret = ks->buf[i]; \ | |
128 | break; \ | |
129 | } \ | |
130 | } \ | |
131 | if (!gotany && ks_eof(ks)) return -1; \ | |
132 | if (str->s == 0) { \ | |
133 | str->m = 1; \ | |
134 | str->s = (char*)calloc(1, 1); \ | |
129 | str->l = str->l + (i - ks->begin); \ | |
130 | ks->begin = i + 1; \ | |
131 | if (i < ks->end) { \ | |
132 | if (dret) *dret = ks->buf[i]; \ | |
133 | break; \ | |
134 | } \ | |
135 | } \ | |
136 | if (!gotany && ks_eof(ks)) return -1; \ | |
137 | if (str->s == 0) { \ | |
138 | str->m = 1; \ | |
139 | str->s = (char*)calloc(1, 1); \ | |
135 | 140 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ |
136 | str->s[str->l] = '\0'; \ | |
137 | return str->l; \ | |
141 | str->s[str->l] = '\0'; \ | |
142 | return str->l; \ | |
138 | 143 | } \ |
139 | 144 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ |
140 | 145 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } |
141 | 146 | |
142 | 147 | #define KSTREAM_INIT(type_t, __read, __bufsize) \ |
143 | __KS_TYPE(type_t) \ | |
144 | __KS_BASIC(type_t, __bufsize) \ | |
145 | __KS_GETC(__read, __bufsize) \ | |
148 | __KS_TYPE(type_t) \ | |
149 | __KS_BASIC(type_t, __bufsize) \ | |
150 | __KS_GETC(__read, __bufsize) \ | |
146 | 151 | __KS_GETUNTIL(__read, __bufsize) |
147 | 152 | |
148 | 153 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) |
149 | 154 | |
150 | #define __KSEQ_BASIC(SCOPE, type_t) \ | |
151 | SCOPE kseq_t *kseq_init(type_t fd) \ | |
152 | { \ | |
153 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ | |
154 | s->f = ks_init(fd); \ | |
155 | return s; \ | |
156 | } \ | |
157 | SCOPE void kseq_destroy(kseq_t *ks) \ | |
158 | { \ | |
159 | if (!ks) return; \ | |
155 | #define __KSEQ_BASIC(SCOPE, type_t) \ | |
156 | SCOPE kseq_t *kseq_init(type_t fd) \ | |
157 | { \ | |
158 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ | |
159 | s->f = ks_init(fd); \ | |
160 | return s; \ | |
161 | } \ | |
162 | SCOPE void kseq_destroy(kseq_t *ks) \ | |
163 | { \ | |
164 | if (!ks) return; \ | |
160 | 165 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ |
161 | ks_destroy(ks->f); \ | |
162 | free(ks); \ | |
166 | ks_destroy(ks->f); \ | |
167 | free(ks); \ | |
163 | 168 | } |
164 | 169 | |
165 | 170 | /* Return value: |
166 | 171 | >=0 length of the sequence (normal) |
167 | 172 | -1 end-of-file |
168 | 173 | -2 truncated quality string |
174 | -3 error reading stream | |
169 | 175 | */ |
170 | 176 | #define __KSEQ_READ(SCOPE) \ |
171 | 177 | SCOPE int kseq_read(kseq_t *seq) \ |
172 | 178 | { \ |
173 | int c; \ | |
179 | int c,r; \ | |
174 | 180 | kstream_t *ks = seq->f; \ |
175 | 181 | if (seq->last_char == 0) { /* then jump to the next header line */ \ |
176 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ | |
177 | if (c == -1) return -1; /* end of file */ \ | |
182 | while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ | |
183 | if (c < 0) return c; /* end of file or error*/ \ | |
178 | 184 | seq->last_char = c; \ |
179 | 185 | } /* else: the first header char has been read in the previous call */ \ |
180 | 186 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ |
181 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ | |
187 | if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ | |
182 | 188 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ |
183 | 189 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ |
184 | 190 | seq->seq.m = 256; \ |
185 | 191 | seq->seq.s = (char*)malloc(seq->seq.m); \ |
186 | 192 | } \ |
187 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ | |
193 | while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ | |
188 | 194 | if (c == '\n') continue; /* skip empty lines */ \ |
189 | 195 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ |
190 | 196 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ |
191 | 197 | } \ |
192 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ | |
198 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ | |
193 | 199 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ |
194 | 200 | seq->seq.m = seq->seq.l + 2; \ |
195 | 201 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ |
196 | 202 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ |
197 | 203 | } \ |
198 | 204 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ |
199 | if (c != '+') return seq->seq.l; /* FASTA */ \ | |
205 | seq->is_fastq = (c == '+'); \ | |
206 | if (!seq->is_fastq) return seq->seq.l; /* FASTA */ \ | |
200 | 207 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ |
201 | 208 | seq->qual.m = seq->seq.m; \ |
202 | 209 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ |
203 | 210 | } \ |
204 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ | |
211 | while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ | |
205 | 212 | if (c == -1) return -2; /* error: no quality string */ \ |
206 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ | |
213 | while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \ | |
214 | if (c == -3) return -3; /* stream error */ \ | |
207 | 215 | seq->last_char = 0; /* we have not come to the next header line */ \ |
208 | 216 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ |
209 | 217 | return seq->seq.l; \ |
210 | 218 | } |
211 | 219 | |
212 | #define __KSEQ_TYPE(type_t) \ | |
213 | typedef struct { \ | |
214 | kstring_t name, comment, seq, qual; \ | |
215 | int last_char; \ | |
216 | kstream_t *f; \ | |
220 | #define __KSEQ_TYPE(type_t) \ | |
221 | typedef struct { \ | |
222 | kstring_t name, comment, seq, qual; \ | |
223 | int last_char, is_fastq; \ | |
224 | kstream_t *f; \ | |
217 | 225 | } kseq_t; |
218 | 226 | |
219 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ | |
220 | KSTREAM_INIT(type_t, __read, 16384) \ | |
221 | __KSEQ_TYPE(type_t) \ | |
222 | __KSEQ_BASIC(SCOPE, type_t) \ | |
227 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ | |
228 | KSTREAM_INIT(type_t, __read, 16384) \ | |
229 | __KSEQ_TYPE(type_t) \ | |
230 | __KSEQ_BASIC(SCOPE, type_t) \ | |
223 | 231 | __KSEQ_READ(SCOPE) |
224 | 232 | |
225 | 233 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* | |
26 | 2011-04-10 (0.1.6): | |
27 | ||
28 | * Added sample | |
29 | ||
30 | 2011-03 (0.1.5): | |
31 | ||
32 | * Added shuffle/permutation | |
33 | ||
34 | 2008-11-16 (0.1.4): | |
35 | ||
36 | * Fixed a bug in introsort() that happens in rare cases. | |
37 | ||
38 | 2008-11-05 (0.1.3): | |
39 | ||
40 | * Fixed a bug in introsort() for complex comparisons. | |
41 | ||
42 | * Fixed a bug in mergesort(). The previous version is not stable. | |
43 | ||
44 | 2008-09-15 (0.1.2): | |
45 | ||
46 | * Accelerated introsort. On my Mac (not on another Linux machine), | |
47 | my implementation is as fast as std::sort on random input. | |
48 | ||
49 | * Added combsort and in introsort, switch to combsort if the | |
50 | recursion is too deep. | |
51 | ||
52 | 2008-09-13 (0.1.1): | |
53 | ||
54 | * Added k-small algorithm | |
55 | ||
56 | 2008-09-05 (0.1.0): | |
57 | ||
58 | * Initial version | |
59 | ||
60 | */ | |
61 | ||
62 | #ifndef AC_KSORT_H | |
63 | #define AC_KSORT_H | |
64 | ||
65 | #include <stdlib.h> | |
66 | #include <string.h> | |
67 | ||
68 | typedef struct { | |
69 | void *left, *right; | |
70 | int depth; | |
71 | } ks_isort_stack_t; | |
72 | ||
73 | #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } | |
74 | ||
75 | #define KSORT_INIT(name, type_t, __sort_lt) \ | |
76 | void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ | |
77 | { \ | |
78 | type_t *a2[2], *a, *b; \ | |
79 | int curr, shift; \ | |
80 | \ | |
81 | a2[0] = array; \ | |
82 | a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ | |
83 | for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \ | |
84 | a = a2[curr]; b = a2[1-curr]; \ | |
85 | if (shift == 0) { \ | |
86 | type_t *p = b, *i, *eb = a + n; \ | |
87 | for (i = a; i < eb; i += 2) { \ | |
88 | if (i == eb - 1) *p++ = *i; \ | |
89 | else { \ | |
90 | if (__sort_lt(*(i+1), *i)) { \ | |
91 | *p++ = *(i+1); *p++ = *i; \ | |
92 | } else { \ | |
93 | *p++ = *i; *p++ = *(i+1); \ | |
94 | } \ | |
95 | } \ | |
96 | } \ | |
97 | } else { \ | |
98 | size_t i, step = 1ul<<shift; \ | |
99 | for (i = 0; i < n; i += step<<1) { \ | |
100 | type_t *p, *j, *k, *ea, *eb; \ | |
101 | if (n < i + step) { \ | |
102 | ea = a + n; eb = a; \ | |
103 | } else { \ | |
104 | ea = a + i + step; \ | |
105 | eb = a + (n < i + (step<<1)? n : i + (step<<1)); \ | |
106 | } \ | |
107 | j = a + i; k = a + i + step; p = b + i; \ | |
108 | while (j < ea && k < eb) { \ | |
109 | if (__sort_lt(*k, *j)) *p++ = *k++; \ | |
110 | else *p++ = *j++; \ | |
111 | } \ | |
112 | while (j < ea) *p++ = *j++; \ | |
113 | while (k < eb) *p++ = *k++; \ | |
114 | } \ | |
115 | } \ | |
116 | curr = 1 - curr; \ | |
117 | } \ | |
118 | if (curr == 1) { \ | |
119 | type_t *p = a2[0], *i = a2[1], *eb = array + n; \ | |
120 | for (; p < eb; ++i) *p++ = *i; \ | |
121 | } \ | |
122 | if (temp == 0) free(a2[1]); \ | |
123 | } \ | |
124 | void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \ | |
125 | { \ | |
126 | size_t k = i; \ | |
127 | type_t tmp = l[i]; \ | |
128 | while ((k = (k << 1) + 1) < n) { \ | |
129 | if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \ | |
130 | if (__sort_lt(l[k], tmp)) break; \ | |
131 | l[i] = l[k]; i = k; \ | |
132 | } \ | |
133 | l[i] = tmp; \ | |
134 | } \ | |
135 | void ks_heapmake_##name(size_t lsize, type_t l[]) \ | |
136 | { \ | |
137 | size_t i; \ | |
138 | for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \ | |
139 | ks_heapadjust_##name(i, lsize, l); \ | |
140 | } \ | |
141 | void ks_heapsort_##name(size_t lsize, type_t l[]) \ | |
142 | { \ | |
143 | size_t i; \ | |
144 | for (i = lsize - 1; i > 0; --i) { \ | |
145 | type_t tmp; \ | |
146 | tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ | |
147 | } \ | |
148 | } \ | |
149 | static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ | |
150 | { \ | |
151 | type_t *i, *j, swap_tmp; \ | |
152 | for (i = s + 1; i < t; ++i) \ | |
153 | for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ | |
154 | swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ | |
155 | } \ | |
156 | } \ | |
157 | void ks_combsort_##name(size_t n, type_t a[]) \ | |
158 | { \ | |
159 | const double shrink_factor = 1.2473309501039786540366528676643; \ | |
160 | int do_swap; \ | |
161 | size_t gap = n; \ | |
162 | type_t tmp, *i, *j; \ | |
163 | do { \ | |
164 | if (gap > 2) { \ | |
165 | gap = (size_t)(gap / shrink_factor); \ | |
166 | if (gap == 9 || gap == 10) gap = 11; \ | |
167 | } \ | |
168 | do_swap = 0; \ | |
169 | for (i = a; i < a + n - gap; ++i) { \ | |
170 | j = i + gap; \ | |
171 | if (__sort_lt(*j, *i)) { \ | |
172 | tmp = *i; *i = *j; *j = tmp; \ | |
173 | do_swap = 1; \ | |
174 | } \ | |
175 | } \ | |
176 | } while (do_swap || gap > 2); \ | |
177 | if (gap != 1) __ks_insertsort_##name(a, a + n); \ | |
178 | } \ | |
179 | void ks_introsort_##name(size_t n, type_t a[]) \ | |
180 | { \ | |
181 | int d; \ | |
182 | ks_isort_stack_t *top, *stack; \ | |
183 | type_t rp, swap_tmp; \ | |
184 | type_t *s, *t, *i, *j, *k; \ | |
185 | \ | |
186 | if (n < 1) return; \ | |
187 | else if (n == 2) { \ | |
188 | if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ | |
189 | return; \ | |
190 | } \ | |
191 | for (d = 2; 1ul<<d < n; ++d); \ | |
192 | stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \ | |
193 | top = stack; s = a; t = a + (n-1); d <<= 1; \ | |
194 | while (1) { \ | |
195 | if (s < t) { \ | |
196 | if (--d == 0) { \ | |
197 | ks_combsort_##name(t - s + 1, s); \ | |
198 | t = s; \ | |
199 | continue; \ | |
200 | } \ | |
201 | i = s; j = t; k = i + ((j-i)>>1) + 1; \ | |
202 | if (__sort_lt(*k, *i)) { \ | |
203 | if (__sort_lt(*k, *j)) k = j; \ | |
204 | } else k = __sort_lt(*j, *i)? i : j; \ | |
205 | rp = *k; \ | |
206 | if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ | |
207 | for (;;) { \ | |
208 | do ++i; while (__sort_lt(*i, rp)); \ | |
209 | do --j; while (i <= j && __sort_lt(rp, *j)); \ | |
210 | if (j <= i) break; \ | |
211 | swap_tmp = *i; *i = *j; *j = swap_tmp; \ | |
212 | } \ | |
213 | swap_tmp = *i; *i = *t; *t = swap_tmp; \ | |
214 | if (i-s > t-i) { \ | |
215 | if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ | |
216 | s = t-i > 16? i+1 : t; \ | |
217 | } else { \ | |
218 | if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ | |
219 | t = i-s > 16? i-1 : s; \ | |
220 | } \ | |
221 | } else { \ | |
222 | if (top == stack) { \ | |
223 | free(stack); \ | |
224 | __ks_insertsort_##name(a, a+n); \ | |
225 | return; \ | |
226 | } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ | |
227 | } \ | |
228 | } \ | |
229 | } \ | |
230 | /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ | |
231 | /* 0 <= kk < n */ \ | |
232 | type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ | |
233 | { \ | |
234 | type_t *low, *high, *k, *ll, *hh, *mid; \ | |
235 | low = arr; high = arr + n - 1; k = arr + kk; \ | |
236 | for (;;) { \ | |
237 | if (high <= low) return *k; \ | |
238 | if (high == low + 1) { \ | |
239 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ | |
240 | return *k; \ | |
241 | } \ | |
242 | mid = low + (high - low) / 2; \ | |
243 | if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ | |
244 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ | |
245 | if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ | |
246 | KSORT_SWAP(type_t, *mid, *(low+1)); \ | |
247 | ll = low + 1; hh = high; \ | |
248 | for (;;) { \ | |
249 | do ++ll; while (__sort_lt(*ll, *low)); \ | |
250 | do --hh; while (__sort_lt(*low, *hh)); \ | |
251 | if (hh < ll) break; \ | |
252 | KSORT_SWAP(type_t, *ll, *hh); \ | |
253 | } \ | |
254 | KSORT_SWAP(type_t, *low, *hh); \ | |
255 | if (hh <= k) low = ll; \ | |
256 | if (hh >= k) high = hh - 1; \ | |
257 | } \ | |
258 | } \ | |
259 | void ks_shuffle_##name(size_t n, type_t a[]) \ | |
260 | { \ | |
261 | int i, j; \ | |
262 | for (i = n; i > 1; --i) { \ | |
263 | type_t tmp; \ | |
264 | j = (int)(drand48() * i); \ | |
265 | tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ | |
266 | } \ | |
267 | } \ | |
268 | void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \ | |
269 | { /* reference: http://code.activestate.com/recipes/272884/ */ \ | |
270 | int i, k, pop = n; \ | |
271 | for (i = (int)r, k = 0; i >= 0; --i) { \ | |
272 | double z = 1., x = drand48(); \ | |
273 | type_t tmp; \ | |
274 | while (x < z) z -= z * i / (pop--); \ | |
275 | if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \ | |
276 | ++k; \ | |
277 | } \ | |
278 | } | |
279 | ||
280 | #define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) | |
281 | #define ks_introsort(name, n, a) ks_introsort_##name(n, a) | |
282 | #define ks_combsort(name, n, a) ks_combsort_##name(n, a) | |
283 | #define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) | |
284 | #define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) | |
285 | #define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) | |
286 | #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) | |
287 | #define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) | |
288 | ||
289 | #define ks_lt_generic(a, b) ((a) < (b)) | |
290 | #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) | |
291 | ||
292 | typedef const char *ksstr_t; | |
293 | ||
294 | #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) | |
295 | #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) | |
296 | ||
297 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) by Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | #ifndef KSTRING_H | |
26 | #define KSTRING_H | |
27 | ||
28 | #include <stdlib.h> | |
29 | #include <string.h> | |
30 | #include <stdint.h> | |
31 | ||
32 | #ifndef kroundup32 | |
33 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
34 | #endif | |
35 | ||
36 | #ifndef KSTRING_T | |
37 | #define KSTRING_T kstring_t | |
38 | typedef struct __kstring_t { | |
39 | uint32_t l, m; | |
40 | char *s; | |
41 | } kstring_t; | |
42 | #endif | |
43 | ||
44 | typedef struct { | |
45 | uint64_t tab[4]; | |
46 | int sep, finished; | |
47 | const char *p; // end of the current token | |
48 | } ks_tokaux_t; | |
49 | ||
50 | #ifdef __cplusplus | |
51 | extern "C" { | |
52 | #endif | |
53 | ||
54 | int ksprintf(kstring_t *s, const char *fmt, ...); | |
55 | int ksprintf_fast(kstring_t *s, const char *fmt, ...); | |
56 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); | |
57 | char *kstrstr(const char *str, const char *pat, int **_prep); | |
58 | char *kstrnstr(const char *str, const char *pat, int n, int **_prep); | |
59 | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); | |
60 | ||
61 | /* kstrtok() is similar to strtok_r() except that str is not | |
62 | * modified and both str and sep can be NULL. For efficiency, it is | |
63 | * actually recommended to set both to NULL in the subsequent calls | |
64 | * if sep is not changed. */ | |
65 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); | |
66 | ||
67 | #ifdef __cplusplus | |
68 | } | |
69 | #endif | |
70 | ||
71 | static inline void ks_resize(kstring_t *s, size_t size) | |
72 | { | |
73 | if (s->m < size) { | |
74 | s->m = size; | |
75 | kroundup32(s->m); | |
76 | s->s = (char*)realloc(s->s, s->m); | |
77 | } | |
78 | } | |
79 | ||
80 | static inline int kputsn(const char *p, int l, kstring_t *s) | |
81 | { | |
82 | if (s->l + l + 1 >= s->m) { | |
83 | s->m = s->l + l + 2; | |
84 | kroundup32(s->m); | |
85 | s->s = (char*)realloc(s->s, s->m); | |
86 | } | |
87 | memcpy(s->s + s->l, p, l); | |
88 | s->l += l; | |
89 | s->s[s->l] = 0; | |
90 | return l; | |
91 | } | |
92 | ||
93 | static inline int kputs(const char *p, kstring_t *s) | |
94 | { | |
95 | return kputsn(p, strlen(p), s); | |
96 | } | |
97 | ||
98 | static inline int kputc(int c, kstring_t *s) | |
99 | { | |
100 | if (s->l + 1 >= s->m) { | |
101 | s->m = s->l + 2; | |
102 | kroundup32(s->m); | |
103 | s->s = (char*)realloc(s->s, s->m); | |
104 | } | |
105 | s->s[s->l++] = c; | |
106 | s->s[s->l] = 0; | |
107 | return c; | |
108 | } | |
109 | ||
110 | static inline int kputw(int c, kstring_t *s) | |
111 | { | |
112 | char buf[16]; | |
113 | int l, x; | |
114 | if (c == 0) return kputc('0', s); | |
115 | for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; | |
116 | if (c < 0) buf[l++] = '-'; | |
117 | if (s->l + l + 1 >= s->m) { | |
118 | s->m = s->l + l + 2; | |
119 | kroundup32(s->m); | |
120 | s->s = (char*)realloc(s->s, s->m); | |
121 | } | |
122 | for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; | |
123 | s->s[s->l] = 0; | |
124 | return 0; | |
125 | } | |
126 | ||
127 | static inline int kputuw(unsigned c, kstring_t *s) | |
128 | { | |
129 | char buf[16]; | |
130 | int l, i; | |
131 | unsigned x; | |
132 | if (c == 0) return kputc('0', s); | |
133 | for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; | |
134 | if (s->l + l + 1 >= s->m) { | |
135 | s->m = s->l + l + 2; | |
136 | kroundup32(s->m); | |
137 | s->s = (char*)realloc(s->s, s->m); | |
138 | } | |
139 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; | |
140 | s->s[s->l] = 0; | |
141 | return 0; | |
142 | } | |
143 | ||
144 | static inline int kputl(long c, kstring_t *s) | |
145 | { | |
146 | char buf[32]; | |
147 | long l, x; | |
148 | if (c == 0) return kputc('0', s); | |
149 | for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; | |
150 | if (c < 0) buf[l++] = '-'; | |
151 | if (s->l + l + 1 >= s->m) { | |
152 | s->m = s->l + l + 2; | |
153 | kroundup32(s->m); | |
154 | s->s = (char*)realloc(s->s, s->m); | |
155 | } | |
156 | for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; | |
157 | s->s[s->l] = 0; | |
158 | return 0; | |
159 | } | |
160 | ||
161 | static inline int *ksplit(kstring_t *s, int delimiter, int *n) | |
162 | { | |
163 | int max = 0, *offsets = 0; | |
164 | *n = ksplit_core(s->s, delimiter, &max, &offsets); | |
165 | return offsets; | |
166 | } | |
167 | ||
168 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* | |
26 | An example: | |
27 | ||
28 | #include "kvec.h" | |
29 | int main() { | |
30 | kvec_t(int) array; | |
31 | kv_init(array); | |
32 | kv_push(int, array, 10); // append | |
33 | kv_a(int, array, 20) = 5; // dynamic | |
34 | kv_A(array, 20) = 4; // static | |
35 | kv_destroy(array); | |
36 | return 0; | |
37 | } | |
38 | */ | |
39 | ||
40 | /* | |
41 | 2008-09-22 (0.1.0): | |
42 | ||
43 | * The initial version. | |
44 | ||
45 | */ | |
46 | ||
47 | #ifndef AC_KVEC_H | |
48 | #define AC_KVEC_H | |
49 | ||
50 | #include <stdlib.h> | |
51 | ||
52 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
53 | ||
54 | #define kvec_t(type) struct { size_t n, m; type *a; } | |
55 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) | |
56 | #define kv_destroy(v) free((v).a) | |
57 | #define kv_A(v, i) ((v).a[(i)]) | |
58 | #define kv_pop(v) ((v).a[--(v).n]) | |
59 | #define kv_size(v) ((v).n) | |
60 | #define kv_max(v) ((v).m) | |
61 | ||
62 | #define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) | |
63 | ||
64 | #define kv_copy(type, v1, v0) do { \ | |
65 | if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ | |
66 | (v1).n = (v0).n; \ | |
67 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ | |
68 | } while (0) \ | |
69 | ||
70 | #define kv_push(type, v, x) do { \ | |
71 | if ((v).n == (v).m) { \ | |
72 | (v).m = (v).m? (v).m<<1 : 2; \ | |
73 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ | |
74 | } \ | |
75 | (v).a[(v).n++] = (x); \ | |
76 | } while (0) | |
77 | ||
78 | #define kv_pushp(type, v) (((v).n == (v).m)? \ | |
79 | ((v).m = ((v).m? (v).m<<1 : 2), \ | |
80 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ | |
81 | : 0), ((v).a + ((v).n++)) | |
82 | ||
83 | #define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ | |
84 | ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ | |
85 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ | |
86 | : (v).n <= (size_t)(i)? (v).n = (i) + 1 \ | |
87 | : 0), (v).a[(i)]) | |
88 | ||
89 | #endif |
0 | 0 | /* The MIT License |
1 | 1 | |
2 | Copyright (c) 20082-2012 by Heng Li <lh3@me.com> | |
2 | Copyright (c) 2008-2016 Broad Institute | |
3 | 3 | |
4 | 4 | Permission is hereby granted, free of charge, to any person obtaining |
5 | 5 | a copy of this software and associated documentation files (the |
56 | 56 | int dret; |
57 | 57 | kstring_t *str; |
58 | 58 | // read the list |
59 | fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); | |
60 | if (fp == 0) return 0; | |
61 | ks = ks_init(fp); | |
59 | 62 | str = calloc(1, sizeof(kstring_t)); |
60 | fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); | |
61 | ks = ks_init(fp); | |
62 | 63 | while (ks_getuntil(ks, 0, str, &dret) >= 0) { |
63 | 64 | int beg = -1, end = -1; |
64 | 65 | reglist_t *p; |
297 | 298 | fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); |
298 | 299 | fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); |
299 | 300 | fprintf(stderr, " -L INT retain at most INT bp from the 5'-end (non-zero to disable -q/-l) [0]\n"); |
301 | fprintf(stderr, " -Q force FASTQ output\n"); | |
300 | 302 | fprintf(stderr, "\n"); |
301 | 303 | return 1; |
302 | 304 | } |
303 | 305 | fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); |
306 | if (fp == 0) { | |
307 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
308 | return 1; | |
309 | } | |
304 | 310 | seq = kseq_init(fp); |
305 | 311 | for (i = 0; i < 128; ++i) |
306 | 312 | q_int2real[i] = pow(10., -(i - 33) / 10.); |
335 | 341 | end = beg + min_len; |
336 | 342 | } |
337 | 343 | } else beg = 0, end = seq->seq.l; |
338 | putchar(seq->qual.l? '@' : '>'); fputs(seq->name.s, stdout); | |
344 | putchar(seq->is_fastq? '@' : '>'); fputs(seq->name.s, stdout); | |
339 | 345 | if (seq->comment.l) { |
340 | 346 | putchar(' '); puts(seq->comment.s); |
341 | 347 | } else putchar('\n'); |
342 | 348 | fwrite(seq->seq.s + beg, 1, end - beg, stdout); putchar('\n'); |
343 | if (seq->qual.l) { | |
349 | if (seq->is_fastq) { | |
344 | 350 | puts("+"); |
345 | 351 | fwrite(seq->qual.s + beg, 1, end - beg, stdout); putchar('\n'); |
346 | 352 | } |
371 | 377 | return 1; |
372 | 378 | } |
373 | 379 | fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); |
380 | if (fp == 0) { | |
381 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
382 | return 1; | |
383 | } | |
374 | 384 | seq = kseq_init(fp); |
375 | 385 | dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); |
376 | 386 | while ((l = kseq_read(seq)) >= 0) { |
436 | 446 | return 1; |
437 | 447 | } |
438 | 448 | fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); |
449 | if (fp == 0) { | |
450 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
451 | return 1; | |
452 | } | |
439 | 453 | seq = kseq_init(fp); |
440 | 454 | while ((l = kseq_read(seq)) >= 0) { |
441 | 455 | int i; |
488 | 502 | } |
489 | 503 | } |
490 | 504 | fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); |
505 | if (fp == 0) { | |
506 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
507 | return 1; | |
508 | } | |
491 | 509 | seq = kseq_init(fp); |
492 | 510 | win_step = win_size / n_start; |
493 | 511 | buf = calloc(win_size, 1); |
546 | 564 | return 1; |
547 | 565 | } |
548 | 566 | h = stk_reg_read(argv[optind+1]); |
567 | if (h == 0) { | |
568 | fprintf(stderr, "[E::%s] failed to read the list of regions in file '%s'\n", __func__, argv[optind+1]); | |
569 | return 1; | |
570 | } | |
549 | 571 | // subseq |
550 | 572 | fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); |
573 | if (fp == 0) { | |
574 | fprintf(stderr, "[E::%s] failed to open the input file/stream\n", __func__); | |
575 | return 1; | |
576 | } | |
551 | 577 | seq = kseq_init(fp); |
552 | 578 | while ((l = kseq_read(seq)) >= 0) { |
553 | 579 | reglist_t *p; |
624 | 650 | for (i = 0; i < 2; ++i) { |
625 | 651 | fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); |
626 | 652 | seq[i] = kseq_init(fp[i]); |
653 | } | |
654 | if (fp[0] == 0 || fp[1] == 0) { | |
655 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
656 | return 1; | |
627 | 657 | } |
628 | 658 | cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0; |
629 | 659 | srand48(11); |
706 | 736 | fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); |
707 | 737 | seq[i] = kseq_init(fp[i]); |
708 | 738 | } |
739 | if (fp[0] == 0 || fp[1] == 0) { | |
740 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
741 | return 1; | |
742 | } | |
709 | 743 | while (kseq_read(seq[0]) >= 0) { |
710 | 744 | int min_l, c[2]; |
711 | 745 | kseq_read(seq[1]); |
745 | 779 | // read the list |
746 | 780 | str = calloc(1, sizeof(kstring_t)); |
747 | 781 | fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); |
782 | if (fp == 0) { | |
783 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
784 | return 1; | |
785 | } | |
748 | 786 | ks = ks_init(fp); |
749 | 787 | while (ks_getuntil(ks, 0, str, &dret) >= 0) { |
750 | 788 | char *s = strdup(str->s); |
774 | 812 | free(str->s); free(str); |
775 | 813 | // mutfa |
776 | 814 | fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); |
815 | if (fp == 0) { | |
816 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
817 | return 1; | |
818 | } | |
777 | 819 | seq = kseq_init(fp); |
778 | 820 | while ((l = kseq_read(seq)) >= 0) { |
779 | 821 | reglist_t *p; |
816 | 858 | return 1; |
817 | 859 | } |
818 | 860 | fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); |
861 | if (fp == 0) { | |
862 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
863 | return 1; | |
864 | } | |
819 | 865 | seq = kseq_init(fp); |
820 | 866 | while ((l = kseq_read(seq)) >= 0) { |
821 | 867 | for (i = 0; i < l; ++i) { |
901 | 947 | return 1; |
902 | 948 | } |
903 | 949 | fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); |
950 | if (fp == 0) { | |
951 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
952 | return 1; | |
953 | } | |
904 | 954 | ks = kseq_init(fp); |
905 | 955 | while ((l = kseq_read(ks)) >= 0) { |
906 | 956 | int k = 0, begin = 0, end = 0; |
929 | 979 | } |
930 | 980 | if (argc == optind + 2) min_len = atoi(argv[optind+1]); |
931 | 981 | fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); |
982 | if (fp == 0) { | |
983 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
984 | return 1; | |
985 | } | |
932 | 986 | ks = kseq_init(fp); |
933 | 987 | while (kseq_read(ks) >= 0) { |
934 | 988 | c = ks->seq.s[0]; l = 1; beg = 0; |
988 | 1042 | return 1; |
989 | 1043 | } |
990 | 1044 | frac = atof(argv[optind+1]); |
991 | if (frac > 1.) num = (uint64_t)(frac + .499), frac = 0.; | |
1045 | if (frac >= 1.0) num = (uint64_t)(frac + .499), frac = 0.; | |
992 | 1046 | else if (twopass) { |
993 | 1047 | fprintf(stderr, "[W::%s] when sampling a fraction, option -2 is ignored.", __func__); |
994 | 1048 | twopass = 0; |
1005 | 1059 | } |
1006 | 1060 | |
1007 | 1061 | fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); |
1062 | if (fp == 0) { | |
1063 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1064 | return 1; | |
1065 | } | |
1008 | 1066 | seq = kseq_init(fp); |
1009 | 1067 | n_seqs = 0; |
1010 | 1068 | while (kseq_read(seq) >= 0) { |
1036 | 1094 | buf = malloc(num * 8); |
1037 | 1095 | for (i = 0; i < num; ++i) buf[i] = UINT64_MAX; |
1038 | 1096 | fp = gzopen(argv[optind], "r"); |
1097 | if (fp == 0) { | |
1098 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1099 | return 1; | |
1100 | } | |
1039 | 1101 | seq = kseq_init(fp); |
1040 | 1102 | n_seqs = 0; |
1041 | 1103 | while (kseq_read(seq) >= 0) { |
1116 | 1178 | { |
1117 | 1179 | gzFile fp; |
1118 | 1180 | kseq_t *seq; |
1119 | int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_q = 255; | |
1181 | int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_q = 255, fake_qual = -1; | |
1120 | 1182 | unsigned i, line_len = 0; |
1121 | 1183 | int64_t n_seqs = 0; |
1122 | 1184 | double frac = 1.; |
1123 | 1185 | khash_t(reg) *h = 0; |
1124 | 1186 | krand_t *kr = 0; |
1125 | 1187 | |
1126 | while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:S")) >= 0) { | |
1188 | while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:SF:")) >= 0) { | |
1127 | 1189 | switch (c) { |
1128 | 1190 | case 'a': |
1129 | 1191 | case 'A': flag |= 1; break; |
1145 | 1207 | case 'L': min_len = atoi(optarg); break; |
1146 | 1208 | case 's': kr = kr_srand(atol(optarg)); break; |
1147 | 1209 | case 'f': frac = atof(optarg); break; |
1210 | case 'F': fake_qual = *optarg; break; | |
1148 | 1211 | } |
1149 | 1212 | } |
1150 | 1213 | if (kr == 0) kr = kr_srand(11); |
1160 | 1223 | fprintf(stderr, " -f FLOAT sample FLOAT fraction of sequences [1]\n"); |
1161 | 1224 | fprintf(stderr, " -M FILE mask regions in BED or name list FILE [null]\n"); |
1162 | 1225 | fprintf(stderr, " -L INT drop sequences with length shorter than INT [0]\n"); |
1226 | fprintf(stderr, " -F CHAR fake FASTQ quality []\n"); | |
1163 | 1227 | fprintf(stderr, " -c mask complement region (effective with -M)\n"); |
1164 | 1228 | fprintf(stderr, " -r reverse complement\n"); |
1165 | 1229 | fprintf(stderr, " -A force FASTA output (discard quality)\n"); |
1176 | 1240 | } |
1177 | 1241 | if (line_len == 0) line_len = UINT_MAX; |
1178 | 1242 | fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); |
1243 | if (fp == 0) { | |
1244 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1245 | return 1; | |
1246 | } | |
1179 | 1247 | seq = kseq_init(fp); |
1180 | 1248 | qual_thres += qual_shift; |
1181 | 1249 | while (kseq_read(seq) >= 0) { |
1214 | 1282 | for (i = 0; i < seq->seq.l; ++i) |
1215 | 1283 | seq->seq.s[i] = toupper(seq->seq.s[i]); |
1216 | 1284 | if (flag & 1) seq->qual.l = 0; // option -a: fastq -> fasta |
1285 | else if (fake_qual >= 33 && fake_qual <= 127) { | |
1286 | if (seq->qual.m < seq->seq.m) { | |
1287 | seq->qual.m = seq->seq.m; | |
1288 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); | |
1289 | } | |
1290 | seq->qual.l = seq->seq.l; | |
1291 | memset(seq->qual.s, fake_qual, seq->qual.l); | |
1292 | seq->qual.s[seq->qual.l] = 0; | |
1293 | } | |
1217 | 1294 | if (flag & 2) seq->comment.l = 0; // option -C: drop fasta/q comments |
1218 | 1295 | if (h) stk_mask(seq, h, flag&8, mask_chr); // masking |
1219 | 1296 | if (flag & 4) { // option -r: reverse complement |
1273 | 1350 | q = (1.0f - frac) / frac; |
1274 | 1351 | |
1275 | 1352 | fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); |
1353 | if (fp == 0) { | |
1354 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1355 | return 1; | |
1356 | } | |
1276 | 1357 | seq = kseq_init(fp); |
1277 | 1358 | while (kseq_read(seq) >= 0) { |
1278 | 1359 | int i, start = 0, max_i = 0, n_hits = 0, start_hits = 0, max_hits = 0; |
1316 | 1397 | } |
1317 | 1398 | fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); |
1318 | 1399 | fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); |
1400 | if (fp1 == 0 || fp2 == 0) { | |
1401 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1402 | return 1; | |
1403 | } | |
1319 | 1404 | seq[0] = kseq_init(fp1); |
1320 | 1405 | seq[1] = kseq_init(fp2); |
1321 | 1406 | while (kseq_read(seq[0]) >= 0) { |
1343 | 1428 | return 1; |
1344 | 1429 | } |
1345 | 1430 | fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); |
1431 | if (fp == 0) { | |
1432 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1433 | return 1; | |
1434 | } | |
1346 | 1435 | seq = kseq_init(fp); |
1347 | 1436 | |
1348 | 1437 | memset(&last, 0, sizeof(kseq_t)); |
1380 | 1469 | return 1; |
1381 | 1470 | } |
1382 | 1471 | fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); |
1472 | if (fp == 0) { | |
1473 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1474 | return 1; | |
1475 | } | |
1383 | 1476 | seq = kseq_init(fp); |
1384 | 1477 | if (argc > 2) prefix = argv[2]; |
1385 | 1478 | |
1443 | 1536 | } |
1444 | 1537 | |
1445 | 1538 | fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r"); |
1539 | if (fp == 0) { | |
1540 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1541 | return 1; | |
1542 | } | |
1446 | 1543 | ks = kseq_init(fp); |
1447 | 1544 | while (kseq_read(ks) >= 0) { |
1448 | 1545 | int k, x[2], cnt[2], cnt_nei[2], which; |
1516 | 1613 | return 1; |
1517 | 1614 | } |
1518 | 1615 | fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); |
1616 | if (fp == 0) { | |
1617 | fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); | |
1618 | return 1; | |
1619 | } | |
1519 | 1620 | seq = kseq_init(fp); |
1520 | 1621 | for (k = 0; k <= 93; ++k) |
1521 | 1622 | perr[k] = pow(10., -.1 * k); |
1573 | 1674 | { |
1574 | 1675 | fprintf(stderr, "\n"); |
1575 | 1676 | fprintf(stderr, "Usage: seqtk <command> <arguments>\n"); |
1576 | fprintf(stderr, "Version: 1.2-r94\n\n"); | |
1677 | fprintf(stderr, "Version: 1.3-r106\n\n"); | |
1577 | 1678 | fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); |
1578 | 1679 | fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); |
1579 | 1680 | fprintf(stderr, " sample subsample sequences\n"); |
1598 | 1699 | int main(int argc, char *argv[]) |
1599 | 1700 | { |
1600 | 1701 | if (argc == 1) return usage(); |
1601 | if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); | |
1602 | else if (strcmp(argv[1], "fqchk") == 0) stk_fqchk(argc-1, argv+1); | |
1603 | else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); | |
1604 | else if (strcmp(argv[1], "gc") == 0) stk_gc(argc-1, argv+1); | |
1605 | else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); | |
1606 | else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); | |
1607 | else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); | |
1608 | else if (strcmp(argv[1], "mergepe") == 0) stk_mergepe(argc-1, argv+1); | |
1609 | else if (strcmp(argv[1], "dropse") == 0) stk_dropse(argc-1, argv+1); | |
1610 | else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); | |
1611 | else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); | |
1612 | else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); | |
1613 | else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); | |
1614 | else if (strcmp(argv[1], "trimfq") == 0) stk_trimfq(argc-1, argv+1); | |
1615 | else if (strcmp(argv[1], "hrun") == 0) stk_hrun(argc-1, argv+1); | |
1616 | else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1); | |
1617 | else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1); | |
1618 | else if (strcmp(argv[1], "kfreq") == 0) stk_kfreq(argc-1, argv+1); | |
1619 | else if (strcmp(argv[1], "rename") == 0) stk_rename(argc-1, argv+1); | |
1702 | if (strcmp(argv[1], "comp") == 0) return stk_comp(argc-1, argv+1); | |
1703 | else if (strcmp(argv[1], "fqchk") == 0) return stk_fqchk(argc-1, argv+1); | |
1704 | else if (strcmp(argv[1], "hety") == 0) return stk_hety(argc-1, argv+1); | |
1705 | else if (strcmp(argv[1], "gc") == 0) return stk_gc(argc-1, argv+1); | |
1706 | else if (strcmp(argv[1], "subseq") == 0) return stk_subseq(argc-1, argv+1); | |
1707 | else if (strcmp(argv[1], "mutfa") == 0) return stk_mutfa(argc-1, argv+1); | |
1708 | else if (strcmp(argv[1], "mergefa") == 0) return stk_mergefa(argc-1, argv+1); | |
1709 | else if (strcmp(argv[1], "mergepe") == 0) return stk_mergepe(argc-1, argv+1); | |
1710 | else if (strcmp(argv[1], "dropse") == 0) return stk_dropse(argc-1, argv+1); | |
1711 | else if (strcmp(argv[1], "randbase") == 0) return stk_randbase(argc-1, argv+1); | |
1712 | else if (strcmp(argv[1], "cutN") == 0) return stk_cutN(argc-1, argv+1); | |
1713 | else if (strcmp(argv[1], "listhet") == 0) return stk_listhet(argc-1, argv+1); | |
1714 | else if (strcmp(argv[1], "famask") == 0) return stk_famask(argc-1, argv+1); | |
1715 | else if (strcmp(argv[1], "trimfq") == 0) return stk_trimfq(argc-1, argv+1); | |
1716 | else if (strcmp(argv[1], "hrun") == 0) return stk_hrun(argc-1, argv+1); | |
1717 | else if (strcmp(argv[1], "sample") == 0) return stk_sample(argc-1, argv+1); | |
1718 | else if (strcmp(argv[1], "seq") == 0) return stk_seq(argc-1, argv+1); | |
1719 | else if (strcmp(argv[1], "kfreq") == 0) return stk_kfreq(argc-1, argv+1); | |
1720 | else if (strcmp(argv[1], "rename") == 0) return stk_rename(argc-1, argv+1); | |
1620 | 1721 | else { |
1621 | 1722 | fprintf(stderr, "[main] unrecognized command '%s'. Abort!\n", argv[1]); |
1622 | 1723 | return 1; |
1623 | 1724 | } |
1624 | return 0; | |
1625 | } | |
1725 | } |