Import upstream version 2.14.5+git20200607.58b996f, md5 4ee5b48326a0fdf2e716f6c3669da271
Debian Janitor
3 years ago
182 | 182 | print('You can use them with ARIBA like this:') |
183 | 183 | print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n') |
184 | 184 | print('If you use this downloaded data, please cite:') |
185 | print('"The Comprehensive Antibiotic Resistance Database", McArthur et al 2013, PMID: 23650175') | |
185 | print('"CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database", Alcock et al 2020, PMID: 31665441') | |
186 | 186 | print('and in your methods say that version', self.version, 'of the database was used') |
187 | 187 | |
188 | 188 | |
658 | 658 | print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n') |
659 | 659 | |
660 | 660 | else: |
661 | print(f"Nothing to do. Exiting.") | |
661 | print(f"Nothing to do. Exiting.") | |
662 | 662 | def run(self, outprefix): |
663 | 663 | exec('self._get_from_' + self.ref_db + '(outprefix)') |
0 | The MIT License | |
1 | ||
2 | Copyright (c) 2016 Broad Institute | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. |
0 | ## Getting Started | |
1 | ```sh | |
2 | git clone https://github.com/lh3/fermi-lite | |
3 | cd fermi-lite && make | |
4 | ./fml-asm test/MT-simu.fq.gz > MT.fq | |
5 | # to compile your program: | |
6 | gcc -Wall -O2 prog.c -o prog -L/path/to/fermi-lite -lfml -lz -lm -lpthread | |
7 | ``` | |
8 | ||
9 | ## Introduction | |
10 | ||
11 | Fermi-lite is a standalone C library as well as a command-line tool for | |
12 | assembling Illumina short reads in regions from 100bp to 10 million bp in size. | |
13 | It is largely a light-weight in-memory version of [fermikit][fk] without | |
14 | generating any intermediate files. It inherits the performance, the relatively | |
15 | small memory footprint and the features of fermikit. In particular, fermi-lite | |
16 | is able to retain heterozygous events and thus can be used to assemble diploid | |
17 | regions for the purpose of variant calling. It is one of the limited choices | |
18 | for local re-assembly and arguably the easiest to interface. | |
19 | ||
20 | ## Usage | |
21 | ||
22 | For now, see [example.c][example] for the basic use of the library. Here is a | |
23 | sketch of the example: | |
24 | ```cpp | |
25 | #include <stdio.h> // for printf() | |
26 | #include "fml.h" // only one header file required | |
27 | ||
28 | int main(int argc, char *argv[]) | |
29 | { | |
30 | int i, n_seqs, n_utgs; | |
31 | bseq1_t *seqs; // array of input sequences | |
32 | fml_utg_t *utgs; // array of output unitigs | |
33 | fml_opt_t opt; | |
34 | if (argc == 1) return 1; // do nothing if there is no input file | |
35 | seqs = bseq_read(argv[1], &n_seqs); // or fill the array with callers' functions | |
36 | fml_opt_init(&opt); // initialize parameters | |
37 | utgs = fml_assemble(&opt, n_seqs, seqs, &n_utgs); // assemble! | |
38 | for (i = 0; i < n_utgs; ++i) // output in fasta | |
39 | printf(">%d\n%s\n", i+1, utgs[i].seq); | |
40 | fml_utg_destroy(n_utgs, utgs); // deallocate unitigs | |
41 | return 0; | |
42 | } | |
43 | ``` | |
44 | The direct assembly output is in fact a graph. You may have a look at the | |
45 | [header file][header] for details. | |
46 | ||
47 | ## Overview of the Assembly Algorithm | |
48 | ||
49 | Fermi-lite is an overlap-based assembler. Given a set of input reads, it counts | |
50 | *k*-mers, estimates the *k*-mer coverage, sets a threshold on *k*-mer | |
51 | occurrences to determine solid *k*-mers and then uses them to correct sequencing | |
52 | errors ([Li, 2015][bfc-paper]). After error correction, fermi-lite trims a read | |
53 | at an *l*-mer unique to the read. It then constructs an FM-index for trimmed | |
54 | reads ([Li, 2014][rb2-paper]) and builds a transitively reduced overlap graph from the | |
55 | FM-index ([Simpson and Durbin, 2010][sga-paper]; [Li, 2012][fm1-paper]), | |
56 | requiring at least *l*-bp overlaps. In this graph, fermi-lite trims tips and | |
57 | pops bubbles caused by uncorrected errors. If a sequence in the graph has | |
58 | multiple overlaps, fermi-lite discards overlaps significantly shorter than the | |
59 | longest overlap -- this is a technique applied to overlap graphs only. The graph | |
60 | after these procedures is the final output. Sequences in this graph are unitigs. | |
61 | ||
62 | ## Limitations | |
63 | ||
64 | 1. Fermi-lite can efficiently assemble bacterial genomes. However, it has not | |
65 | been carefully tuned for this type of assembly. While on a few GAGE-B data | |
66 | sets fermi-lite appears to work well, it may not compete with recent | |
67 | mainstream assemblers in general. | |
68 | ||
69 | 2. Fermi-lite does not work with genomes more than tens of megabases as a | |
70 | whole. It would take too much memory to stage all data in memory. For large | |
71 | genomes, please use [fermikit][fk] instead. | |
72 | ||
73 | 3. This is the first iteration of fermi-lite. It is still immature. In | |
74 | particular, I hope fermi-lite can be smart enough to automatically figure | |
75 | out various parameters based on input, which is very challenging given the | |
76 | high variability of input data. | |
77 | ||
78 | [sga-paper]: http://www.ncbi.nlm.nih.gov/pubmed/20529929 | |
79 | [bfc-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25953801 | |
80 | [rb2-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25107872 | |
81 | [fm1-paper]: http://www.ncbi.nlm.nih.gov/pubmed/22569178 | |
82 | [bfc]: http://github.com/lh3/bfc | |
83 | [rb2]: http://github.com/lh3/ropebwt2 | |
84 | [fm2]: http://github.com/lh3/fermi2 | |
85 | [fk]: http://github.com/lh3/fermikit | |
86 | [example]: https://github.com/lh3/fermi-lite/blob/master/example.c | |
87 | [header]: https://github.com/lh3/fermi-lite/blob/master/fml.h |
0 | #include <stdlib.h> | |
1 | #include <string.h> | |
2 | #include <assert.h> | |
3 | #include <limits.h> | |
4 | #include <stdio.h> | |
5 | #include "htab.h" | |
6 | #include "kmer.h" | |
7 | #include "internal.h" | |
8 | #include "fml.h" | |
9 | ||
10 | /******************* | |
11 | *** BFC options *** | |
12 | *******************/ | |
13 | ||
/* Parameters of the k-mer counting and error-correction steps in this file. */
typedef struct {
	int n_threads;      // number of worker threads handed to kt_for()
	int q;              // base-quality threshold (compared against qual[i] - 33)
	int k;              // k-mer length
	int l_pre;          // forwarded to bfc_ch_init()
	int min_cov; // a k-mer is considered solid if the count is no less than this

	int max_end_ext;    // how far past the end of the region a path may be extended
	int win_multi_ec;   // window used to reject clustered corrections
	float min_trim_frac; // minimum retained fraction of a read when filtering unique k-mers

	// these ec options cannot be changed on the command line
	int w_ec;           // penalty weight: correction of a base
	int w_ec_high;      // penalty weight: correction of a high-quality base
	int w_absent;       // penalty weight: k-mer count below min_cov
	int w_absent_high;  // penalty weight: high-quality k-mer count below min_cov
	int max_path_diff;  // stop searching once a path is this much worse than the best one
	int max_heap;       // heap size above which only the cheapest extension is kept
} bfc_opt_t;

/*
 * Fill *opt with the built-in defaults. Every field is overwritten; k and
 * l_pre are set to -1 and are expected to be chosen by the caller.
 */
void bfc_opt_init(bfc_opt_t *opt)
{
	static const bfc_opt_t defaults = {
		.n_threads = 1,
		.q = 20,
		.k = -1,
		.l_pre = -1,

		.min_cov = 4, // in BFC, this defaults to 3 because it has Bloom pre-filter
		.max_end_ext = 5,
		.win_multi_ec = 10,
		.min_trim_frac = .8,

		.w_ec = 1,
		.w_ec_high = 7,
		.w_absent = 3,
		.w_absent_high = 1,
		.max_path_diff = 15,
		.max_heap = 100,
	};
	*opt = defaults;
}
47 | ||
48 | /********************** | |
49 | *** K-mer counting *** | |
50 | **********************/ | |
51 | ||
52 | #define CNT_BUF_SIZE 256 | |
53 | ||
/*
 * One k-mer insertion that could not be committed immediately; it is parked
 * in a per-thread buffer and retried later (cache to reduce locking).
 */
typedef struct { // cache to reduce locking
	uint64_t y[2]; // hashed k-mer words produced by bfc_kmer_hash(), passed to bfc_ch_insert()
	int is_high;   // flag forwarded unchanged to bfc_ch_insert()
} insbuf_t;
58 | ||
59 | typedef struct { | |
60 | int k, q; | |
61 | int n_seqs; | |
62 | const bseq1_t *seqs; | |
63 | bfc_ch_t *ch; | |
64 | int *n_buf; | |
65 | insbuf_t **buf; | |
66 | } cnt_step_t; | |
67 | ||
// All-zero k-mer; assigned to a working k-mer whenever it has to be reset
// (start of a read or after an ambiguous base).
bfc_kmer_t bfc_kmer_null = {{0,0,0,0}};
69 | ||
70 | static int bfc_kmer_bufclear(cnt_step_t *cs, int forced, int tid) | |
71 | { | |
72 | int i, k, r; | |
73 | if (cs->ch == 0) return 0; | |
74 | for (i = k = 0; i < cs->n_buf[tid]; ++i) { | |
75 | r = bfc_ch_insert(cs->ch, cs->buf[tid][i].y, cs->buf[tid][i].is_high, forced); | |
76 | if (r < 0) cs->buf[tid][k++] = cs->buf[tid][i]; | |
77 | } | |
78 | cs->n_buf[tid] = k; | |
79 | return k; | |
80 | } | |
81 | ||
82 | static void bfc_kmer_insert(cnt_step_t *cs, const bfc_kmer_t *x, int is_high, int tid) | |
83 | { | |
84 | int k = cs->k; | |
85 | uint64_t y[2], hash; | |
86 | hash = bfc_kmer_hash(k, x->x, y); | |
87 | if (bfc_ch_insert(cs->ch, y, is_high, 0) < 0) { | |
88 | insbuf_t *p; | |
89 | if (bfc_kmer_bufclear(cs, 0, tid) == CNT_BUF_SIZE) | |
90 | bfc_kmer_bufclear(cs, 1, tid); | |
91 | p = &cs->buf[tid][cs->n_buf[tid]++]; | |
92 | p->y[0] = y[0], p->y[1] = y[1], p->is_high = is_high; | |
93 | } | |
94 | } | |
95 | ||
96 | static void worker_count(void *_data, long k, int tid) | |
97 | { | |
98 | cnt_step_t *cs = (cnt_step_t*)_data; | |
99 | const bseq1_t *s = &cs->seqs[k]; | |
100 | int i, l; | |
101 | bfc_kmer_t x = bfc_kmer_null; | |
102 | uint64_t qmer = 0, mask = (1ULL<<cs->k) - 1; | |
103 | for (i = l = 0; i < s->l_seq; ++i) { | |
104 | int c = seq_nt6_table[(uint8_t)s->seq[i]] - 1; | |
105 | if (c < 4) { | |
106 | bfc_kmer_append(cs->k, x.x, c); | |
107 | qmer = (qmer<<1 | (s->qual == 0 || s->qual[i] - 33 >= cs->q)) & mask; | |
108 | if (++l >= cs->k) bfc_kmer_insert(cs, &x, (qmer == mask), tid); | |
109 | } else l = 0, qmer = 0, x = bfc_kmer_null; | |
110 | } | |
111 | } | |
112 | ||
113 | struct bfc_ch_s *fml_count(int n, const bseq1_t *seq, int k, int q, int l_pre, int n_threads) | |
114 | { | |
115 | int i; | |
116 | cnt_step_t cs; | |
117 | cs.n_seqs = n, cs.seqs = seq, cs.k = k, cs.q = q; | |
118 | cs.ch = bfc_ch_init(cs.k, l_pre); | |
119 | cs.n_buf = calloc(n_threads, sizeof(int)); | |
120 | cs.buf = calloc(n_threads, sizeof(void*)); | |
121 | for (i = 0; i < n_threads; ++i) | |
122 | cs.buf[i] = malloc(CNT_BUF_SIZE * sizeof(insbuf_t)); | |
123 | kt_for(n_threads, worker_count, &cs, cs.n_seqs); | |
124 | for (i = 0; i < n_threads; ++i) free(cs.buf[i]); | |
125 | free(cs.buf); free(cs.n_buf); | |
126 | return cs.ch; | |
127 | } | |
128 | ||
129 | /*************** | |
130 | *** Correct *** | |
131 | ***************/ | |
132 | ||
133 | #define BFC_MAX_KMER 63 | |
134 | #define BFC_MAX_BF_SHIFT 37 | |
135 | ||
136 | #define BFC_MAX_PATHS 4 | |
137 | #define BFC_EC_HIST 5 | |
138 | #define BFC_EC_HIST_HIGH 2 | |
139 | ||
140 | #define BFC_EC_MIN_COV_COEF .1 | |
141 | ||
142 | /************************** | |
143 | * Sequence struct for ec * | |
144 | **************************/ | |
145 | ||
146 | #include "kvec.h" | |
147 | ||
/* Per-base working record used during error correction. */
typedef struct { // NOTE: unaligned memory
	uint8_t b:3;          // current base code; values above 3 are treated as ambiguous
	uint8_t q:1;          // current quality flag (1 = passes the threshold)
	uint8_t ob:3;         // base code as originally read
	uint8_t oq:1;         // quality flag as originally read
	uint8_t dummy;
	uint16_t lcov:6;      // number of solid k-mers covering this base
	uint16_t hcov:6;      // as lcov, counting only k-mers flagged high_end
	uint16_t solid_end:1; // a solid k-mer ends at this base
	uint16_t high_end:1;  // the k-mer ending here passes the high-quality count test
	uint16_t ec:1;        // set from the penalty of the chosen correction path
	uint16_t absent:1;    // set from the penalty of the chosen correction path
	int i;                // index of this base in the read
} ecbase_t;
154 | ||
// Growable array of ecbase_t (kvec vector; accessed through .n and .a in this file).
typedef kvec_t(ecbase_t) ecseq_t;
156 | ||
157 | static int bfc_seq_conv(const char *s, const char *q, int qthres, ecseq_t *seq) | |
158 | { | |
159 | int i, l; | |
160 | l = strlen(s); | |
161 | kv_resize(ecbase_t, *seq, l); | |
162 | seq->n = l; | |
163 | for (i = 0; i < l; ++i) { | |
164 | ecbase_t *c = &seq->a[i]; | |
165 | c->b = c->ob = seq_nt6_table[(int)s[i]] - 1; | |
166 | c->q = c->oq = !q? 1 : q[i] - 33 >= qthres? 1 : 0; | |
167 | if (c->b > 3) c->q = c->oq = 0; | |
168 | c->i = i; | |
169 | } | |
170 | return l; | |
171 | } | |
172 | ||
173 | static inline ecbase_t ecbase_comp(const ecbase_t *b) | |
174 | { | |
175 | ecbase_t r = *b; | |
176 | r.b = b->b < 4? 3 - b->b : 4; | |
177 | r.ob = b->ob < 4? 3 - b->ob : 4; | |
178 | return r; | |
179 | } | |
180 | ||
181 | static void bfc_seq_revcomp(ecseq_t *seq) | |
182 | { | |
183 | int i; | |
184 | for (i = 0; i < seq->n>>1; ++i) { | |
185 | ecbase_t tmp; | |
186 | tmp = ecbase_comp(&seq->a[i]); | |
187 | seq->a[i] = ecbase_comp(&seq->a[seq->n - 1 - i]); | |
188 | seq->a[seq->n - 1 - i] = tmp; | |
189 | } | |
190 | if (seq->n&1) seq->a[i] = ecbase_comp(&seq->a[i]); | |
191 | } | |
192 | ||
193 | /*************************** | |
194 | * Independent ec routines * | |
195 | ***************************/ | |
196 | ||
197 | int bfc_ec_greedy_k(int k, int mode, const bfc_kmer_t *x, const bfc_ch_t *ch) | |
198 | { | |
199 | int i, j, max = 0, max_ec = -1, max2 = 0; | |
200 | for (i = 0; i < k; ++i) { | |
201 | int c = (x->x[1]>>i&1)<<1 | (x->x[0]>>i&1); | |
202 | for (j = 0; j < 4; ++j) { | |
203 | bfc_kmer_t y = *x; | |
204 | int ret; | |
205 | if (j == c) continue; | |
206 | bfc_kmer_change(k, y.x, i, j); | |
207 | ret = bfc_ch_kmer_occ(ch, &y); | |
208 | if (ret < 0) continue; | |
209 | if ((max&0xff) < (ret&0xff)) max2 = max, max = ret, max_ec = i<<2 | j; | |
210 | else if ((max2&0xff) < (ret&0xff)) max2 = ret; | |
211 | } | |
212 | } | |
213 | return (max&0xff) * 3 > mode && (max2&0xff) < 3? max_ec : -1; | |
214 | } | |
215 | ||
216 | int bfc_ec_first_kmer(int k, const ecseq_t *s, int start, bfc_kmer_t *x) | |
217 | { | |
218 | int i, l; | |
219 | *x = bfc_kmer_null; | |
220 | for (i = start, l = 0; i < s->n; ++i) { | |
221 | ecbase_t *c = &s->a[i]; | |
222 | if (c->b < 4) { | |
223 | bfc_kmer_append(k, x->x, c->b); | |
224 | if (++l == k) break; | |
225 | } else l = 0, *x = bfc_kmer_null; | |
226 | } | |
227 | return i; | |
228 | } | |
229 | ||
230 | void bfc_ec_kcov(int k, int min_occ, ecseq_t *s, const bfc_ch_t *ch) | |
231 | { | |
232 | int i, l, r, j; | |
233 | bfc_kmer_t x = bfc_kmer_null; | |
234 | for (i = l = 0; i < s->n; ++i) { | |
235 | ecbase_t *c = &s->a[i]; | |
236 | c->high_end = c->solid_end = c->lcov = c->hcov = 0; | |
237 | if (c->b < 4) { | |
238 | bfc_kmer_append(k, x.x, c->b); | |
239 | if (++l >= k) { | |
240 | if ((r = bfc_ch_kmer_occ(ch, &x)) >= 0) { | |
241 | if ((r>>8&0x3f) >= min_occ+1) c->high_end = 1; | |
242 | if ((r&0xff) >= min_occ) { | |
243 | c->solid_end = 1; | |
244 | for (j = i - k + 1; j <= i; ++j) | |
245 | ++s->a[j].lcov, s->a[j].hcov += c->high_end; | |
246 | } | |
247 | } | |
248 | } | |
249 | } else l = 0, x = bfc_kmer_null; | |
250 | } | |
251 | } | |
252 | ||
253 | uint64_t bfc_ec_best_island(int k, const ecseq_t *s) | |
254 | { // IMPORTANT: call bfc_ec_kcov() before calling this function! | |
255 | int i, l, max, max_i; | |
256 | for (i = k - 1, max = l = 0, max_i = -1; i < s->n; ++i) { | |
257 | if (!s->a[i].solid_end) { | |
258 | if (l > max) max = l, max_i = i; | |
259 | l = 0; | |
260 | } else ++l; | |
261 | } | |
262 | if (l > max) max = l, max_i = i; | |
263 | return max > 0? (uint64_t)(max_i - max - k + 1) << 32 | max_i : 0; | |
264 | } | |
265 | ||
266 | /******************** | |
267 | * Correct one read * | |
268 | ********************/ | |
269 | ||
270 | #include "ksort.h" | |
271 | ||
// Values of ecstat_t.ec_code as set by bfc_ec1(); 0 means the read was processed.
#define ECCODE_MISC 1      // initial value; also used for any other negative result of bfc_ec1dir()
#define ECCODE_MANY_N 2    // more than 5% of the bases are ambiguous
#define ECCODE_NO_SOLID 3  // no solid k-mer and the greedy single-base fix found nothing
#define ECCODE_UNCORR_N 4  // bfc_ec1dir() returned -2 (heap ran empty)
#define ECCODE_MANY_FAIL 5 // bfc_ec1dir() returned -3 (too many failed extensions)
277 | ||
/* Summary returned by bfc_ec1() for one read. */
typedef struct {
	uint32_t ec_code:3;    // 0 on success, otherwise one of ECCODE_*
	uint32_t brute:1;      // 1 if the greedy single-base fix was needed to get a starting k-mer
	uint32_t n_ec:14;      // number of bases that differ from the original read
	uint32_t n_ec_high:14; // subset of n_ec whose quality flag is set
	uint32_t n_absent:24;  // sum of the absent counts of both search directions
	uint32_t max_heap:8;   // larger of the two per-direction heap-size maxima
} ecstat_t;
282 | ||
/* Penalty flags attached to one extension step, plus the base chosen for it. */
typedef struct {
	uint8_t ec:1;          // the step replaces an unambiguous read base
	uint8_t ec_high:1;     // ... and that base had its original quality flag set
	uint8_t absent:1;      // k-mer count below min_cov
	uint8_t absent_high:1; // high-quality k-mer count below min_cov
	uint8_t b:4;           // base code used for this step
} bfc_penalty_t;
286 | ||
287 | typedef struct { | |
288 | int tot_pen; | |
289 | int i; // base position | |
290 | int k; // position in the stack | |
291 | int32_t ecpos_high[BFC_EC_HIST_HIGH]; | |
292 | int32_t ecpos[BFC_EC_HIST]; | |
293 | bfc_kmer_t x; | |
294 | } echeap1_t; | |
295 | ||
296 | typedef struct { | |
297 | int parent, i, tot_pen; | |
298 | uint8_t b; | |
299 | bfc_penalty_t pen; | |
300 | uint16_t cnt; | |
301 | } ecstack1_t; | |
302 | ||
303 | typedef struct { | |
304 | const bfc_opt_t *opt; | |
305 | const bfc_ch_t *ch; | |
306 | kvec_t(echeap1_t) heap; | |
307 | kvec_t(ecstack1_t) stack; | |
308 | ecseq_t seq, tmp, ec[2]; | |
309 | int mode; | |
310 | ecstat_t ori_st; | |
311 | } bfc_ec1buf_t; | |
312 | ||
// Heap ordering: an element with the larger tot_pen compares as "less".
// NOTE(review): presumably this puts the smallest tot_pen at the heap root
// (bfc_ec1dir() pops heap.a[0] first) -- confirm against ksort.h.
#define heap_lt(a, b) ((a).tot_pen > (b).tot_pen)
KSORT_INIT(ec, echeap1_t, heap_lt)
315 | ||
316 | static bfc_ec1buf_t *ec1buf_init(const bfc_opt_t *opt, const bfc_ch_t *ch) | |
317 | { | |
318 | bfc_ec1buf_t *e; | |
319 | e = calloc(1, sizeof(bfc_ec1buf_t)); | |
320 | e->opt = opt, e->ch = ch; | |
321 | return e; | |
322 | } | |
323 | ||
324 | static void ec1buf_destroy(bfc_ec1buf_t *e) | |
325 | { | |
326 | free(e->heap.a); free(e->stack.a); free(e->seq.a); free(e->tmp.a); free(e->ec[0].a); free(e->ec[1].a); | |
327 | free(e); | |
328 | } | |
329 | ||
// Total penalty of one step: each flag of penalty `p` multiplied by the
// matching weight in options `o` (a pointer), summed.
#define weighted_penalty(o, p) ((o)->w_ec * (p).ec + (o)->w_ec_high * (p).ec_high + (o)->w_absent * (p).absent + (o)->w_absent_high * (p).absent_high)
331 | ||
332 | static void buf_update(bfc_ec1buf_t *e, const echeap1_t *prev, bfc_penalty_t pen, int cnt) | |
333 | { | |
334 | ecstack1_t *q; | |
335 | echeap1_t *r; | |
336 | const bfc_opt_t *o = e->opt; | |
337 | int b = pen.b; | |
338 | // update stack | |
339 | kv_pushp(ecstack1_t, e->stack, &q); | |
340 | q->parent = prev->k; | |
341 | q->i = prev->i; | |
342 | q->b = b; | |
343 | q->pen = pen; | |
344 | q->cnt = cnt > 0? cnt&0xff : 0; | |
345 | q->tot_pen = prev->tot_pen + weighted_penalty(o, pen); | |
346 | // update heap | |
347 | kv_pushp(echeap1_t, e->heap, &r); | |
348 | r->i = prev->i + 1; | |
349 | r->k = e->stack.n - 1; | |
350 | r->x = prev->x; | |
351 | if (pen.ec_high) { | |
352 | memcpy(r->ecpos_high + 1, prev->ecpos_high, (BFC_EC_HIST_HIGH - 1) * 4); | |
353 | r->ecpos_high[0] = prev->i; | |
354 | } else memcpy(r->ecpos_high, prev->ecpos_high, BFC_EC_HIST_HIGH * 4); | |
355 | if (pen.ec) { | |
356 | memcpy(r->ecpos + 1, prev->ecpos, (BFC_EC_HIST - 1) * 4); | |
357 | r->ecpos[0] = prev->i; | |
358 | } else memcpy(r->ecpos, prev->ecpos, BFC_EC_HIST * 4); | |
359 | r->tot_pen = q->tot_pen; | |
360 | bfc_kmer_append(e->opt->k, r->x.x, b); | |
361 | ks_heapup_ec(e->heap.n, e->heap.a); | |
362 | } | |
363 | ||
364 | static int buf_backtrack(ecstack1_t *s, int end, const ecseq_t *seq, ecseq_t *path) | |
365 | { | |
366 | int i, n_absent = 0; | |
367 | kv_resize(ecbase_t, *path, seq->n); | |
368 | path->n = seq->n; | |
369 | while (end >= 0) { | |
370 | if ((i = s[end].i) < seq->n) { | |
371 | path->a[i].b = s[end].b; | |
372 | path->a[i].ec = s[end].pen.ec; | |
373 | path->a[i].absent = s[end].pen.absent; | |
374 | n_absent += s[end].pen.absent; | |
375 | } | |
376 | end = s[end].parent; | |
377 | } | |
378 | return n_absent; | |
379 | } | |
380 | ||
/*
 * Correct `seq` in one direction, starting from the first k-mer found in
 * [start, end) and extending one base at a time.
 *
 * Candidate paths are kept on e->heap ordered by accumulated penalty; every
 * step is also recorded on e->stack so that a finished path can be traced
 * back. At each position the read's own base and, unless the base is judged
 * reliable ("fixed"), the alternative bases whose extended k-mer is solid
 * are tried. Up to BFC_MAX_PATHS finished paths are collected and the one
 * with the lowest penalty is written to `ec`.
 *
 * e         per-thread buffer (options, count table, heap and stack)
 * seq       input read; must satisfy end <= seq->n and end - start >= k
 * ec        output; resized to seq->n. Positions before start + k and from
 *           `end` on are set to base code 4 (masked) on success
 * max_heap  output; tracks the largest heap size observed
 *
 * Returns the number of "absent" steps on the chosen path, or, when no path
 * was completed: -2 if the heap ran empty, -3 if too many extensions failed,
 * -1 otherwise.
 */
static int bfc_ec1dir(bfc_ec1buf_t *e, const ecseq_t *seq, ecseq_t *ec, int start, int end, int *max_heap)
{
	echeap1_t z;
	int i, l, rv = -1, path[BFC_MAX_PATHS], n_paths = 0, min_path = -1, min_path_pen = INT_MAX, n_failures = 0;
	assert(end <= seq->n && end - start >= e->opt->k);
	e->heap.n = e->stack.n = 0;
	*max_heap = 0;
	memset(&z, 0, sizeof(echeap1_t));
	kv_resize(ecbase_t, *ec, seq->n);
	ec->n = seq->n;
	// Seed state: scan for k consecutive unambiguous bases. The loop breaks
	// BEFORE appending the k-th base, so z.x holds the first k-1 bases and
	// z.i is the position of the k-th one, which the search decides first.
	for (z.i = start, l = 0; z.i < end; ++z.i) {
		int c = seq->a[z.i].b;
		if (c < 4) {
			if (++l == e->opt->k) break;
			bfc_kmer_append(e->opt->k, z.x.x, c);
		} else l = 0, z.x = bfc_kmer_null;
	}
	assert(z.i < end); // before calling this function, there must be at least one solid k-mer
	z.k = -1; // the seed has no step on the stack
	for (i = 0; i < BFC_EC_HIST; ++i) z.ecpos[i] = -1;
	for (i = 0; i < BFC_EC_HIST_HIGH; ++i) z.ecpos_high[i] = -1;
	kv_push(echeap1_t, e->heap, z);
	// start from a copy of the read's bases; backtracking overwrites the chosen path
	for (i = 0; i < seq->n; ++i) ec->a[i].b = seq->a[i].b, ec->a[i].ob = seq->a[i].ob;
	// exhaustive error correction
	while (1) {
		int stop = 0;
		*max_heap = *max_heap > 255? 255 : *max_heap > e->heap.n? *max_heap : e->heap.n;
		if (e->heap.n == 0) { // may happen when there is an uncorrectable "N"
			rv = -2;
			break;
		}
		// pop the root of the heap
		z = e->heap.a[0];
		e->heap.a[0] = kv_pop(e->heap);
		ks_heapdown_ec(0, e->heap.n, e->heap.a);
		// give up once the popped state is much worse than the best finished path
		if (min_path >= 0 && z.tot_pen > min_path_pen + e->opt->max_path_diff) break;
		if (z.i - end > e->opt->max_end_ext) stop = 1; // extended far enough past the end
		if (!stop) {
			ecbase_t *c = z.i < seq->n? &seq->a[z.i] : 0; // read base at z.i; 0 when past the read
			int b, os = -1, fixed = 0, other_ext = 0, n_added = 0, added_cnt[4];
			bfc_penalty_t added[4];
			// test if the read extension alone is enough
			if (z.i > end) fixed = 1;
			if (c && c->b < 4) { // A, C, G or T
				bfc_kmer_t x = z.x;
				bfc_kmer_append(e->opt->k, x.x, c->b);
				os = bfc_ch_kmer_occ(e->ch, &x); // count of the k-mer that follows the read
				if (c->q && (os&0xff) >= e->opt->min_cov + 1 && c->lcov >= e->opt->min_cov + 1) fixed = 1;
				else if (c->hcov > e->opt->k * .75) fixed = 1;
			}
			// extension
			for (b = 0; b < 4; ++b) {
				bfc_penalty_t pen;
				if (fixed && c && b != c->b) continue; // reliable read base: no alternatives
				if (c == 0 || b != c->b) { // alternative base, or any base past the read
					int s;
					bfc_kmer_t x = z.x;
					pen.ec = 0, pen.ec_high = 0, pen.absent = 0, pen.absent_high = 0, pen.b = b;
					if (c) { // not over the end
						if (c->q && z.ecpos_high[BFC_EC_HIST_HIGH-1] >= 0 && z.i - z.ecpos_high[BFC_EC_HIST_HIGH-1] < e->opt->win_multi_ec) continue; // no close highQ corrections
						if (z.ecpos[BFC_EC_HIST-1] >= 0 && z.i - z.ecpos[BFC_EC_HIST-1] < e->opt->win_multi_ec) continue; // no clustered corrections
					}
					bfc_kmer_append(e->opt->k, x.x, b);
					s = bfc_ch_kmer_occ(e->ch, &x);
					if (s < 0 || (s&0xff) < e->opt->min_cov) continue; // not solid
					//if (os >= 0 && (s&0xff) - (os&0xff) < 2) continue; // not sufficiently better than the read path
					pen.ec = c && c->b < 4? 1 : 0;
					pen.ec_high = pen.ec? c->oq : 0;
					pen.absent = 0;
					pen.absent_high = ((s>>8&0xff) < e->opt->min_cov);
					pen.b = b;
					added_cnt[n_added] = s;
					added[n_added++] = pen;
					++other_ext;
				} else { // follow the read's own base
					pen.ec = pen.ec_high = 0;
					pen.absent = (os < 0 || (os&0xff) < e->opt->min_cov);
					pen.absent_high = (os < 0 || (os>>8&0xff) < e->opt->min_cov);
					pen.b = b;
					added_cnt[n_added] = os;
					added[n_added++] = pen;
				}
			} // ~for(b)
			if (fixed == 0 && other_ext == 0) ++n_failures; // doubtful base but no solid alternative
			if (n_failures > seq->n * 2) {
				rv = -3;
				break;
			}
			if (c || n_added == 1) {
				if (n_added > 1 && e->heap.n > e->opt->max_heap) { // to prevent heap explosion
					// keep only the candidate with the smallest step penalty
					int min_b = -1, min = INT_MAX;
					for (b = 0; b < n_added; ++b) {
						int t = weighted_penalty(e->opt, added[b]);
						if (min > t) min = t, min_b = b;
					}
					buf_update(e, &z, added[min_b], added_cnt[min_b]);
				} else {
					for (b = 0; b < n_added; ++b)
						buf_update(e, &z, added[b], added_cnt[b]);
				}
			} else { // past the read with zero or several continuations: the path ends here
				if (n_added == 0)
					e->stack.a[z.k].tot_pen += e->opt->w_absent * (e->opt->max_end_ext - (z.i - end));
				stop = 1;
			}
		} // ~if(!stop)
		if (stop) { // record a finished path and remember the cheapest one
			if (e->stack.a[z.k].tot_pen < min_path_pen)
				min_path_pen = e->stack.a[z.k].tot_pen, min_path = n_paths;
			path[n_paths++] = z.k;
			if (n_paths == BFC_MAX_PATHS) break;
		}
	} // ~while(1)
	// backtrack
	if (n_paths == 0) return rv;
	assert(min_path >= 0 && min_path < n_paths && e->stack.a[path[min_path]].tot_pen == min_path_pen);
	rv = buf_backtrack(e->stack.a, path[min_path], seq, ec);
	for (i = 0; i < ec->n; ++i) // mask out uncorrected regions
		if (i < start + e->opt->k || i >= end) ec->a[i].b = 4;
	return rv;
}
501 | ||
/*
 * Error-correct one read in place.
 *
 * e     per-thread buffer holding the options, the count table and scratch vectors
 * seq   NUL-terminated read; rewritten on success, with corrected bases in
 *       lower case and unchanged bases in upper case
 * qual  quality string or NULL; rewritten on success (see the end of the function)
 *
 * Steps: convert the read, reject it if more than 5% of its bases are
 * ambiguous, compute k-mer coverage and locate the longest run of solid
 * k-mers. If there is none, try a greedy single-base fix to obtain a
 * starting k-mer. Then correct from that region towards the end of the
 * read, reverse-complement, correct towards the other end, and merge both
 * results.
 *
 * Returns a status record; ec_code is 0 on success and an ECCODE_* value
 * when the read is left untouched.
 */
ecstat_t bfc_ec1(bfc_ec1buf_t *e, char *seq, char *qual)
{
	int i, start = 0, end = 0, n_n = 0, rv[2], max_heap[2];
	uint64_t r;
	ecstat_t s;

	s.ec_code = ECCODE_MISC, s.brute = 0, s.n_ec = s.n_ec_high = 0, s.n_absent = s.max_heap = 0;
	bfc_seq_conv(seq, qual, e->opt->q, &e->seq);
	for (i = 0; i < e->seq.n; ++i)
		if (e->seq.a[i].ob > 3) ++n_n;
	if (n_n > e->seq.n * .05) {
		s.ec_code = ECCODE_MANY_N;
		return s;
	}
	bfc_ec_kcov(e->opt->k, e->opt->min_cov, &e->seq, e->ch);
	r = bfc_ec_best_island(e->opt->k, &e->seq);
	if (r == 0) { // no solid k-mer
		bfc_kmer_t x;
		int ec = -1;
		// slide along the read, half a k-mer at a time, until a k-mer can be fixed greedily
		while ((end = bfc_ec_first_kmer(e->opt->k, &e->seq, start, &x)) < e->seq.n) {
			ec = bfc_ec_greedy_k(e->opt->k, e->mode, &x, e->ch);
			if (ec >= 0) break;
			if (end + (e->opt->k>>1) >= e->seq.n) break;
			start = end - (e->opt->k>>1);
		}
		if (ec >= 0) {
			// ec encodes (offset << 2 | base); the offset counts back from the k-mer's last base
			e->seq.a[end - (ec>>2)].b = ec&3;
			++end; start = end - e->opt->k;
			s.brute = 1;
		} else {
			s.ec_code = ECCODE_NO_SOLID;
			return s;
		}
	} else start = r>>32, end = (uint32_t)r; // unpack the island: start in the high half, end in the low half
	// first direction: from the island to the end of the read
	if ((rv[0] = bfc_ec1dir(e, &e->seq, &e->ec[0], start, e->seq.n, &max_heap[0])) < 0) {
		s.ec_code = rv[0] == -2? ECCODE_UNCORR_N : rv[0] == -3? ECCODE_MANY_FAIL : ECCODE_MISC;
		return s;
	}
	// second direction: same procedure on the reverse complement
	bfc_seq_revcomp(&e->seq);
	if ((rv[1] = bfc_ec1dir(e, &e->seq, &e->ec[1], e->seq.n - end, e->seq.n, &max_heap[1])) < 0) {
		s.ec_code = rv[1] == -2? ECCODE_UNCORR_N : rv[1] == -3? ECCODE_MANY_FAIL : ECCODE_MISC;
		return s;
	}
	s.max_heap = max_heap[0] > max_heap[1]? max_heap[0] : max_heap[1];
	s.ec_code = 0, s.n_absent = rv[0] + rv[1];
	// bring the second result and the read back to the original orientation
	bfc_seq_revcomp(&e->ec[1]);
	bfc_seq_revcomp(&e->seq);
	// merge: base code > 3 means "masked" in a direction's result; if the two
	// directions disagree on an unmasked base, fall back to the original base
	for (i = 0; i < e->seq.n; ++i) {
		ecbase_t *c = &e->seq.a[i];
		if (e->ec[0].a[i].b == e->ec[1].a[i].b)
			c->b = e->ec[0].a[i].b > 3? e->seq.a[i].b : e->ec[0].a[i].b;
		else if (e->ec[1].a[i].b > 3) c->b = e->ec[0].a[i].b;
		else if (e->ec[0].a[i].b > 3) c->b = e->ec[1].a[i].b;
		else c->b = e->seq.a[i].ob;
	}
	// write the result back and count the corrections
	for (i = 0; i < e->seq.n; ++i) {
		int is_diff = !(e->seq.a[i].b == e->seq.a[i].ob);
		if (is_diff) {
			++s.n_ec;
			if (e->seq.a[i].q) ++s.n_ec_high;
		}
		seq[i] = (is_diff? "acgtn" : "ACGTN")[e->seq.a[i].b];
		// changed base: quality char is 34 plus the original base code;
		// unchanged base: '+' for quality flag 0, '?' for quality flag 1
		if (qual) qual[i] = is_diff? 34 + e->seq.a[i].ob : "+?"[e->seq.a[i].q];
	}
	return s;
}
568 | ||
569 | /******************** | |
570 | * Error correction * | |
571 | ********************/ | |
572 | ||
573 | typedef struct { | |
574 | const bfc_opt_t *opt; | |
575 | const bfc_ch_t *ch; | |
576 | bfc_ec1buf_t **e; | |
577 | int64_t n_processed; | |
578 | int n_seqs, flt_uniq; | |
579 | bseq1_t *seqs; | |
580 | } ec_step_t; | |
581 | ||
582 | static uint64_t max_streak(int k, const bfc_ch_t *ch, const bseq1_t *s) | |
583 | { | |
584 | int i, l; | |
585 | uint64_t max = 0, t = 0; | |
586 | bfc_kmer_t x = bfc_kmer_null; | |
587 | for (i = l = 0; i < s->l_seq; ++i) { | |
588 | int c = seq_nt6_table[(uint8_t)s->seq[i]] - 1; | |
589 | if (c < 4) { // not an ambiguous base | |
590 | bfc_kmer_append(k, x.x, c); | |
591 | if (++l >= k) { // ok, we have a k-mer now | |
592 | if (bfc_ch_kmer_occ(ch, &x) > 0) t += 1ULL<<32; | |
593 | else t = i + 1; | |
594 | } else t = i + 1; | |
595 | } else l = 0, x = bfc_kmer_null, t = i + 1; | |
596 | max = max > t? max : t; | |
597 | } | |
598 | return max; | |
599 | } | |
600 | ||
601 | static void worker_ec(void *_data, long k, int tid) | |
602 | { | |
603 | ec_step_t *es = (ec_step_t*)_data; | |
604 | bseq1_t *s = &es->seqs[k]; | |
605 | if (es->flt_uniq) { | |
606 | uint64_t max; | |
607 | max = max_streak(es->opt->k, es->ch, s); | |
608 | if (max>>32 && (double)((max>>32) + es->opt->k - 1) / s->l_seq > es->opt->min_trim_frac) { | |
609 | int start = (uint32_t)max, end = start + (max>>32); | |
610 | start -= es->opt->k - 1; | |
611 | assert(start >= 0 && end <= s->l_seq); | |
612 | memmove(s->seq, s->seq + start, end - start); | |
613 | s->l_seq = end - start; | |
614 | s->seq[s->l_seq] = 0; | |
615 | if (s->qual) { | |
616 | memmove(s->qual, s->qual + start, s->l_seq); | |
617 | s->qual[s->l_seq] = 0; | |
618 | } | |
619 | } else { | |
620 | free(s->seq); free(s->qual); | |
621 | s->l_seq = 0, s->seq = s->qual = 0; | |
622 | } | |
623 | } else bfc_ec1(es->e[tid], s->seq, s->qual); | |
624 | } | |
625 | ||
626 | float fml_correct_core(const fml_opt_t *opt, int flt_uniq, int n, bseq1_t *seq) | |
627 | { | |
628 | bfc_ch_t *ch; | |
629 | int i, mode; | |
630 | uint64_t hist[256], hist_high[64], tot_len = 0, sum_k = 0, tot_k = 0; | |
631 | ec_step_t es; | |
632 | bfc_opt_t bfc_opt; | |
633 | float kcov; | |
634 | ||
635 | // initialize BFC options | |
636 | bfc_opt_init(&bfc_opt); | |
637 | bfc_opt.n_threads = opt->n_threads; // copy from FML options | |
638 | bfc_opt.k = flt_uniq? opt->min_asm_ovlp : opt->ec_k; | |
639 | for (i = 0; i < n; ++i) tot_len += seq[i].l_seq; // compute total length | |
640 | bfc_opt.l_pre = tot_len - 8 < 20? tot_len - 8 : 20; | |
641 | ||
642 | memset(&es, 0, sizeof(ec_step_t)); | |
643 | es.opt = &bfc_opt, es.n_seqs = n, es.seqs = seq, es.flt_uniq = flt_uniq; | |
644 | ||
645 | es.ch = ch = fml_count(n, seq, bfc_opt.k, bfc_opt.q, bfc_opt.l_pre, bfc_opt.n_threads); | |
646 | mode = bfc_ch_hist(ch, hist, hist_high); | |
647 | for (i = opt->min_cnt; i < 256; ++i) | |
648 | sum_k += hist[i], tot_k += i * hist[i]; | |
649 | kcov = (float)tot_k / sum_k; | |
650 | bfc_opt.min_cov = (int)(BFC_EC_MIN_COV_COEF * kcov + .499); | |
651 | bfc_opt.min_cov = bfc_opt.min_cov < opt->max_cnt? bfc_opt.min_cov : opt->max_cnt; | |
652 | bfc_opt.min_cov = bfc_opt.min_cov > opt->min_cnt? bfc_opt.min_cov : opt->min_cnt; | |
653 | ||
654 | es.e = calloc(es.opt->n_threads, sizeof(void*)); | |
655 | for (i = 0; i < es.opt->n_threads; ++i) | |
656 | es.e[i] = ec1buf_init(es.opt, ch), es.e[i]->mode = mode; | |
657 | kt_for(es.opt->n_threads, worker_ec, &es, es.n_seqs); | |
658 | for (i = 0; i < es.opt->n_threads; ++i) | |
659 | ec1buf_destroy(es.e[i]); | |
660 | free(es.e); | |
661 | bfc_ch_destroy(ch); | |
662 | return kcov; | |
663 | } | |
664 | ||
/*
 * Error-correct `n` reads in place (fml_correct_core() with flt_uniq = 0).
 * Returns the value computed by fml_correct_core().
 */
float fml_correct(const fml_opt_t *opt, int n, bseq1_t *seq)
{
	return fml_correct_core(opt, 0, n, seq);
}
669 | ||
/*
 * Trim or drop `n` reads in place by k-mer presence (fml_correct_core() with
 * flt_uniq = 1). Returns the value computed by fml_correct_core().
 */
float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq)
{
	return fml_correct_core(opt, 1, n, seq);
}
0 | #include <zlib.h> | |
1 | #include <stdio.h> | |
2 | #include <stdlib.h> | |
3 | #include <string.h> | |
4 | #include "fml.h" | |
5 | #include "kseq.h" | |
6 | KSEQ_INIT(gzFile, gzread) | |
7 | ||
8 | bseq1_t *bseq_read(const char *fn, int *n_) | |
9 | { | |
10 | gzFile fp; | |
11 | bseq1_t *seqs; | |
12 | kseq_t *ks; | |
13 | int m, n; | |
14 | uint64_t size = 0; | |
15 | ||
16 | *n_ = 0; | |
17 | fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); | |
18 | if (fp == 0) return 0; | |
19 | ks = kseq_init(fp); | |
20 | ||
21 | m = n = 0; seqs = 0; | |
22 | while (kseq_read(ks) >= 0) { | |
23 | bseq1_t *s; | |
24 | if (n >= m) { | |
25 | m = m? m<<1 : 256; | |
26 | seqs = realloc(seqs, m * sizeof(bseq1_t)); | |
27 | } | |
28 | s = &seqs[n]; | |
29 | s->seq = strdup(ks->seq.s); | |
30 | s->qual = ks->qual.l? strdup(ks->qual.s) : 0; | |
31 | s->l_seq = ks->seq.l; | |
32 | size += seqs[n++].l_seq; | |
33 | } | |
34 | *n_ = n; | |
35 | ||
36 | kseq_destroy(ks); | |
37 | gzclose(fp); | |
38 | return seqs; | |
39 | } | |
40 | ||
/**
 * Reverse a byte array in place.
 *
 * @param l  number of bytes in s; values below 2 leave s untouched
 * @param s  buffer to reverse
 */
void seq_reverse(int l, unsigned char *s)
{
	int lo, hi;
	if (l < 2) return;
	lo = 0, hi = l - 1;
	while (lo < hi) { // swap the outermost pair and move inwards
		unsigned char held = s[lo];
		s[lo++] = s[hi];
		s[hi--] = held;
	}
}
49 | ||
/* Complement one code: values 1..4 map to 5 - value, anything else is returned unchanged. */
static inline unsigned char seq_comp6_base(unsigned char c)
{
	if (c >= 1 && c <= 4) return (unsigned char)(5 - c);
	return c;
}

/**
 * Reverse-complement a byte array in place.
 *
 * Codes 1..4 are complemented as 5 - code; every other value is only moved.
 *
 * @param l  number of bytes in s
 * @param s  buffer to reverse-complement
 */
void seq_revcomp6(int l, unsigned char *s)
{
	const int half = l >> 1;
	int i;
	for (i = 0; i < half; ++i) {
		unsigned char *head = s + i, *tail = s + (l - 1 - i);
		const unsigned char from_head = *head, from_tail = *tail;
		*head = seq_comp6_base(from_tail);
		*tail = seq_comp6_base(from_head);
	}
	if (l & 1) // odd length: the middle byte stays in place but is still complemented
		s[i] = seq_comp6_base(s[i]);
}
0 | #include <limits.h> | |
1 | #include <stdio.h> | |
2 | #include "mag.h" | |
3 | #include "kvec.h" | |
4 | #include "ksw.h" | |
5 | #include "internal.h" | |
6 | #include "khash.h" | |
7 | KHASH_DECLARE(64, uint64_t, uint64_t) | |
8 | ||
9 | typedef khash_t(64) hash64_t; | |
10 | ||
/* Similarity thresholds: a candidate is accepted when n_diff is below MAX_N_DIFF or r_diff is below MAX_R_DIFF. */
11 | #define MAX_N_DIFF 2.01 // for evaluating alignment after SW | |
12 | #define MAX_R_DIFF 0.1 | |
13 | #define L_DIFF_COEF 0.2 // n_diff=|l_0 - l_1|*L_DIFF_COEF | |
14 | ||
/* An edge is flagged as deleted by setting x to (uint64_t)-2 and y to 0; edge_is_del() treats either condition as "deleted". */
15 | #define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0) | |
16 | #define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0) | |
17 | ||
18 | static int fm_verbose = 1; | |
19 | ||
20 | /****************** | |
21 | * Closed bubbles * | |
22 | ******************/ | |
23 | ||
/*
 * Per-vertex scratch record used by mag_vh_simplify_bubble(). In the 2-D
 * arrays the first index selects the end of the vertex and the second selects
 * the best ([0]) or second-best ([1]) path recorded for that end.
 */
24 | typedef struct { | |
// index into g->v.a of the vertex this record is attached to
25 | uint64_t id; | |
// per end: number of incoming edges processed so far
26 | int cnt[2]; | |
// n: accumulated nsr (read support) of a path; d: accumulated distance of that path
27 | int n[2][2], d[2][2]; | |
// back-pointers followed by backtrace()
28 | uint64_t v[2][2]; | |
29 | } trinfo_t; | |
30 | ||
/* Template copied into every record handed out by tip_alloc(): id and back-pointers all ones, counters zero, n/d at INT_MIN. */
31 | const trinfo_t g_trinull = {-1, {0, 0}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{-1, -1}, {-1, -1}}}; | |
32 | ||
/* Pool of preallocated trinfo_t records that tip_alloc() hands out in order. */
33 | typedef struct { | |
// n: records currently handed out; m: records allocated in buf
34 | int n, m; | |
// array of m pointers, each to its own malloc'ed trinfo_t
35 | trinfo_t **buf; | |
36 | } tipool_t; | |
37 | ||
/* Scratch buffers shared by successive mag_vh_simplify_bubble() calls. */
38 | struct mogb_aux { | |
// record pool; its use count is reset at the start of every call
39 | tipool_t pool; | |
// work stack of vertex ends still to be expanded
40 | ku64_v stack; | |
// set of vertex indices collected by backtrace()
41 | hash64_t *h; | |
42 | }; | |
43 | ||
44 | mogb_aux_t *mag_b_initaux(void) | |
45 | { | |
46 | mogb_aux_t *aux = calloc(1, sizeof(mogb_aux_t)); | |
47 | aux->h = kh_init(64); | |
48 | return aux; | |
49 | } | |
50 | ||
51 | void mag_b_destroyaux(mogb_aux_t *b) | |
52 | { | |
53 | int i; | |
54 | for (i = 0; i < b->pool.m; ++i) | |
55 | free(b->pool.buf[i]); | |
56 | free(b->pool.buf); free(b->stack.a); | |
57 | kh_destroy(64, b->h); | |
58 | free(b); | |
59 | } | |
60 | ||
61 | #define tiptr(p) ((trinfo_t*)(p)->ptr) | |
62 | ||
63 | static inline trinfo_t *tip_alloc(tipool_t *pool, uint32_t id) | |
64 | { // allocate an object from the memory pool | |
65 | trinfo_t *p; | |
66 | if (pool->n == pool->m) { | |
67 | int i, new_m = pool->m? pool->m<<1 : 256; | |
68 | pool->buf = realloc(pool->buf, new_m * sizeof(void*)); | |
69 | for (i = pool->m; i < new_m; ++i) | |
70 | pool->buf[i] = malloc(sizeof(trinfo_t)); | |
71 | pool->m = new_m; | |
72 | } | |
73 | p = pool->buf[pool->n++]; | |
74 | *p = g_trinull; | |
75 | p->id = id; | |
76 | return p; | |
77 | } | |
78 | ||
79 | static void backtrace(mag_t *g, uint64_t end, uint64_t start, hash64_t *h) | |
80 | { | |
81 | while (end>>32 != start) { | |
82 | int ret; | |
83 | kh_put(64, h, end>>33, &ret); | |
84 | end = tiptr(&g->v.a[end>>33])->v[(end>>32^1)&1][end&1]; | |
85 | } | |
86 | } | |
87 | ||
/**
 * Look for a closed bubble opening at one end of a vertex and, when one is
 * found, delete the visited vertices that lie on neither of the two
 * best-supported paths through it.
 *
 * @param g        graph; vertices may be removed through mag_v_del()
 * @param idd      (vertex index)<<1 | end, selecting where the search starts
 * @param max_vtx  give up once more than this many records have been allocated
 * @param max_dist give up once a recorded distance exceeds this value
 * @param a        scratch buffers; pool, stack and hash are reset on entry
 */
88 | void mag_vh_simplify_bubble(mag_t *g, uint64_t idd, int max_vtx, int max_dist, mogb_aux_t *a) | |
89 | { | |
90 | int i, n_pending = 0; | |
91 | magv_t *p, *q; | |
92 | ||
93 | p = &g->v.a[idd>>1]; | |
94 | if (p->len < 0 || p->nei[idd&1].n < 2) return; // stop if p is deleted or it has 0 or 1 neighbor | |
95 | // reset aux data | |
96 | a->stack.n = a->pool.n = 0; | |
/* a table that has grown to 64 or more buckets is rebuilt from scratch, a smaller one is only cleared */
97 | if (kh_n_buckets(a->h) >= 64) { | |
98 | kh_destroy(64, a->h); | |
99 | a->h = kh_init(64); | |
100 | } else kh_clear(64, a->h); | |
101 | // add the initial vertex | |
102 | p->ptr = tip_alloc(&a->pool, idd>>1); | |
/* start from -len and -nsr so that adding p's own len/nsr below yields 0 for its direct neighbors */
103 | tiptr(p)->d[(idd&1)^1][0] = -p->len; | |
104 | tiptr(p)->n[(idd&1)^1][0] = -p->nsr; | |
105 | kv_push(uint64_t, a->stack, idd^1); | |
106 | // essentially a topological sorting | |
107 | while (a->stack.n) { | |
108 | uint64_t x, y; | |
109 | ku128_v *r; | |
110 | if (a->stack.n == 1 && a->stack.a[0] != (idd^1) && n_pending == 0) break; // found the other end of the bubble | |
111 | x = kv_pop(a->stack); | |
112 | p = &g->v.a[x>>1]; | |
113 | //printf("%lld:%lld\n", p->k[0], p->k[1]); | |
114 | r = &p->nei[(x&1)^1]; // we will look at the neighbors from the other end of the unitig | |
115 | if (a->pool.n > max_vtx || tiptr(p)->d[x&1][0] > max_dist || tiptr(p)->d[x&1][1] > max_dist || r->n == 0) break; // we failed | |
116 | // set the distance to p's neighbors | |
117 | for (i = 0; i < r->n; ++i) { | |
118 | int nsr, dist, which; | |
119 | if ((int64_t)r->a[i].x < 0) continue; | |
120 | y = mag_tid2idd(g->h, r->a[i].x); | |
121 | if (y == (idd^1)) { // there is a loop involving the initial vertex | |
122 | a->stack.n = 0; | |
123 | break; // not a bubble; stop; this will jump out of the while() loop | |
124 | } | |
125 | q = &g->v.a[y>>1]; | |
126 | if (q->ptr == 0) { // has not been attempted | |
127 | q->ptr = tip_alloc(&a->pool, y>>1), ++n_pending; | |
128 | mag_v128_clean(&q->nei[y&1]); // make sure there are no deleted edges | |
129 | } | |
130 | nsr = tiptr(p)->n[x&1][0] + p->nsr; which = 0; | |
131 | dist = tiptr(p)->d[x&1][0] + p->len - r->a[i].y; | |
132 | //printf("01 [%d]\t[%d,%d]\t[%d,%d]\n", i, tiptr(q)->n[y&1][0], tiptr(q)->n[y&1][1], tiptr(q)->d[y&1][0], tiptr(q)->d[y&1][1]); | |
133 | // test and possibly update the tentative distance | |
134 | if (nsr > tiptr(q)->n[y&1][0]) { // then move the best to the 2nd best and update the best | |
135 | tiptr(q)->n[y&1][1] = tiptr(q)->n[y&1][0]; tiptr(q)->n[y&1][0] = nsr; | |
/* back-pointer layout: predecessor end in the upper 32 bits, edge index i from bit 1, path selector in bit 0 */
136 | tiptr(q)->v[y&1][1] = tiptr(q)->v[y&1][0]; tiptr(q)->v[y&1][0] = (x^1)<<32|i<<1|which; | |
137 | tiptr(q)->d[y&1][1] = tiptr(q)->d[y&1][0]; tiptr(q)->d[y&1][0] = dist; | |
138 | nsr = tiptr(p)->n[x&1][1] + p->nsr; which = 1; // now nsr is the 2nd best | |
139 | dist = tiptr(p)->d[x&1][1] + p->len - r->a[i].y; | |
140 | } | |
141 | if (nsr > tiptr(q)->n[y&1][1]) // update the 2nd best | |
142 | tiptr(q)->n[y&1][1] = nsr, tiptr(q)->v[y&1][1] = (x^1)<<32|i<<1|which, tiptr(q)->d[y&1][1] = dist; | |
143 | if (++tiptr(q)->cnt[y&1] == q->nei[y&1].n) { // all q's predecessors have been processed; then push | |
144 | kv_push(uint64_t, a->stack, y); | |
145 | --n_pending; | |
146 | } | |
147 | } | |
148 | } | |
149 | if (n_pending == 0 && a->stack.n == 1) { // found a bubble | |
150 | uint64_t x = a->stack.a[0]; | |
151 | p = &g->v.a[x>>1]; | |
152 | //printf("(%d,%d)\t(%d,%d)\n", tiptr(p)->n[x&1][0], tiptr(p)->n[x&1][1], tiptr(p)->d[x&1][0], tiptr(p)->d[x&1][1]); | |
/* collect the vertices on the best and on the second-best path into a->h */
153 | backtrace(g, tiptr(p)->v[x&1][0], idd, a->h); | |
154 | backtrace(g, tiptr(p)->v[x&1][1], idd, a->h); | |
155 | } | |
156 | for (i = 0; i < a->pool.n; ++i) // reset p->ptr | |
157 | g->v.a[a->pool.buf[i]->id].ptr = 0; | |
158 | if (kh_size(a->h)) { // bubble detected; then remove vertices not in the top two paths | |
159 | for (i = 1; i < a->pool.n; ++i) { // i=0 corresponds to the initial vertex which we want to exclude | |
160 | uint64_t id = a->pool.buf[i]->id; | |
161 | if (id != a->stack.a[0]>>1 && kh_get(64, a->h, id) == kh_end(a->h)) // not in the top two paths | |
162 | mag_v_del(g, &g->v.a[id]); | |
163 | } | |
164 | } | |
165 | } | |
166 | ||
167 | void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist) | |
168 | { | |
169 | int64_t i; | |
170 | mogb_aux_t *a; | |
171 | a = mag_b_initaux(); | |
172 | for (i = 0; i < g->v.n; ++i) { | |
173 | mag_vh_simplify_bubble(g, i<<1|0, max_vtx, max_dist, a); | |
174 | mag_vh_simplify_bubble(g, i<<1|1, max_vtx, max_dist, a); | |
175 | } | |
176 | mag_b_destroyaux(a); | |
177 | mag_g_merge(g, 0, 0); | |
178 | } | |
179 | ||
/**
 * Examine a simple bubble: exactly two edges leave the chosen end of a vertex,
 * both lead to vertices with one edge at each end, and those two vertices
 * point to the same vertex on their far side. If the two branches are similar
 * enough, the one with the lower average coverage may be deleted.
 *
 * @param g          graph; a branch may be removed through mag_v_del()
 * @param idd        (vertex index)<<1 | end, selecting where the bubble opens
 * @param max_cov    unless aggressive, delete only if the weaker branch's average coverage is below this
 * @param max_frac   unless aggressive, delete only if the weaker branch's share of the summed coverage is below this
 * @param aggressive non-zero doubles the allowed n_diff and skips the coverage tests
 *
 * @return -1 if there is no simple bubble here, 1 if a bubble was examined
 *         but kept, 2 if one branch was deleted
 */
180 | int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int aggressive) | |
181 | { | |
182 | magv_t *p = &g->v.a[idd>>1], *q[2]; | |
183 | ku128_v *r; | |
184 | int i, j, k, dir[2], l[2], ret = -1; | |
185 | char *seq[2], *cov[2]; | |
186 | float n_diff, r_diff, avg[2], max_n_diff = aggressive? MAX_N_DIFF * 2. : MAX_N_DIFF; | |
187 | ||
188 | if (p->len < 0 || p->nei[idd&1].n != 2) return ret; // deleted or no bubble | |
189 | r = &p->nei[idd&1]; | |
190 | for (j = 0; j < 2; ++j) { | |
191 | uint64_t x; | |
192 | if ((int64_t)r->a[j].x < 0) return ret; | |
193 | x = mag_tid2idd(g->h, r->a[j].x); | |
194 | dir[j] = x&1; | |
195 | q[j] = &g->v.a[x>>1]; | |
196 | if (q[j]->nei[0].n != 1 || q[j]->nei[1].n != 1) return ret; // no bubble | |
/* l[j]: length of branch j left after subtracting the overlaps at both of its ends */
197 | l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); | |
198 | } | |
199 | if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble | |
200 | for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[] | |
201 | if (l[j] > 0) { | |
/* one allocation holds both copies: the sequence first, the coverage string right behind it */
202 | seq[j] = malloc(l[j]<<1); | |
203 | cov[j] = seq[j] + l[j]; | |
204 | for (i = 0; i < l[j]; ++i) { | |
205 | seq[j][i] = q[j]->seq[i + q[j]->nei[0].a->y]; | |
206 | cov[j][i] = q[j]->cov[i + q[j]->nei[0].a->y]; | |
207 | } | |
208 | if (dir[j]) { | |
209 | seq_revcomp6(l[j], (uint8_t*)seq[j]); | |
210 | seq_reverse(l[j], (uint8_t*)cov[j]); | |
211 | } | |
212 | for (i = 0, avg[j] = 0.; i < l[j]; ++i) { | |
213 | --seq[j][i]; // change DNA6 encoding to DNA4 for SW below | |
214 | avg[j] += cov[j][i] - 33; | |
215 | } | |
216 | avg[j] /= l[j]; | |
217 | } else { // l[j] <= 0; this may happen around a tandem repeat | |
218 | int beg, end; | |
219 | seq[j] = cov[j] = 0; | |
220 | beg = q[j]->nei[0].a->y; end = q[j]->len - q[j]->nei[1].a->y; | |
221 | if (beg > end) beg ^= end, end ^= beg, beg ^= end; // swap | |
222 | if (beg < end) { | |
223 | for (i = beg, avg[j] = 0.; i < end; ++i) | |
224 | avg[j] += q[j]->cov[i] - 33; | |
225 | avg[j] /= end - beg; | |
226 | } else avg[j] = q[j]->cov[beg] - 33; // FIXME: when q[j] is contained, weird thing may happen | |
227 | } | |
228 | } | |
/* from here on the bubble counts as examined */
229 | ret = 1; | |
230 | if (l[0] > 0 && l[1] > 0) { // then do SW to compute n_diff and r_diff | |
231 | int8_t mat[16]; | |
232 | kswr_t aln; | |
233 | for (i = k = 0; i < 4; ++i) | |
234 | for (j = 0; j < 4; ++j) | |
235 | mat[k++] = i == j? 5 : -4; | |
236 | aln = ksw_align(l[0], (uint8_t*)seq[0], l[1], (uint8_t*)seq[1], 4, mat, 5, 2, 0, 0); | |
237 | n_diff = ((l[0] < l[1]? l[0] : l[1]) * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatching score | |
238 | r_diff = n_diff / ((l[0] + l[1]) / 2.); | |
239 | //fprintf(stderr, "===> %f %f <===\n", n_diff, r_diff); for (j = 0; j < 2; ++j) { for (i = 0; i < l[j]; ++i) fputc("ACGTN"[(int)seq[j][i]], stderr); fputc('\n', stderr); } | |
240 | } else { | |
/* no sequence to align on at least one branch: judge by the length difference alone */
241 | n_diff = abs(l[0] - l[1]) * L_DIFF_COEF; | |
242 | r_diff = 1.; | |
243 | //fprintf(stderr, "---> (%d,%d) <---\n", l[0], l[1]); | |
244 | } | |
245 | if (n_diff < max_n_diff || r_diff < MAX_R_DIFF) { | |
/* j: the branch with the lower average coverage (branch 1 on a tie) */
246 | j = avg[0] < avg[1]? 0 : 1; | |
247 | if (aggressive || (avg[j] < max_cov && avg[j] / (avg[j^1] + avg[j]) < max_frac)) { | |
248 | mag_v_del(g, q[j]); | |
249 | ret = 2; | |
250 | } | |
251 | } | |
/* cov[] points into the seq[] allocations, so freeing seq[] releases both */
252 | free(seq[0]); free(seq[1]); | |
253 | return ret; | |
254 | } | |
255 | ||
256 | void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive) | |
257 | { | |
258 | int64_t i, n_examined = 0, n_popped = 0; | |
259 | int ret; | |
260 | ||
261 | for (i = 0; i < g->v.n; ++i) { | |
262 | ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, aggressive); | |
263 | if (ret >= 1) ++n_examined; | |
264 | if (ret >= 2) ++n_popped; | |
265 | ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, aggressive); | |
266 | if (ret >= 1) ++n_examined; | |
267 | if (ret >= 2) ++n_popped; | |
268 | } | |
269 | if (fm_verbose >= 3) | |
270 | fprintf(stderr, "[M::%s] examined %ld bubbles and popped %ld\n", __func__, (long)n_examined, (long)n_popped); | |
271 | mag_g_merge(g, 0, min_merge_len); | |
272 | } | |
273 | ||
274 | /**************** | |
275 | * Open bubbles * | |
276 | ****************/ | |
277 | ||
/**
 * Try to detach a short vertex that has exactly one edge in total.
 *
 * The part of p beyond its overlap with the neighbor q is aligned against the
 * other vertices attached to the same end of q. If one of them matches closely
 * enough, the edge between p and q is marked deleted on both sides; p itself
 * is deleted once it has no live edge left.
 *
 * @param g        graph, modified in place
 * @param p        candidate vertex
 * @param min_elen vertices of this length or longer are left alone
 */
278 | void mag_v_pop_open(mag_t *g, magv_t *p, int min_elen) | |
279 | { | |
280 | int i, j, k, l, dir, max_l, l_qry; | |
281 | magv_t *q, *t; | |
282 | ku128_v *r, *s; | |
283 | uint8_t *seq; | |
284 | int8_t mat[16]; | |
285 | ||
286 | if (p->len < 0 || p->len >= min_elen) return; | |
287 | //if (p->nei[0].n && p->nei[1].n) return; // FIXME: between this and the next line, which is better? | |
288 | if (p->nei[0].n + p->nei[1].n != 1) return; | |
289 | dir = p->nei[0].n? 0 : 1; | |
290 | // initialize the scoring system | |
291 | for (i = k = 0; i < 4; ++i) | |
292 | for (j = 0; j < 4; ++j) | |
293 | mat[k++] = i == j? 5 : -4; | |
294 | ||
295 | s = &p->nei[dir]; | |
296 | for (l = 0; l < s->n; ++l) { // if we use "if (p->nei[0].n + p->nei[1].n != 1)", s->n == 1 | |
297 | uint64_t v; | |
298 | kswq_t *qry; | |
299 | if ((int64_t)s->a[l].x < 0) continue; | |
300 | v = mag_tid2idd(g->h, s->a[l].x); | |
301 | q = &g->v.a[v>>1]; | |
302 | if (q == p || q->nei[v&1].n == 1) continue; | |
303 | // get the query ready | |
/* the buffer is reused for the targets below, which are cut off at max_l bases */
304 | max_l = (p->len - s->a[l].y) * 2; | |
305 | seq = malloc(max_l + 1); | |
/* NOTE(review): a base code of 5 would become 4 (forward) or wrap to 255 (reverse) here -- confirm p->seq cannot hold it */
306 | if (dir == 0) { // forward strand | |
307 | for (j = s->a[l].y, k = 0; j < p->len; ++j) | |
308 | seq[k++] = p->seq[j] - 1; | |
309 | } else { // reverse | |
310 | for (j = p->len - s->a[l].y - 1, k = 0; j >= 0; --j) | |
311 | seq[k++] = 4 - p->seq[j]; | |
312 | } | |
313 | l_qry = k; | |
314 | qry = ksw_qinit(2, l_qry, seq, 4, mat); | |
315 | //fprintf(stderr, "===> %lld:%lld:%d[%d], %d, %ld <===\n", p->k[0], p->k[1], s->n, l, p->nsr, q->nei[v&1].n); | |
316 | //for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fputc('\n', stderr); | |
317 | ||
318 | r = &q->nei[v&1]; | |
319 | for (i = 0; i < r->n; ++i) { | |
320 | uint64_t w; | |
321 | kswr_t aln; | |
322 | if (r->a[i].x == p->k[dir] || (int64_t)r->a[i].x < 0) continue; | |
323 | w = mag_tid2idd(g->h, r->a[i].x); | |
324 | // get the target sequence | |
325 | t = &g->v.a[w>>1]; | |
326 | if (w&1) { // reverse strand | |
327 | for (j = t->len - r->a[i].y - 1, k = 0; j >= 0 && k < max_l; --j) | |
328 | seq[k++] = 4 - t->seq[j]; | |
329 | } else { | |
330 | for (j = r->a[i].y, k = 0; j < t->len && k < max_l; ++j) | |
331 | seq[k++] = t->seq[j] - 1; | |
332 | } | |
333 | aln = ksw_align(0, 0, k, seq, 4, mat, 5, 2, 0, &qry); | |
334 | //for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fprintf(stderr, "\t%d\t%f\n", aln.score, (l_qry * 5. - aln.score) / (5. + 4.)); | |
/* only alignments scoring at least half of the maximum possible score are evaluated */
335 | if (aln.score >= l_qry * 5 / 2) { | |
336 | double r_diff, n_diff; | |
337 | n_diff = (l_qry * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatching score | |
338 | r_diff = n_diff / l_qry; | |
339 | if (n_diff < MAX_N_DIFF || r_diff < MAX_R_DIFF) break; | |
340 | } | |
341 | } | |
342 | ||
/* leaving the loop early (i != r->n) means a sufficiently similar sibling was found */
343 | if (i != r->n) { | |
344 | // mark delete in p and delete in q | |
345 | edge_mark_del(s->a[l]); | |
346 | for (i = 0; i < r->n; ++i) | |
347 | if (r->a[i].x == p->k[dir]) | |
348 | edge_mark_del(r->a[i]); | |
349 | } | |
350 | free(seq); free(qry); | |
351 | } | |
352 | ||
353 | for (i = 0; i < s->n; ++i) | |
354 | if (!edge_is_del(s->a[i])) break; | |
355 | if (i == s->n) mag_v_del(g, p); // p is not connected to any other vertices | |
356 | } | |
357 | ||
358 | void mag_g_pop_open(mag_t *g, int min_elen) | |
359 | { | |
360 | int64_t i; | |
361 | for (i = 0; i < g->v.n; ++i) | |
362 | mag_v_pop_open(g, &g->v.a[i], min_elen); | |
363 | if (fm_verbose >= 3) | |
364 | fprintf(stderr, "[M:%s] popped open bubbles\n", __func__); | |
365 | mag_g_merge(g, 0, 0); | |
366 | } |
0 | #include <unistd.h> | |
1 | #include <stdlib.h> | |
2 | #include <stdio.h> | |
3 | #include "fml.h" | |
4 | ||
5 | int main(int argc, char *argv[]) | |
6 | { | |
7 | fml_opt_t opt; | |
8 | int c, n_seqs, n_utg; | |
9 | bseq1_t *seqs; | |
10 | fml_utg_t *utg; | |
11 | ||
12 | fml_opt_init(&opt); | |
13 | while ((c = getopt(argc, argv, "Ae:l:r:t:c:")) >= 0) { | |
14 | if (c == 'e') opt.ec_k = atoi(optarg); | |
15 | else if (c == 'l') opt.min_asm_ovlp = atoi(optarg); | |
16 | else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg); | |
17 | else if (c == 'A') opt.mag_opt.flag |= MAG_F_AGGRESSIVE; | |
18 | else if (c == 't') opt.n_threads = atoi(optarg); | |
19 | else if (c == 'c') { | |
20 | char *p; | |
21 | opt.min_cnt = strtol(optarg, &p, 10); | |
22 | if (*p == ',') opt.max_cnt = strtol(p + 1, &p, 10); | |
23 | } | |
24 | } | |
25 | if (argc == optind) { | |
26 | fprintf(stderr, "Usage: fml-asm [options] <in.fq>\n"); | |
27 | fprintf(stderr, "Options:\n"); | |
28 | fprintf(stderr, " -e INT k-mer length for error correction (0 for auto; -1 to disable) [%d]\n", opt.ec_k); | |
29 | fprintf(stderr, " -c INT1[,INT2] range of k-mer & read count thresholds for ec and graph cleaning [%d,%d]\n", opt.min_cnt, opt.max_cnt); | |
30 | fprintf(stderr, " -l INT min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp); | |
31 | fprintf(stderr, " -r FLOAT drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1); | |
32 | fprintf(stderr, " -t INT number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads); | |
33 | fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes)\n"); | |
34 | return 1; | |
35 | } | |
36 | seqs = bseq_read(argv[optind], &n_seqs); | |
37 | utg = fml_assemble(&opt, n_seqs, seqs, &n_utg); | |
38 | fml_utg_print(n_utg, utg); | |
39 | fml_utg_destroy(n_utg, utg); | |
40 | return 0; | |
41 | } |
0 | #ifndef FML_H | |
1 | #define FML_H | |
2 | ||
3 | #define FML_VERSION "r41" | |
4 | ||
5 | #include <stdint.h> | |
6 | ||
/* One input sequence record, as filled in by bseq_read(). */
7 | typedef struct { | |
// length of seq
8 | int32_t l_seq; | |
// sequence string and quality string; bseq_read() leaves qual NULL when the record has no qualities
9 | char *seq, *qual; | |
10 | } bseq1_t; | |
11 | ||
12 | #define MAG_F_AGGRESSIVE 0x20 | |
13 | #define MAG_F_NO_SIMPL 0x80 | |
14 | ||
/*
 * Graph cleaning options (embedded in fml_opt_t as mag_opt).
 * flag is a bit set of the MAG_F_* values defined above.
 * NOTE(review): the meaning of the remaining integer fields is not visible
 * here -- confirm against the graph cleaning code before relying on them.
 */
15 | typedef struct { | |
16 | int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bvtx, min_merge_len, trim_len, trim_depth; | |
// min_dratio1: an overlap is dropped if its length is below maxOvlpLen*min_dratio1
17 | float min_dratio1, max_bcov, max_bfrac; | |
18 | } magopt_t; | |
19 | ||
20 | typedef struct { | |
21 | int n_threads; // number of threads; don't use multi-threading for small data sets | |
22 | int ec_k; // k-mer length for error correction; 0 for auto estimate | |
23 | int min_cnt, max_cnt; // both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt] | |
24 | int min_asm_ovlp; // min overlap length during assembly | |
25 | int min_merge_len; // during assembly, don't explicitly merge an overlap if shorter than this value | |
26 | magopt_t mag_opt; // graph cleaning options | |
27 | } fml_opt_t; | |
28 | ||
29 | struct rld_t; | |
30 | struct mag_t; | |
31 | ||
/*
 * One overlap of a unitig; fml_utg_t.ovlp is an array of these.
 * NOTE(review): presumably tid identifies the overlapping unitig, len is the
 * overlap length and from is a one-bit end selector -- confirm against the
 * code that fills this structure.
 */
32 | typedef struct { | |
33 | uint32_t tid; | |
34 | uint32_t len:31, from:1; | |
35 | } fml_ovlp_t; | |
36 | ||
37 | typedef struct { | |
38 | int32_t len; // length of sequence | |
39 | int32_t nsr; // number of supporting reads | |
40 | char *seq; // unitig sequence | |
41 | char *cov; // cov[i]-33 gives per-base coverage at i | |
42 | int n_ovlp[2]; // number of 5'-end [0] and 3'-end [1] overlaps | |
43 | fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1] | |
44 | } fml_utg_t; | |
45 | ||
46 | #ifdef __cplusplus | |
47 | extern "C" { | |
48 | #endif | |
49 | ||
50 | /************************ | |
51 | * High-level functions * | |
52 | ************************/ | |
53 | ||
54 | /** | |
55 | * Read all sequences from a FASTA/FASTQ file | |
56 | * | |
57 | * @param fn filename; NULL or "-" for stdin | |
58 | * @param n (out) number of sequences read into RAM | |
59 | * | |
60 | * @return array of sequences | |
61 | */ | |
62 | bseq1_t *bseq_read(const char *fn, int *n); | |
63 | ||
64 | /** | |
65 | * Initialize default parameters | |
66 | * | |
67 | * @param opt (out) pointer to parameters | |
68 | */ | |
69 | void fml_opt_init(fml_opt_t *opt); | |
70 | ||
71 | /** | |
72 | * Assemble a list of sequences | |
73 | * | |
74 | * @param opt parameters | |
75 | * @param n_seqs number of input sequences | |
76 | * @param seqs sequences to assemble; FREED on return | |
77 | * @param n_utg (out) number of unitigs in return | |
78 | * | |
79 | * @return array of unitigs | |
80 | */ | |
81 | fml_utg_t *fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg); | |
82 | ||
83 | /** | |
84 | * Free unitigs | |
85 | * | |
86 | * @param n_utg number of unitigs | |
87 | * @param utg array of unitigs | |
88 | */ | |
89 | void fml_utg_destroy(int n_utg, fml_utg_t *utg); | |
90 | ||
91 | /************************************************ | |
92 | * Mid-level functions called by fml_assemble() * | |
93 | ************************************************/ | |
94 | ||
95 | /** | |
96 | * Adjust parameters based on input sequences | |
97 | * | |
98 | * @param opt parameters to update IN PLACE | |
99 | * @param n_seqs number of sequences | |
100 | * @param seqs array of sequences | |
101 | */ | |
102 | void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs); | |
103 | ||
104 | /** | |
105 | * Error correction | |
106 | * | |
107 | * @param opt parameters | |
108 | * @param n number of sequences | |
109 | * @param seq array of sequences; corrected IN PLACE | |
110 | * | |
111 | * @return k-mer coverage | |
112 | */ | |
113 | float fml_correct(const fml_opt_t *opt, int n, bseq1_t *seq); | |
114 | float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq); | |
115 | ||
116 | /** | |
117 | * Construct FMD-index | |
118 | * | |
119 | * @param opt parameters | |
120 | * @param n number of sequences | |
121 | * @param seq array of sequences; FREED on return | |
122 | * | |
123 | * @return FMD-index | |
124 | */ | |
125 | struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq); | |
126 | ||
127 | /** | |
128 | * Generate initial overlap graph | |
129 | * | |
130 | * @param opt parameters | |
131 | * @param e FMD-index; FREED on return | |
132 | * | |
133 | * @return overlap graph in the "mag" structure | |
134 | */ | |
135 | struct mag_t *fml_fmi2mag(const fml_opt_t *opt, struct rld_t *e); | |
136 | ||
137 | /** | |
138 | * Clean a mag graph | |
139 | * | |
140 | * @param opt parameters | |
141 | * @param g overlap graph; modified IN PLACE | |
142 | */ | |
143 | void fml_mag_clean(const fml_opt_t *opt, struct mag_t *g); | |
144 | ||
145 | /** | |
146 | * Convert a graph in mag to fml_utg_t | |
147 | * | |
148 | * @param g graph in the "mag" structure; FREED on return | |
149 | * @param n_utg (out) number of unitigs | |
150 | * | |
151 | * @return array of unitigs | |
152 | */ | |
153 | fml_utg_t *fml_mag2utg(struct mag_t *g, int *n_utg); | |
154 | ||
155 | /** | |
156 | * Output unitig graph in the mag format | |
157 | * | |
158 | * @param n_utg number of unitigs | |
159 | * @param utg array of unitigs | |
160 | */ | |
161 | void fml_utg_print(int n_utgs, const fml_utg_t *utg); | |
162 | ||
163 | /** | |
164 | * Deallocate an FM-index | |
165 | * | |
166 | * @param e pointer to the FM-index | |
167 | */ | |
168 | void fml_fmi_destroy(struct rld_t *e); | |
169 | ||
170 | /** | |
171 | * Deallocate a mag graph | |
172 | * | |
173 | * @param g pointer to the mag graph | |
174 | */ | |
175 | void fml_mag_destroy(struct mag_t *g); | |
176 | ||
177 | #ifdef __cplusplus | |
178 | } | |
179 | #endif | |
180 | ||
181 | #endif |
0 | #include <stdio.h> | |
1 | #include <stdlib.h> | |
2 | #include <assert.h> | |
3 | #include "htab.h" | |
4 | #include "khash.h" | |
5 | ||
/*
 * Each table entry is one 64-bit word: the low 14 bits hold the counters and
 * the bits above them hold the k-mer key, so equality and hashing only look
 * at the word shifted right by 14.
 */
6 | #define _cnt_eq(a, b) ((a)>>14 == (b)>>14) | |
7 | #define _cnt_hash(a) ((a)>>14) | |
8 | KHASH_INIT(cnt, uint64_t, char, 0, _cnt_hash, _cnt_eq) | |
9 | typedef khash_t(cnt) cnthash_t; | |
10 | ||
/* K-mer count table, split into 1<<l_pre separate hash tables. */
11 | struct bfc_ch_s { | |
// k-mer length
12 | int k; | |
// array of 1<<l_pre hash tables
13 | cnthash_t **h; | |
14 | // private | |
// number of leading k-mer bits used to pick a hash table
15 | int l_pre; | |
16 | }; | |
17 | ||
18 | bfc_ch_t *bfc_ch_init(int k, int l_pre) | |
19 | { | |
20 | bfc_ch_t *ch; | |
21 | int i; | |
22 | assert(k <= 63); | |
23 | if (k * 2 - l_pre > BFC_CH_KEYBITS) | |
24 | l_pre = k * 2 - BFC_CH_KEYBITS; | |
25 | if (l_pre > BFC_CH_MAXPRE) l_pre = BFC_CH_MAXPRE; | |
26 | assert(k - l_pre < BFC_CH_KEYBITS); | |
27 | ch = calloc(1, sizeof(bfc_ch_t)); | |
28 | ch->k = k, ch->l_pre = l_pre; | |
29 | ch->h = calloc(1<<ch->l_pre, sizeof(void*)); | |
30 | for (i = 0; i < 1<<ch->l_pre; ++i) | |
31 | ch->h[i] = kh_init(cnt); | |
32 | return ch; | |
33 | } | |
34 | ||
35 | void bfc_ch_destroy(bfc_ch_t *ch) | |
36 | { | |
37 | int i; | |
38 | if (ch == 0) return; | |
39 | for (i = 0; i < 1<<ch->l_pre; ++i) | |
40 | kh_destroy(cnt, ch->h[i]); | |
41 | free(ch->h); free(ch); | |
42 | } | |
43 | ||
/**
 * Select the sub-hash for a hashed k-mer and build the word stored in it.
 *
 * @param ch   count table
 * @param x    the two halves of the hashed k-mer
 * @param key  (out) key bits shifted left by 14 with the lowest bit set, i.e.
 *             the low counter of a freshly inserted entry reads 1
 *
 * @return the sub-hash the k-mer belongs to
 */
44 | static inline cnthash_t *get_subhash(const bfc_ch_t *ch, const uint64_t x[2], uint64_t *key) | |
45 | { | |
46 | if (ch->k <= 32) { | |
/* both halves fit into one word: the low t bits form the key, the bits above them pick the sub-hash */
47 | int t = ch->k * 2 - ch->l_pre; | |
48 | uint64_t z = x[0] << ch->k | x[1]; | |
49 | *key = (z & ((1ULL<<t) - 1)) << 14 | 1; | |
50 | return ch->h[z>>t]; | |
51 | } else { | |
/* the sub-hash comes from the top of x[0]; its low t bits are shifted and XORed with x[1] to form the key */
52 | int t = ch->k - ch->l_pre; | |
53 | int shift = t + ch->k < BFC_CH_KEYBITS? ch->k : BFC_CH_KEYBITS - t; | |
54 | *key = ((x[0] & ((1ULL<<t) - 1)) << shift ^ x[1]) << 14 | 1; | |
55 | return ch->h[x[0]>>t]; | |
56 | } | |
57 | } | |
58 | ||
/**
 * Insert a hashed k-mer or increment its counters.
 *
 * The sub-hash is guarded by a spin lock kept in its `lock` member.
 *
 * @param ch      count table
 * @param x       the two halves of the hashed k-mer
 * @param is_high non-zero to also bump the second counter (bits 8..13)
 * @param forced  non-zero to wait for a busy sub-hash instead of giving up
 *
 * @return 0 on success; -1 if the sub-hash was locked and forced is 0
 */
59 | int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced) | |
60 | { | |
61 | int absent; | |
62 | uint64_t key; | |
63 | cnthash_t *h; | |
64 | khint_t k; | |
65 | h = get_subhash(ch, x, &key); | |
66 | if (__sync_lock_test_and_set(&h->lock, 1)) { | |
67 | if (forced) // then wait until the hash table is unlocked by the thread using it | |
68 | while (__sync_lock_test_and_set(&h->lock, 1)) | |
69 | while (h->lock); // lock | |
70 | else return -1; | |
71 | } | |
72 | k = kh_put(cnt, h, key, &absent); | |
73 | if (absent) { | |
/* new entry: the key already carries a low count of 1; set the second counter to 1 if requested */
74 | if (is_high) kh_key(h, k) |= 1<<8; | |
75 | } else { | |
/* existing entry: the low counter saturates at 0xff, the second counter at 0x3f */
76 | if ((kh_key(h, k) & 0xff) != 0xff) ++kh_key(h, k); | |
77 | if (is_high && (kh_key(h, k) >> 8 & 0x3f) != 0x3f) kh_key(h, k) += 1<<8; | |
78 | } | |
79 | __sync_lock_release(&h->lock); // unlock | |
80 | return 0; | |
81 | } | |
82 | ||
83 | int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2]) | |
84 | { | |
85 | uint64_t key; | |
86 | cnthash_t *h; | |
87 | khint_t itr; | |
88 | h = get_subhash(ch, x, &key); | |
89 | itr = kh_get(cnt, h, key); | |
90 | return itr == kh_end(h)? -1 : kh_key(h, itr) & 0x3fff; | |
91 | } | |
92 | ||
93 | int bfc_ch_kmer_occ(const bfc_ch_t *ch, const bfc_kmer_t *z) | |
94 | { | |
95 | uint64_t x[2]; | |
96 | bfc_kmer_hash(ch->k, z->x, x); | |
97 | return bfc_ch_get(ch, x); | |
98 | } | |
99 | ||
100 | uint64_t bfc_ch_count(const bfc_ch_t *ch) | |
101 | { | |
102 | int i; | |
103 | uint64_t cnt = 0; | |
104 | for (i = 0; i < 1<<ch->l_pre; ++i) | |
105 | cnt += kh_size(ch->h[i]); | |
106 | return cnt; | |
107 | } | |
108 | ||
109 | int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64]) | |
110 | { | |
111 | int i, max_i = -1; | |
112 | uint64_t max; | |
113 | memset(cnt, 0, 256 * 8); | |
114 | memset(high, 0, 64 * 8); | |
115 | for (i = 0; i < 1<<ch->l_pre; ++i) { | |
116 | khint_t k; | |
117 | cnthash_t *h = ch->h[i]; | |
118 | for (k = 0; k != kh_end(h); ++k) | |
119 | if (kh_exist(h, k)) | |
120 | ++cnt[kh_key(h, k) & 0xff], ++high[kh_key(h, k)>>8 & 0x3f]; | |
121 | } | |
122 | for (i = 3, max = 0; i < 256; ++i) | |
123 | if (cnt[i] > max) | |
124 | max = cnt[i], max_i = i; | |
125 | return max_i; | |
126 | } | |
127 | ||
128 | int bfc_ch_get_k(const bfc_ch_t *ch) | |
129 | { | |
130 | return ch->k; | |
131 | } |
0 | #ifndef BFC_HTAB_H | |
1 | #define BFC_HTAB_H | |
2 | ||
3 | #include <stdint.h> | |
4 | #include "kmer.h" | |
5 | ||
6 | #define BFC_CH_KEYBITS 50 | |
7 | #define BFC_CH_MAXPRE 20 | |
8 | ||
9 | struct bfc_ch_s; | |
10 | typedef struct bfc_ch_s bfc_ch_t; | |
11 | ||
12 | bfc_ch_t *bfc_ch_init(int k, int l_pre); | |
13 | void bfc_ch_destroy(bfc_ch_t *ch); | |
14 | int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced); | |
15 | int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2]); | |
16 | uint64_t bfc_ch_count(const bfc_ch_t *ch); | |
17 | int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64]); | |
18 | int bfc_ch_get_k(const bfc_ch_t *ch); | |
19 | ||
20 | int bfc_ch_kmer_occ(const bfc_ch_t *ch, const bfc_kmer_t *z); | |
21 | ||
22 | #endif |
0 | #ifndef FML_INTERNAL_H | |
1 | #define FML_INTERNAL_H | |
2 | ||
3 | #include "fml.h" | |
4 | ||
5 | extern unsigned char seq_nt6_table[256]; | |
6 | ||
7 | #ifdef __cplusplus | |
8 | extern "C" { | |
9 | #endif | |
10 | ||
11 | void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); | |
12 | void seq_reverse(int l, unsigned char *s); | |
13 | void seq_revcomp6(int l, unsigned char *s); | |
14 | struct bfc_ch_s *fml_count(int n, const bseq1_t *seq, int k, int q, int l_pre, int n_threads); | |
15 | ||
16 | #ifdef __cplusplus | |
17 | } | |
18 | #endif | |
19 | ||
20 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* | |
26 | An example: | |
27 | ||
28 | #include "khash.h" | |
29 | KHASH_MAP_INIT_INT(32, char) | |
30 | int main() { | |
31 | int ret, is_missing; | |
32 | khiter_t k; | |
33 | khash_t(32) *h = kh_init(32); | |
34 | k = kh_put(32, h, 5, &ret); | |
35 | kh_value(h, k) = 10; | |
36 | k = kh_get(32, h, 10); | |
37 | is_missing = (k == kh_end(h)); | |
38 | k = kh_get(32, h, 5); | |
39 | kh_del(32, h, k); | |
40 | for (k = kh_begin(h); k != kh_end(h); ++k) | |
41 | if (kh_exist(h, k)) kh_value(h, k) = 1; | |
42 | kh_destroy(32, h); | |
43 | return 0; | |
44 | } | |
45 | */ | |
46 | ||
47 | /* | |
48 | 2013-05-02 (0.2.8): | |
49 | ||
50 | * Use quadratic probing. When the capacity is power of 2, stepping function | |
51 | i*(i+1)/2 guarantees to traverse each bucket. It is better than double | |
52 | hashing on cache performance and is more robust than linear probing. | |
53 | ||
54 | In theory, double hashing should be more robust than quadratic probing. | |
55 | However, my implementation is probably not for large hash tables, because | |
56 | the second hash function is closely tied to the first hash function, | |
57 | which reduces the effectiveness of double hashing. | |
58 | ||
59 | Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php | |
60 | ||
61 | 2011-12-29 (0.2.7): | |
62 | ||
63 | * Minor code clean up; no actual effect. | |
64 | ||
65 | 2011-09-16 (0.2.6): | |
66 | ||
67 | * The capacity is a power of 2. This seems to dramatically improve the | |
68 | speed for simple keys. Thank Zilong Tan for the suggestion. Reference: | |
69 | ||
70 | - http://code.google.com/p/ulib/ | |
71 | - http://nothings.org/computer/judy/ | |
72 | ||
73 | * Allow to optionally use linear probing which usually has better | |
74 | performance for random input. Double hashing is still the default as it | |
75 | is more robust to certain non-random input. | |
76 | ||
77 | * Added Wang's integer hash function (not used by default). This hash | |
78 | function is more robust to certain non-random input. | |
79 | ||
80 | 2011-02-14 (0.2.5): | |
81 | ||
82 | * Allow to declare global functions. | |
83 | ||
84 | 2009-09-26 (0.2.4): | |
85 | ||
86 | * Improve portability | |
87 | ||
88 | 2008-09-19 (0.2.3): | |
89 | ||
90 | * Corrected the example | |
91 | * Improved interfaces | |
92 | ||
93 | 2008-09-11 (0.2.2): | |
94 | ||
95 | * Improved speed a little in kh_put() | |
96 | ||
97 | 2008-09-10 (0.2.1): | |
98 | ||
99 | * Added kh_clear() | |
100 | * Fixed a compiling error | |
101 | ||
102 | 2008-09-02 (0.2.0): | |
103 | ||
104 | * Changed to token concatenation which increases flexibility. | |
105 | ||
106 | 2008-08-31 (0.1.2): | |
107 | ||
108 | * Fixed a bug in kh_get(), which has not been tested previously. | |
109 | ||
110 | 2008-08-31 (0.1.1): | |
111 | ||
112 | * Added destructor | |
113 | */ | |
114 | ||
115 | ||
116 | #ifndef __AC_KHASH_H | |
117 | #define __AC_KHASH_H | |
118 | ||
119 | /*! | |
120 | @header | |
121 | ||
122 | Generic hash table library. | |
123 | */ | |
124 | ||
125 | #define AC_VERSION_KHASH_H "0.2.8" | |
126 | ||
127 | #include <stdlib.h> | |
128 | #include <string.h> | |
129 | #include <limits.h> | |
130 | ||
131 | /* compiler specific configuration */ | |
132 | ||
/*
 * Integer aliases used throughout the hash table, selected by comparing the
 * <limits.h> maxima of the candidate types.
 *
 * khint32_t must be exactly 32 bits wide; stop with a clear diagnostic when
 * neither unsigned int nor unsigned long qualifies instead of leaving the
 * type undefined and failing later at its first use.
 */
#if UINT_MAX == 0xffffffffu
typedef unsigned int khint32_t;
#elif ULONG_MAX == 0xffffffffu
typedef unsigned long khint32_t;
#else
#error "khash.h: no 32-bit unsigned integer type available for khint32_t"
#endif

/* khint64_t: unsigned long when it is as wide as unsigned long long. */
#if ULONG_MAX == ULLONG_MAX
typedef unsigned long khint64_t;
#else
typedef unsigned long long khint64_t;
#endif

/* Spelling of the inline keyword for the compiler in use. */
#ifdef _MSC_VER
#define kh_inline __inline
#else
#define kh_inline inline
#endif

/* khint_t: bucket counts, hash values and bucket indices.
 * khiter_t: iterator type handed to callers (a bucket index). */
typedef khint32_t khint_t;
typedef khint_t khiter_t;
153 | ||
/*
 * Per-bucket state flags: two bits per bucket, sixteen buckets per 32-bit
 * word.  Within a pair, bit 1 (value 2) means "empty" and bit 0 (value 1)
 * means "deleted"; a live bucket has both bits clear.
 *
 * Every macro parameter is parenthesized so that compound arguments such as
 * `base | 1` or `cond ? a : b` select the intended word and bit pair.
 * The argument `i` is evaluated more than once: do not pass expressions with
 * side effects.
 */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

/* Number of 32-bit flag words needed for m buckets (at least one). */
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)

/* Round the 32-bit lvalue x up to the next power of two, in place. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/* Allocator hooks: define any of these before including the header to route
 * the table's memory management through a custom allocator. */
#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) malloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) realloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif
180 | ||
/*
 * __KHASH_TYPE - declare the table structure kh_<name>_t.
 *
 *   n_buckets   number of buckets in keys[]/vals[]
 *   size        number of live elements
 *   n_occupied  number of buckets that are live or marked deleted
 *   lock        not referenced by the functions __KHASH_IMPL generates
 *   flags       two state bits per bucket (see __ac_isempty and friends)
 *   keys, vals  bucket storage; vals is only allocated for maps
 */
#define __KHASH_TYPE(name, khkey_t, khval_t) \
	typedef struct kh_##name##_s { \
		khint_t n_buckets; \
		khint_t size; \
		khint_t n_occupied; \
		volatile int lock; \
		khint32_t *flags; \
		khkey_t *keys; \
		khval_t *vals; \
	} kh_##name##_t;
189 | ||
/*
 * __KHASH_PROTOTYPES - declare (extern) the functions that __KHASH_IMPL
 * defines for the table called <name>, for use in headers.
 */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
	extern kh_##name##_t *kh_init_##name(void); \
	extern void kh_destroy_##name(kh_##name##_t *table); \
	extern void kh_clear_##name(kh_##name##_t *table); \
	extern khint_t kh_get_##name(const kh_##name##_t *table, khkey_t key); \
	extern int kh_resize_##name(kh_##name##_t *table, khint_t new_n_buckets); \
	extern khint_t kh_put_##name(kh_##name##_t *table, khkey_t key, int *ret); \
	extern void kh_del_##name(kh_##name##_t *table, khint_t iter);
198 | ||
/*
 * __KHASH_IMPL - define the functions of one hash table instance.
 *
 *   name          suffix of the generated identifiers (kh_init_<name>, ...)
 *   SCOPE         storage-class/inline specifiers put before each function
 *   khkey_t       key type
 *   khval_t       value type (storage is only allocated when kh_is_map)
 *   kh_is_map     non-zero for a map, zero for a set
 *   __hash_func   key -> khint_t hash
 *   __hash_equal  (a, b) -> non-zero when the keys are equal
 *
 * Generated functions:
 *   kh_init_<name>()          zero-initialized table, or NULL if kcalloc fails
 *   kh_destroy_<name>(h)      free the table; NULL is accepted
 *   kh_clear_<name>(h)        mark every bucket empty, keep the allocation
 *   kh_get_<name>(h, key)     bucket index of key, or h->n_buckets if absent
 *   kh_resize_<name>(h, n)    rehash into >= n buckets (rounded up to a power
 *                             of two, minimum 4); 0 on success, -1 on
 *                             allocation failure
 *   kh_put_<name>(h, key, r)  bucket index for key; *r is 0 if the key was
 *                             already present, 1 if a never-used bucket was
 *                             taken, 2 if a deleted bucket was reused, -1 if
 *                             the table could not be resized
 *   kh_del_<name>(h, x)       mark bucket x deleted if it holds an element
 *
 * Probing is quadratic: the step grows by one on every collision.
 */
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	SCOPE kh_##name##_t *kh_init_##name(void) { \
		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
	} \
	SCOPE void kh_destroy_##name(kh_##name##_t *h) \
	{ \
		if (h) { \
			kfree((void *)h->keys); kfree(h->flags); \
			kfree((void *)h->vals); \
			kfree(h); \
		} \
	} \
	SCOPE void kh_clear_##name(kh_##name##_t *h) \
	{ \
		if (h && h->flags) { \
			/* 0xaa sets the "empty" bit of every two-bit flag pair */ \
			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
			h->size = h->n_occupied = 0; \
		} \
	} \
	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
	{ \
		if (h->n_buckets) { \
			khint_t k, i, last, mask, step = 0; \
			mask = h->n_buckets - 1; \
			k = __hash_func(key); i = k & mask; \
			last = i; \
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				i = (i + (++step)) & mask; \
				if (i == last) return h->n_buckets; \
			} \
			return __ac_iseither(h->flags, i)? h->n_buckets : i; \
		} else return 0; \
	} \
	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
		khint32_t *new_flags = 0; \
		khint_t j = 1; \
		{ \
			kroundup32(new_n_buckets); \
			if (new_n_buckets < 4) new_n_buckets = 4; \
			if (h->size >= (new_n_buckets>>1) + (new_n_buckets>>2)) j = 0;	/* requested size is too small */ \
			else { /* hash table size to be changed (shrink or expand); rehash */ \
				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (!new_flags) return -1; \
				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (h->n_buckets < new_n_buckets) {	/* expand */ \
					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (!new_keys) { kfree(new_flags); return -1; } /* do not leak the new flag array */ \
					h->keys = new_keys; \
					if (kh_is_map) { \
						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
						if (!new_vals) { kfree(new_flags); return -1; } /* do not leak the new flag array */ \
						h->vals = new_vals; \
					} \
				} /* otherwise shrink */ \
			} \
		} \
		if (j) { /* rehashing is needed */ \
			for (j = 0; j != h->n_buckets; ++j) { \
				if (__ac_iseither(h->flags, j) == 0) { \
					khkey_t key = h->keys[j]; \
					khval_t val; \
					khint_t new_mask; \
					new_mask = new_n_buckets - 1; \
					if (kh_is_map) val = h->vals[j]; \
					__ac_set_isdel_true(h->flags, j); \
					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
						khint_t k, i, step = 0; \
						k = __hash_func(key); \
						i = k & new_mask; \
						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
						__ac_set_isempty_false(new_flags, i); \
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
						} else { /* write the element and jump out of the loop */ \
							h->keys[i] = key; \
							if (kh_is_map) h->vals[i] = val; \
							break; \
						} \
					} \
				} \
			} \
			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
				/* A failed shrinking realloc leaves the old, larger array valid: keep it rather than storing NULL. */ \
				khkey_t *shrunk_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (shrunk_keys) h->keys = shrunk_keys; \
				if (kh_is_map) { \
					khval_t *shrunk_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
					if (shrunk_vals) h->vals = shrunk_vals; \
				} \
			} \
			kfree(h->flags); /* free the working space */ \
			h->flags = new_flags; \
			h->n_buckets = new_n_buckets; \
			h->n_occupied = h->size; \
		} \
		return 0; \
	} \
	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{ \
		khint_t x; \
		if (h->n_occupied >= (h->n_buckets>>2) + (h->n_buckets>>1)) { /* update the hash table */ \
			if (h->n_buckets > (h->size<<1)) { \
				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
					*ret = -1; return h->n_buckets; \
				} \
			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
				*ret = -1; return h->n_buckets; \
			} \
		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
		{ \
			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
			else { \
				last = i; \
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					if (__ac_isdel(h->flags, i)) site = i; /* remember a reusable deleted slot */ \
					i = (i + (++step)) & mask; \
					if (i == last) { x = site; break; } \
				} \
				if (x == h->n_buckets) { \
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i; \
				} \
			} \
		} \
		if (__ac_isempty(h->flags, x)) { /* not present at all */ \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; ++h->n_occupied; \
			*ret = 1; \
		} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; \
			*ret = 2; \
		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
		return x; \
	} \
	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
	{ \
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
			__ac_set_isdel_true(h->flags, x); \
			--h->size; \
		} \
	}
343 | ||
/* KHASH_DECLARE - table type plus extern prototypes; meant for headers. */
#define KHASH_DECLARE(name, ktype, vtype) \
	__KHASH_TYPE(name, ktype, vtype) \
	__KHASH_PROTOTYPES(name, ktype, vtype)

/* KHASH_INIT2 - table type plus function definitions, each prefixed with the
 * caller-chosen SCOPE specifiers. */
#define KHASH_INIT2(name, SCOPE, ktype, vtype, is_map, hash_fn, equal_fn) \
	__KHASH_TYPE(name, ktype, vtype) \
	__KHASH_IMPL(name, SCOPE, ktype, vtype, is_map, hash_fn, equal_fn)

/* KHASH_INIT - KHASH_INIT2 with every function declared `static kh_inline`. */
#define KHASH_INIT(name, ktype, vtype, is_map, hash_fn, equal_fn) \
	KHASH_INIT2(name, static kh_inline, ktype, vtype, is_map, hash_fn, equal_fn)
354 | ||
355 | /* --- BEGIN OF HASH FUNCTIONS --- */ | |
356 | ||
/*! @function
  @abstract     Integer hash function: the key converted to 32 bits
  @param  k     The integer [khint32_t]
  @return       The hash value [khint_t]
 */
#define kh_int_hash_func(k) ((khint32_t)(k))
/*! @function
  @abstract     Integer comparison function
  @return       Non-zero when both keys are equal
 */
#define kh_int_hash_equal(lhs, rhs) ((lhs) == (rhs))
/*! @function
  @abstract     64-bit integer hash function: folds key>>33, key and key<<11
                together with XOR and keeps the low 32 bits
  @param  k     The integer [khint64_t]
  @return       The hash value [khint_t]
 */
#define kh_int64_hash_func(k) ((khint32_t)(((k) >> 33) ^ (k) ^ ((k) << 11)))
/*! @function
  @abstract     64-bit integer comparison function
  @return       Non-zero when both keys are equal
 */
#define kh_int64_hash_equal(lhs, rhs) ((lhs) == (rhs))
377 | /*! @function | |
378 | @abstract const char* hash function | |
379 | @param s Pointer to a null terminated string | |
380 | @return The hash value | |
381 | */ | |
382 | static kh_inline khint_t __ac_X31_hash_string(const char *s) | |
383 | { | |
384 | khint_t h = (khint_t)*s; | |
385 | if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; | |
386 | return h; | |
387 | } | |
388 | /*! @function | |
389 | @abstract Another interface to const char* hash function | |
390 | @param key Pointer to a null terminated string [const char*] | |
391 | @return The hash value [khint_t] | |
392 | */ | |
393 | #define kh_str_hash_func(key) __ac_X31_hash_string(key) | |
394 | /*! @function | |
395 | @abstract Const char* comparison function | |
396 | */ | |
397 | #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) | |
398 | ||
399 | static kh_inline khint_t __ac_Wang_hash(khint_t key) | |
400 | { | |
401 | key += ~(key << 15); | |
402 | key ^= (key >> 10); | |
403 | key += (key << 3); | |
404 | key ^= (key >> 6); | |
405 | key += ~(key << 11); | |
406 | key ^= (key >> 16); | |
407 | return key; | |
408 | } | |
/* Alternative integer hash built on __ac_Wang_hash().  The expansion uses the
 * macro's own parameter; it previously referred to an identifier named `key`
 * that had to exist at the point of use. */
#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k))
410 | ||
411 | /* --- END OF HASH FUNCTIONS --- */ | |
412 | ||
413 | /* Other convenient macros... */ | |
414 | ||
/*!
  @abstract Type of the hash table.
  @param  name  Name of the hash table [symbol]
 */
#define khash_t(name) kh_##name##_t

/*! @function
  @abstract     Allocate an empty hash table.
  @param  name  Name of the hash table [symbol]
  @return       Pointer to the hash table [khash_t(name)*]
 */
#define kh_init(name) kh_init_##name()

/*! @function
  @abstract     Free a hash table and its storage.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_destroy(name, h) kh_destroy_##name(h)

/*! @function
  @abstract     Remove all elements while keeping the allocated memory.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_clear(name, h) kh_clear_##name(h)

/*! @function
  @abstract     Resize a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  s     New size [khint_t]
  @return       0 on success, -1 if memory could not be allocated [int]
 */
#define kh_resize(name, h, s) kh_resize_##name(h, s)

/*! @function
  @abstract     Insert a key into the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @param  r     Extra return code: 0 if the key is already present;
                1 if the bucket was empty (never used); 2 if the bucket held
                a deleted element; -1 if the table could not be resized [int*]
  @return       Iterator to the inserted element, or kh_end(h) when r is -1 [khint_t]
 */
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)

/*! @function
  @abstract     Look a key up in the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove an element from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to be deleted [khint_t]
 */
#define kh_del(name, h, k) kh_del_##name(h, k)

/*! @function
  @abstract     Test whether a bucket holds a live element.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 if containing data; 0 otherwise [int]
 */
#define kh_exist(h, x) (__ac_iseither((h)->flags, (x)) == 0)

/*! @function
  @abstract     Key stored in a bucket.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
 */
#define kh_key(h, x) ((h)->keys[(x)])

/*! @function
  @abstract     Value stored in a bucket.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
 */
#define kh_val(h, x) ((h)->vals[(x)])

/*! @function
  @abstract     Alias of kh_val()
 */
#define kh_value(h, x) ((h)->vals[(x)])

/*! @function
  @abstract     First iterator position (bucket 0).
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The start iterator [khint_t]
 */
#define kh_begin(h) ((khint_t)0)

/*! @function
  @abstract     One-past-the-last iterator position.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator [khint_t]
 */
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Number of elements in the hash table.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements in the hash table [khint_t]
 */
#define kh_size(h) ((h)->size)

/*! @function
  @abstract     Number of buckets in the hash table.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets in the hash table [khint_t]
 */
#define kh_n_buckets(h) ((h)->n_buckets)
536 | ||
/*! @function
  @abstract     Iterate over the entries in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Variable to which key will be assigned
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Buckets without a live element are skipped.  The loop index
                is a local variable named __i.
 */
#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
		if (kh_exist(h,__i)) {						\
			(kvar) = kh_key(h,__i);					\
			(vvar) = kh_val(h,__i);					\
			code;							\
		}								\
	} }

/*! @function
  @abstract     Iterate over the values in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Buckets without a live element are skipped.  The loop index
                is a local variable named __i.
 */
#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
		if (kh_exist(h,__i)) {						\
			(vvar) = kh_val(h,__i);					\
			code;							\
		}								\
	} }
564 | ||
565 | /* More convenient interfaces */ | |
566 | ||
/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name) \
	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t) \
	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name) \
	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t) \
	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

/* Key type of the string tables: the table stores the pointer only. */
typedef const char *kh_cstr_t;
/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_STR(name) \
	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_STR(name, khval_t) \
	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
612 | ||
613 | #endif /* __AC_KHASH_H */ |
0 | #ifndef BFC_KMER_H | |
1 | #define BFC_KMER_H | |
2 | ||
3 | #include <stdint.h> | |
4 | ||
/* One k-mer as four 64-bit bit planes; see bfc_kmer_append() for how the
 * planes are filled. */
typedef struct { uint64_t x[4]; } bfc_kmer_t;
8 | ||
/*
 * Append base c (IMPORTANT: 0 <= c < 4) to the k-mer held in x.
 *
 * x[0]/x[1] receive the low/high bit of c at bit 0 after shifting left and
 * are truncated to k bits.  x[2]/x[3] receive the inverted low/high bit of c
 * at bit k-1 after shifting right.
 */
static inline void bfc_kmer_append(int k, uint64_t x[4], int c)
{
	const uint64_t keep = (1ULL << k) - 1;
	const uint64_t lo = (uint64_t)(c & 1);
	const uint64_t hi = (uint64_t)(c >> 1);
	const int top = k - 1;

	x[0] = ((x[0] << 1) | lo) & keep;
	x[1] = ((x[1] << 1) | hi) & keep;
	x[2] = (x[2] >> 1) | ((1ULL ^ lo) << top);
	x[3] = (x[3] >> 1) | ((1ULL ^ hi) << top);
}
17 | ||
/*
 * Replace the base d positions from the 3'-end of the k-mer (0 <= d < k)
 * with c (IMPORTANT: 0 <= c < 4).
 *
 * Bit d of x[0]/x[1] is set to the low/high bit of c; bit k-1-d of x[2]/x[3]
 * is set to the inverted low/high bit of c.
 */
static inline void bfc_kmer_change(int k, uint64_t x[4], int d, int c)
{
	const int fwd = d;
	const int rev = k - 1 - d;
	const uint64_t fwd_keep = ~(1ULL << fwd);
	const uint64_t rev_keep = ~(1ULL << rev);

	x[0] = (x[0] & fwd_keep) | ((uint64_t)(c & 1) << fwd);
	x[1] = (x[1] & fwd_keep) | ((uint64_t)(c >> 1) << fwd);
	x[2] = (x[2] & rev_keep) | ((uint64_t)(1 ^ (c & 1)) << rev);
	x[3] = (x[3] & rev_keep) | ((uint64_t)(1 ^ (c >> 1)) << rev);
}
27 | ||
/*
 * Thomas Wang's integer hash functions. See
 * <https://gist.github.com/lh3/59882d6b96166dfc3d8d> for a snapshot.
 *
 * Hash key, applying mask after every additive/multiplicative step so the
 * result stays within the bits selected by mask.
 */
static inline uint64_t bfc_hash_64(uint64_t key, uint64_t mask)
{
	key = ((key << 21) + ~key) & mask;   /* (key << 21) - key - 1 */
	key ^= key >> 24;
	key = (key * 265) & mask;            /* key + (key << 3) + (key << 8) */
	key ^= key >> 14;
	key = (key * 21) & mask;             /* key + (key << 2) + (key << 4) */
	key ^= key >> 28;
	key = (key + (key << 31)) & mask;
	return key;
}
40 | ||
/*
 * Undo bfc_hash_64() step by step, last step first, masking with mask after
 * each additive/multiplicative step.
 */
static inline uint64_t bfc_hash_64_inv(uint64_t key, uint64_t mask)
{
	uint64_t acc;
	int round;

	/* Invert key = key + (key << 31) */
	acc = key - (key << 31);
	key = (key - (acc << 31)) & mask;

	/* Invert key = key ^ (key >> 28) */
	acc = key ^ (key >> 28);
	key ^= acc >> 28;

	/* Invert key *= 21 */
	key = (key * 14933078535860113213ULL) & mask;

	/* Invert key = key ^ (key >> 14): three refinement rounds, then apply */
	acc = key;
	for (round = 0; round < 3; ++round)
		acc = key ^ (acc >> 14);
	key ^= acc >> 14;

	/* Invert key *= 265 */
	key = (key * 15244667743933553977ULL) & mask;

	/* Invert key = key ^ (key >> 24) */
	acc = key ^ (key >> 24);
	key ^= acc >> 24;

	/* Invert key = (~key) + (key << 21): two refinement rounds, then apply */
	acc = ~key;
	for (round = 0; round < 2; ++round)
		acc = ~(key - (acc << 21));
	key = ~(key - (acc << 21)) & mask;

	return key;
}
77 | ||
/*
 * Hash the k-mer in x.
 *
 * One plane pair is selected, x[0..1] or x[2..3], by comparing bit k/2 of
 * x[1] and x[3] (the middle base is always different).  On return
 * h[0] = (a + b) & mask and h[1] = b, where a and b are the two intermediate
 * hashes; the return value is (a ^ b) << k | h[0].
 */
static inline uint64_t bfc_kmer_hash(int k, const uint64_t x[4], uint64_t h[2])
{
	const int mid = k >> 1;
	const int use_second = ((x[1] >> mid) & 1) > ((x[3] >> mid) & 1);
	const uint64_t *plane = x + (use_second << 1);
	const uint64_t mask = (1ULL << k) - 1;
	uint64_t sum, ret;

	h[0] = bfc_hash_64((plane[0] + plane[1]) & mask, mask);
	h[1] = bfc_hash_64(h[0] ^ plane[1], mask);
	sum = (h[0] + h[1]) & mask;
	ret = ((h[0] ^ h[1]) << k) | sum;
	h[0] = sum;
	return ret;
}
88 | ||
/*
 * Recover the two hashed planes from the h[] pair left by bfc_kmer_hash():
 * y[1] is the plane that was XOR-ed in, y[0] the other one, both k bits wide.
 */
static inline void bfc_kmer_hash_inv(int k, const uint64_t h[2], uint64_t y[2])
{
	const uint64_t mask = (1ULL << k) - 1;
	const uint64_t first = (h[0] - h[1]) & mask; /* the first intermediate hash */

	y[1] = bfc_hash_64_inv(h[1], mask) ^ first;
	y[0] = (bfc_hash_64_inv(first, mask) - y[1]) & mask;
}
95 | ||
/*
 * Write the k-mer as a NUL-terminated string of k letters into buf, which
 * must hold at least k + 1 bytes.  Bit l of y[0]/y[1] is the low/high bit of
 * the base l positions from the end; codes 0..3 map to A, C, G, T.
 * Returns buf.
 */
static inline char *bfc_kmer_2str(int k, const uint64_t y[2], char *buf)
{
	static const char alphabet[] = "ACGT";
	char *out = buf + k;
	int bit;

	for (bit = 0; bit < k; ++bit) {
		const unsigned hi = (unsigned)((y[1] >> bit) & 1);
		const unsigned lo = (unsigned)((y[0] >> bit) & 1);
		*--out = alphabet[(hi << 1) | lo];
	}
	buf[k] = 0;
	return buf;
}
104 | ||
105 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* Last Modified: 05MAR2012 */ | |
26 | ||
27 | #ifndef AC_KSEQ_H | |
28 | #define AC_KSEQ_H | |
29 | ||
30 | #include <ctype.h> | |
31 | #include <string.h> | |
32 | #include <stdlib.h> | |
33 | ||
/* Delimiter classes accepted by ks_getuntil2(); a delimiter value greater
 * than KS_SEP_MAX is matched as a literal character instead. */
#define KS_SEP_SPACE 0 /* any character for which isspace() is true */
#define KS_SEP_TAB   1 /* isspace() characters other than ' ' */
#define KS_SEP_LINE  2 /* line separator: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   2 /* largest delimiter-class value */
38 | ||
/*
 * __KS_TYPE - declare kstream_t, a buffered reader over a handle of type_t.
 *
 *   begin, end  unread data is buf[begin .. end-1]
 *   is_eof      set once a read returned fewer bytes than bufsize
 *   bufsize     capacity of buf in bytes
 *   f           the underlying handle passed to the read function
 *   buf         the read buffer
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin; \
		int end; \
		int is_eof:2; \
		int bufsize:30; \
		type_t f; \
		unsigned char *buf; \
	} kstream_t;

/* Non-zero when the stream hit end-of-file and the buffer is drained. */
#define ks_eof(ks) ((ks)->is_eof != 0 && !((ks)->begin < (ks)->end))
/* Forget buffered data and the EOF mark; evaluates to 0.  The underlying
 * handle itself is not repositioned here. */
#define ks_rewind(ks) ((ks)->end = 0, (ks)->begin = 0, (ks)->is_eof = 0)
49 | ||
/*
 * __KS_BASIC - define ks_init() and ks_destroy() for kstream_t.
 *
 *   ks_init(f)      allocate a stream over handle f with a buffer of
 *                   __bufsize bytes.  Returns NULL when either allocation
 *                   fails (nothing is leaked in that case).
 *   ks_destroy(ks)  free the stream and its buffer; NULL is accepted.  The
 *                   handle f is not closed.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (!ks) return 0; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (!ks->buf) { free(ks); return 0; } \
		ks->f = f; ks->bufsize = __bufsize; \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->buf); \
		free(ks); \
	}
64 | ||
/*
 * __KS_INLINED - define the inline helpers ks_getc() and ks_getuntil().
 *
 *   ks_getc(ks)  next byte of the stream as a non-negative int, or -1 when
 *                no more data is available.  The buffer is refilled through
 *                __read(f, buf, bufsize) when it runs dry.  A zero or
 *                negative result from __read (read()/gzread() report errors
 *                as a negative count) ends the stream instead of returning a
 *                stale byte from the buffer.
 *   ks_getuntil(ks, delimiter, str, dret)
 *                ks_getuntil2() with append = 0.
 */
#define __KS_INLINED(__read) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end < ks->bufsize) ks->is_eof = 1; \
			if (ks->end <= 0) { ks->end = 0; return -1; } \
		} \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
79 | ||
/* Growable string: l bytes in use, m bytes allocated, s the storage.
 * Guarded so that another header may provide the same type first. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l;
	size_t m;
	char *s;
} kstring_t;
#endif

/* Round the lvalue x up to the next power of two, in place, by smearing the
 * highest set bit of x-1 over the lower 32 bits. */
#ifndef kroundup32
#define kroundup32(x) (--(x), \
	(x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, \
	++(x))
#endif
91 | ||
/*
 * __KS_GETUNTIL - define ks_getuntil2(), which reads up to the next
 * delimiter into str.
 *
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal character
 *              greater than KS_SEP_MAX
 *   str        destination; cleared first unless append is non-zero, grown
 *              as needed and always NUL-terminated on return
 *   dret       if not NULL, receives the delimiter that ended the read, or 0
 *              when the data ran out first
 *   return     the resulting length of str, or -1 when the stream was
 *              already exhausted on entry
 *
 * The delimiter itself is consumed but not stored.  For KS_SEP_LINE a
 * trailing '\r' is dropped.
 */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		if (dret) *dret = 0; \
		if (!append) str->l = 0; \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		while (1) { \
			int stop, chunk; \
			if (ks->begin >= ks->end) { /* buffer drained: refill or finish */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end < ks->bufsize) ks->is_eof = 1; \
				if (ks->end == 0) break; \
			} \
			/* advance stop to the first delimiter in the buffer, or to end */ \
			stop = ks->begin; \
			if (delimiter == KS_SEP_LINE) { \
				while (stop < ks->end && ks->buf[stop] != '\n') ++stop; \
			} else if (delimiter > KS_SEP_MAX) { \
				while (stop < ks->end && ks->buf[stop] != delimiter) ++stop; \
			} else if (delimiter == KS_SEP_SPACE) { \
				while (stop < ks->end && !isspace(ks->buf[stop])) ++stop; \
			} else if (delimiter == KS_SEP_TAB) { \
				while (stop < ks->end && !(isspace(ks->buf[stop]) && ks->buf[stop] != ' ')) ++stop; \
			} else stop = 0; /* never come to here! */ \
			chunk = stop - ks->begin; \
			if (str->m - str->l < (size_t)(chunk + 1)) { /* room for the chunk plus the terminator */ \
				str->m = str->l + chunk + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, chunk); \
			str->l = str->l + chunk; \
			ks->begin = stop + 1; /* step over the delimiter */ \
			if (stop < ks->end) { \
				if (dret) *dret = ks->buf[stop]; \
				break; \
			} \
		} \
		if (str->s == 0) { /* nothing was ever stored: hand back an empty string */ \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	}
141 | ||
/* KSTREAM_INIT2 - stream type plus all stream functions; ks_init(),
 * ks_destroy() and ks_getuntil2() are defined with the SCOPE specifiers. */
#define KSTREAM_INIT2(SCOPE, type_t, read_fn, buf_size) \
	__KS_TYPE(type_t) \
	__KS_BASIC(SCOPE, type_t, buf_size) \
	__KS_GETUNTIL(SCOPE, read_fn) \
	__KS_INLINED(read_fn)

/* KSTREAM_INIT - KSTREAM_INIT2 with static linkage. */
#define KSTREAM_INIT(type_t, read_fn, buf_size) KSTREAM_INIT2(static, type_t, read_fn, buf_size)

/* KSTREAM_DECLARE - stream type, extern prototypes and the inline helpers,
 * for code that links against definitions emitted elsewhere. */
#define KSTREAM_DECLARE(type_t, read_fn) \
	__KS_TYPE(type_t) \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	__KS_INLINED(read_fn)
156 | ||
157 | /****************** | |
158 | * FASTA/Q parser * | |
159 | ******************/ | |
160 | ||
/* Reset the parser state of a kseq_t: one chained assignment zeroes last_char
 * and the stream's is_eof, begin and end fields. The macro does not touch the
 * underlying file handle -- presumably the caller repositions it separately
 * (TODO confirm against callers). */
161 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) | |
162 | ||
/* __KSEQ_BASIC: emit the constructor/destructor pair for kseq_t.
 *   kseq_init(fd)    - allocates a zero-initialised kseq_t and attaches the
 *                      stream returned by ks_init(fd); returns NULL when the
 *                      record itself cannot be allocated.
 *   kseq_destroy(ks) - frees the name/comment/seq/qual buffers, destroys the
 *                      stream and frees the record; a NULL argument is ignored. */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == 0) return 0; /* out of memory: report it instead of dereferencing NULL */ \
		s->f = ks_init(fd); \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
177 | ||
/* __KSEQ_READ: emit kseq_read(), which parses one FASTA/FASTQ record into `seq`.
 * Steps, as implemented below:
 *   1. unless a header character is left over from the previous call
 *      (last_char != 0), skip input up to the next '>' or '@';
 *   2. read the name with ks_getuntil(ks, 0, ...) and, if the delimiter that
 *      ended it was not '\n', read the rest of the header line into `comment`;
 *   3. append sequence lines to `seq` until '>', '+', '@' or end of input;
 *   4. for FASTQ ('+'), skip the rest of the '+' line and append quality lines
 *      until `qual` is at least as long as `seq`.
 * The comment/seq/qual lengths are reset to 0 on every call; their buffers are
 * kept and reused. */
178 | /* Return value: | |
179 | >=0 length of the sequence (normal) | |
180 | -1 end-of-file | |
181 | -2 truncated quality string | |
182 | */ | |
183 | #define __KSEQ_READ(SCOPE) \ | |
184 | SCOPE int kseq_read(kseq_t *seq) \ | |
185 | { \ | |
186 | int c; \ | |
187 | kstream_t *ks = seq->f; \ | |
188 | if (seq->last_char == 0) { /* then jump to the next header line */ \ | |
189 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ | |
190 | if (c == -1) return -1; /* end of file */ \ | |
191 | seq->last_char = c; \ | |
192 | } /* else: the first header char has been read in the previous call */ \ | |
193 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ | |
194 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ | |
195 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ | |
196 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ | |
197 | seq->seq.m = 256; \ | |
198 | seq->seq.s = (char*)malloc(seq->seq.m); \ | |
199 | } \ | |
200 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ | |
201 | if (c == '\n') continue; /* skip empty lines */ \ | |
202 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ | |
203 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ | |
204 | } \ | |
205 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ | |
206 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ | |
207 | seq->seq.m = seq->seq.l + 2; \ | |
208 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ | |
209 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ | |
210 | } \ | |
211 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ | |
212 | if (c != '+') return seq->seq.l; /* FASTA */ \ | |
213 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ | |
214 | seq->qual.m = seq->seq.m; \ | |
215 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ | |
216 | } \ | |
217 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ | |
218 | if (c == -1) return -2; /* error: no quality string */ \ | |
219 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ | |
220 | seq->last_char = 0; /* we have not come to the next header line */ \ | |
221 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ | |
222 | return seq->seq.l; \ | |
223 | } | |
224 | ||
/* __KSEQ_TYPE: define kseq_t, the per-file parser record. It holds the four
 * growable strings filled by kseq_read() (name, comment, seq, qual), the header
 * character carried over between calls (last_char, 0 when none) and the
 * buffered input stream f. The type_t argument is not used in the expansion. */
225 | #define __KSEQ_TYPE(type_t) \ | |
226 | typedef struct { \ | |
227 | kstring_t name, comment, seq, qual; \ | |
228 | int last_char; \ | |
229 | kstream_t *f; \ | |
230 | } kseq_t; | |
231 | ||
/* KSEQ_INIT2: instantiate the whole FASTA/FASTQ reader for handle type `type_t`
 * and read function `__read`: the stream layer (KSTREAM_INIT2 with a buffer
 * size of 16384), the kseq_t type, kseq_init()/kseq_destroy() and kseq_read().
 * SCOPE is forwarded to every generated function. */
232 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ | |
233 | KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ | |
234 | __KSEQ_TYPE(type_t) \ | |
235 | __KSEQ_BASIC(SCOPE, type_t) \ | |
236 | __KSEQ_READ(SCOPE) | |
237 | ||
/* KSEQ_INIT: KSEQ_INIT2 with SCOPE fixed to `static`. */
238 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) | |
239 | ||
/* KSEQ_DECLARE: declarations only -- expands the stream and kseq_t types and
 * declares kseq_init(), kseq_destroy() and kseq_read() without defining them. */
240 | #define KSEQ_DECLARE(type_t) \ | |
241 | __KS_TYPE(type_t) \ | |
242 | __KSEQ_TYPE(type_t) \ | |
243 | extern kseq_t *kseq_init(type_t fd); \ | |
244 | void kseq_destroy(kseq_t *ks); \ | |
245 | int kseq_read(kseq_t *seq); | |
246 | ||
247 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* | |
26 | 2011-04-10 (0.1.6): | |
27 | ||
28 | * Added sample | |
29 | ||
30 | 2011-03 (0.1.5): | |
31 | ||
32 | * Added shuffle/permutation | |
33 | ||
34 | 2008-11-16 (0.1.4): | |
35 | ||
36 | * Fixed a bug in introsort() that happens in rare cases. | |
37 | ||
38 | 2008-11-05 (0.1.3): | |
39 | ||
40 | * Fixed a bug in introsort() for complex comparisons. | |
41 | ||
42 | * Fixed a bug in mergesort(). The previous version is not stable. | |
43 | ||
44 | 2008-09-15 (0.1.2): | |
45 | ||
46 | * Accelerated introsort. On my Mac (not on another Linux machine), | |
47 | my implementation is as fast as std::sort on random input. | |
48 | ||
49 | * Added combsort and in introsort, switch to combsort if the | |
50 | recursion is too deep. | |
51 | ||
52 | 2008-09-13 (0.1.1): | |
53 | ||
54 | * Added k-small algorithm | |
55 | ||
56 | 2008-09-05 (0.1.0): | |
57 | ||
58 | * Initial version | |
59 | ||
60 | */ | |
61 | ||
62 | #ifndef AC_KSORT_H | |
63 | #define AC_KSORT_H | |
64 | ||
65 | #include <stdlib.h> | |
66 | #include <string.h> | |
67 | ||
/* One pending sub-range on the explicit stack used by ks_introsort_##name():
 * `left`/`right` are the inclusive bounds (stored as void* and cast back to
 * type_t* when popped) and `depth` is the depth budget saved with the range. */
68 | typedef struct { | |
69 | void *left, *right; | |
70 | int depth; | |
71 | } ks_isort_stack_t; | |
72 | ||
/* Swap two lvalues of type `type_t` through a block-local temporary.
 * `a` and `b` are each evaluated twice, so they must be free of side effects.
 * The temporary is a plain automatic variable: the `register` storage class
 * was dropped because it no longer exists in C++17. */
#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; }
74 | ||
/* KSORT_INIT(name, type_t, __sort_lt): generate a family of sorting/selection
 * routines for element type `type_t`, each suffixed with `_name`. __sort_lt(a, b)
 * must evaluate to non-zero when a orders strictly before b.
 *
 * First generated function: ks_mergesort_##name(n, array, temp), a bottom-up
 * merge sort of array[0..n). `temp` is scratch space for n elements; when it
 * is NULL the function allocates (and later frees) its own. Ties take the
 * left-hand element first. */
75 | #define KSORT_INIT(name, type_t, __sort_lt) \ | |
76 | void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ | |
77 | { \ | |
78 | type_t *a2[2], *a, *b; \ | |
79 | int curr, shift; \ | |
80 | \ | |
/* a2[0]/a2[1] are the two ping-pong buffers; curr selects the one holding the current data */ \
81 | a2[0] = array; \ | |
82 | a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ | |
83 | for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \ | |
84 | a = a2[curr]; b = a2[1-curr]; \ | |
/* first pass: order adjacent pairs directly while copying from a to b */ \
85 | if (shift == 0) { \ | |
86 | type_t *p = b, *i, *eb = a + n; \ | |
87 | for (i = a; i < eb; i += 2) { \ | |
88 | if (i == eb - 1) *p++ = *i; \ | |
89 | else { \ | |
90 | if (__sort_lt(*(i+1), *i)) { \ | |
91 | *p++ = *(i+1); *p++ = *i; \ | |
92 | } else { \ | |
93 | *p++ = *i; *p++ = *(i+1); \ | |
94 | } \ | |
95 | } \ | |
96 | } \ | |
/* later passes: merge neighbouring runs of length `step` from a into b */ \
97 | } else { \ | |
98 | size_t i, step = 1ul<<shift; \ | |
99 | for (i = 0; i < n; i += step<<1) { \ | |
100 | type_t *p, *j, *k, *ea, *eb; \ | |
101 | if (n < i + step) { \ | |
102 | ea = a + n; eb = a; \ | |
103 | } else { \ | |
104 | ea = a + i + step; \ | |
105 | eb = a + (n < i + (step<<1)? n : i + (step<<1)); \ | |
106 | } \ | |
107 | j = a + i; k = a + i + step; p = b + i; \ | |
108 | while (j < ea && k < eb) { \ | |
109 | if (__sort_lt(*k, *j)) *p++ = *k++; \ | |
110 | else *p++ = *j++; \ | |
111 | } \ | |
112 | while (j < ea) *p++ = *j++; \ | |
113 | while (k < eb) *p++ = *k++; \ | |
114 | } \ | |
115 | } \ | |
116 | curr = 1 - curr; \ | |
117 | } \ | |
/* the sorted data ended up in the scratch buffer: copy it back into array */ \
118 | if (curr == 1) { \ | |
119 | type_t *p = a2[0], *i = a2[1], *eb = array + n; \ | |
120 | for (; p < eb; ++i) *p++ = *i; \ | |
121 | } \ | |
122 | if (temp == 0) free(a2[1]); \ | |
123 | } \ | |
124 | void ks_heapdown_##name(size_t i, size_t n, type_t l[]) \ | |
125 | { \ | |
126 | size_t k = i; \ | |
127 | type_t tmp = l[i]; \ | |
128 | while ((k = (k << 1) + 1) < n) { \ | |
129 | if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \ | |
130 | if (__sort_lt(l[k], tmp)) break; \ | |
131 | l[i] = l[k]; i = k; \ | |
132 | } \ | |
133 | l[i] = tmp; \ | |
134 | } \ | |
135 | void ks_heapup_##name(size_t n, type_t l[]) \ | |
136 | { \ | |
137 | size_t i, k = n - 1; \ | |
138 | type_t tmp = l[k]; \ | |
139 | while (k) { \ | |
140 | i = (k - 1) >> 1; \ | |
141 | if (__sort_lt(tmp, l[i])) break; \ | |
142 | l[k] = l[i]; k = i; \ | |
143 | } \ | |
144 | l[k] = tmp; \ | |
145 | } \ | |
146 | void ks_heapmake_##name(size_t lsize, type_t l[]) \ | |
147 | { \ | |
148 | size_t i; \ | |
149 | for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \ | |
150 | ks_heapdown_##name(i, lsize, l); \ | |
151 | } \ | |
152 | void ks_heapsort_##name(size_t lsize, type_t l[]) \ | |
153 | { \ | |
154 | size_t i; \ | |
155 | for (i = lsize - 1; i > 0; --i) { \ | |
156 | type_t tmp; \ | |
157 | tmp = *l; *l = l[i]; l[i] = tmp; ks_heapdown_##name(0, i, l); \ | |
158 | } \ | |
159 | } \ | |
160 | static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ | |
161 | { \ | |
162 | type_t *i, *j, swap_tmp; \ | |
163 | for (i = s + 1; i < t; ++i) \ | |
164 | for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ | |
165 | swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ | |
166 | } \ | |
167 | } \ | |
/* Comb sort of a[0..n): compare/swap elements `gap` apart, dividing the gap by */ \
/* shrink_factor before each pass while it is above 2 (a gap of 9 or 10 is */ \
/* replaced by 11). Passes repeat until one makes no swap with gap <= 2; a final */ \
/* insertion sort runs unless the gap is exactly 1. */ \
168 | void ks_combsort_##name(size_t n, type_t a[]) \ | |
169 | { \ | |
170 | const double shrink_factor = 1.2473309501039786540366528676643; \ | |
171 | int do_swap; \ | |
172 | size_t gap = n; \ | |
173 | type_t tmp, *i, *j; \ | |
174 | do { \ | |
175 | if (gap > 2) { \ | |
176 | gap = (size_t)(gap / shrink_factor); \ | |
177 | if (gap == 9 || gap == 10) gap = 11; \ | |
178 | } \ | |
179 | do_swap = 0; \ | |
180 | for (i = a; i < a + n - gap; ++i) { \ | |
181 | j = i + gap; \ | |
182 | if (__sort_lt(*j, *i)) { \ | |
183 | tmp = *i; *i = *j; *j = tmp; \ | |
184 | do_swap = 1; \ | |
185 | } \ | |
186 | } \ | |
187 | } while (do_swap || gap > 2); \ | |
188 | if (gap != 1) __ks_insertsort_##name(a, a + n); \ | |
189 | } \ | |
/* Introsort of a[0..n): iterative quicksort on an explicit stack of */ \
/* ks_isort_stack_t entries. Each partitioning step spends one unit of a depth */ \
/* budget; when the budget reaches 0 the current range is handed to */ \
/* ks_combsort_##name. Sides of a partition spanning 16 positions or fewer are */ \
/* not processed further by the quicksort loop; the single insertion sort over */ \
/* the whole array at the end finishes them. */ \
190 | void ks_introsort_##name(size_t n, type_t a[]) \ | |
191 | { \ | |
192 | int d; \ | |
193 | ks_isort_stack_t *top, *stack; \ | |
194 | type_t rp, swap_tmp; \ | |
195 | type_t *s, *t, *i, *j, *k; \ | |
196 | \ | |
197 | if (n < 1) return; \ | |
198 | else if (n == 2) { \ | |
199 | if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ | |
200 | return; \ | |
201 | } \ | |
/* d = smallest value >= 2 with 2^d >= n; it sizes the stack and, doubled, is the depth budget */ \
202 | for (d = 2; 1ul<<d < n; ++d); \ | |
203 | stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \ | |
204 | top = stack; s = a; t = a + (n-1); d <<= 1; \ | |
205 | while (1) { \ | |
206 | if (s < t) { \ | |
207 | if (--d == 0) { \ | |
208 | ks_combsort_##name(t - s + 1, s); \ | |
209 | t = s; \ | |
210 | continue; \ | |
211 | } \ | |
/* choose the pivot among the first, middle and last elements and park it at *t */ \
212 | i = s; j = t; k = i + ((j-i)>>1) + 1; \ | |
213 | if (__sort_lt(*k, *i)) { \ | |
214 | if (__sort_lt(*k, *j)) k = j; \ | |
215 | } else k = __sort_lt(*j, *i)? i : j; \ | |
216 | rp = *k; \ | |
217 | if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ | |
218 | for (;;) { \ | |
219 | do ++i; while (__sort_lt(*i, rp)); \ | |
220 | do --j; while (i <= j && __sort_lt(rp, *j)); \ | |
221 | if (j <= i) break; \ | |
222 | swap_tmp = *i; *i = *j; *j = swap_tmp; \ | |
223 | } \ | |
224 | swap_tmp = *i; *i = *t; *t = swap_tmp; \ | |
/* push the larger side (if it spans more than 16) and continue with the smaller one */ \
225 | if (i-s > t-i) { \ | |
226 | if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ | |
227 | s = t-i > 16? i+1 : t; \ | |
228 | } else { \ | |
229 | if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ | |
230 | t = i-s > 16? i-1 : s; \ | |
231 | } \ | |
232 | } else { \ | |
233 | if (top == stack) { \ | |
234 | free(stack); \ | |
235 | __ks_insertsort_##name(a, a+n); \ | |
236 | return; \ | |
237 | } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ | |
238 | } \ | |
239 | } \ | |
240 | } \ | |
/* Quickselect: returns the element that belongs at index kk of the sorted */ \
/* order. arr is partially reordered in place (elements are swapped, never */ \
/* copied out), so the caller's array does not keep its original order. */ \
241 | /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ | |
242 | /* 0 <= kk < n */ \ | |
243 | type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ | |
244 | { \ | |
245 | type_t *low, *high, *k, *ll, *hh, *mid; \ | |
246 | low = arr; high = arr + n - 1; k = arr + kk; \ | |
247 | for (;;) { \ | |
248 | if (high <= low) return *k; \ | |
249 | if (high == low + 1) { \ | |
250 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ | |
251 | return *k; \ | |
252 | } \ | |
/* order *mid, *low, *high so that *low holds the middle value, then move *mid next to it */ \
253 | mid = low + (high - low) / 2; \ | |
254 | if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ | |
255 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ | |
256 | if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ | |
257 | KSORT_SWAP(type_t, *mid, *(low+1)); \ | |
258 | ll = low + 1; hh = high; \ | |
/* partition around the pivot kept in *low */ \
259 | for (;;) { \ | |
260 | do ++ll; while (__sort_lt(*ll, *low)); \ | |
261 | do --hh; while (__sort_lt(*low, *hh)); \ | |
262 | if (hh < ll) break; \ | |
263 | KSORT_SWAP(type_t, *ll, *hh); \ | |
264 | } \ | |
265 | KSORT_SWAP(type_t, *low, *hh); \ | |
/* keep only the side of the partition that still contains k */ \
266 | if (hh <= k) low = ll; \ | |
267 | if (hh >= k) high = hh - 1; \ | |
268 | } \ | |
269 | } \ | |
270 | void ks_shuffle_##name(size_t n, type_t a[]) \ | |
271 | { \ | |
272 | int i, j; \ | |
273 | for (i = n; i > 1; --i) { \ | |
274 | type_t tmp; \ | |
275 | j = (int)(drand48() * i); \ | |
276 | tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ | |
277 | } \ | |
278 | } \ | |
279 | void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \ | |
280 | { /* reference: http://code.activestate.com/recipes/272884/ */ \ | |
281 | int i, k, pop = n; \ | |
282 | for (i = (int)r, k = 0; i >= 0; --i) { \ | |
283 | double z = 1., x = drand48(); \ | |
284 | type_t tmp; \ | |
285 | while (x < z) z -= z * i / (pop--); \ | |
286 | if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \ | |
287 | ++k; \ | |
288 | } \ | |
289 | } | |
290 | ||
/* Convenience front-ends: ks_xxx(name, ...) forwards to the ks_xxx_##name
 * function generated by KSORT_INIT(name, ...). */
#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
/* KSORT_INIT generates the sift-down routine as ks_heapdown_##name; there is no
 * ks_heapadjust_##name, so the historical macro name forwards to ks_heapdown. */
#define ks_heapadjust(name, i, n, a) ks_heapdown_##name(i, n, a)
#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
299 | ||
/* Ready-made "less than" predicates for KSORT_INIT: ks_lt_generic compares with
 * the built-in `<`, ks_lt_str compares C strings with strcmp(). */
300 | #define ks_lt_generic(a, b) ((a) < (b)) | |
301 | #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) | |
302 | ||
/* Element type used by KSORT_INIT_STR. */
303 | typedef const char *ksstr_t; | |
304 | ||
/* KSORT_INIT_GENERIC(type_t) uses the type name itself as the function suffix,
 * so type_t has to be a single identifier; KSORT_INIT_STR instantiates the
 * `_str` family over ksstr_t with strcmp() ordering. */
305 | #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) | |
306 | #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) | |
307 | ||
308 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) by Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | #ifndef KSTRING_H | |
26 | #define KSTRING_H | |
27 | ||
28 | #include <stdlib.h> | |
29 | #include <string.h> | |
30 | #include <stdint.h> | |
31 | ||
/* Round x up, in place, to the next power of two (a power of two is left
 * unchanged). x must be a modifiable lvalue and is evaluated several times.
 * The bit smearing stops at a 16-bit shift, i.e. it covers 32 bits only.
 * Defined only if no earlier header has provided it. */
32 | #ifndef kroundup32 | |
33 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
34 | #endif | |
35 | ||
/* Growable string: `l` is the current length (excluding the terminating NUL
 * the kput*() helpers maintain), `m` the number of bytes allocated for `s`,
 * and `s` the heap buffer (grown with realloc()). The KSTRING_T guard lets
 * several headers share one definition. */
36 | #ifndef KSTRING_T | |
37 | #define KSTRING_T kstring_t | |
38 | typedef struct __kstring_t { | |
39 | size_t l, m; | |
40 | char *s; | |
41 | } kstring_t; | |
42 | #endif | |
43 | ||
/* Iteration state carried between kstrtok() calls. `tab` is 4 x 64 = 256 bits
 * -- presumably one flag per possible separator byte (TODO confirm in the
 * kstrtok() implementation); `sep` and `finished` are likewise defined by that
 * implementation, which is not part of this header. */
44 | typedef struct { | |
45 | uint64_t tab[4]; | |
46 | int sep, finished; | |
47 | const char *p; // end of the current token | |
48 | } ks_tokaux_t; | |
49 | ||
50 | #ifdef __cplusplus | |
51 | extern "C" { | |
52 | #endif | |
53 | ||
54 | int ksprintf(kstring_t *s, const char *fmt, ...); | |
55 | int ksprintf_fast(kstring_t *s, const char *fmt, ...); | |
56 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); | |
57 | char *kstrstr(const char *str, const char *pat, int **_prep); | |
58 | char *kstrnstr(const char *str, const char *pat, int n, int **_prep); | |
59 | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); | |
60 | ||
61 | /* kstrtok() is similar to strtok_r() except that str is not | |
62 | * modified and both str and sep can be NULL. For efficiency, it is | |
63 | * actually recommended to set both to NULL in the subsequent calls | |
64 | * if sep is not changed. */ | |
65 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); | |
66 | ||
67 | #ifdef __cplusplus | |
68 | } | |
69 | #endif | |
70 | ||
71 | static inline void ks_resize(kstring_t *s, size_t size) | |
72 | { | |
73 | if (s->m < size) { | |
74 | s->m = size; | |
75 | kroundup32(s->m); | |
76 | s->s = (char*)realloc(s->s, s->m); | |
77 | } | |
78 | } | |
79 | ||
80 | static inline int kputsn(const char *p, int l, kstring_t *s) | |
81 | { | |
82 | if (s->l + l + 1 >= s->m) { | |
83 | s->m = s->l + l + 2; | |
84 | kroundup32(s->m); | |
85 | s->s = (char*)realloc(s->s, s->m); | |
86 | } | |
87 | memcpy(s->s + s->l, p, l); | |
88 | s->l += l; | |
89 | s->s[s->l] = 0; | |
90 | return l; | |
91 | } | |
92 | ||
93 | static inline int kputs(const char *p, kstring_t *s) | |
94 | { | |
95 | return kputsn(p, strlen(p), s); | |
96 | } | |
97 | ||
98 | static inline int kputc(int c, kstring_t *s) | |
99 | { | |
100 | if (s->l + 1 >= s->m) { | |
101 | s->m = s->l + 2; | |
102 | kroundup32(s->m); | |
103 | s->s = (char*)realloc(s->s, s->m); | |
104 | } | |
105 | s->s[s->l++] = c; | |
106 | s->s[s->l] = 0; | |
107 | return c; | |
108 | } | |
109 | ||
110 | static inline int kputw(int c, kstring_t *s) | |
111 | { | |
112 | char buf[16]; | |
113 | int l, x; | |
114 | if (c == 0) return kputc('0', s); | |
115 | for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; | |
116 | if (c < 0) buf[l++] = '-'; | |
117 | if (s->l + l + 1 >= s->m) { | |
118 | s->m = s->l + l + 2; | |
119 | kroundup32(s->m); | |
120 | s->s = (char*)realloc(s->s, s->m); | |
121 | } | |
122 | for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; | |
123 | s->s[s->l] = 0; | |
124 | return 0; | |
125 | } | |
126 | ||
127 | static inline int kputuw(unsigned c, kstring_t *s) | |
128 | { | |
129 | char buf[16]; | |
130 | int l, i; | |
131 | unsigned x; | |
132 | if (c == 0) return kputc('0', s); | |
133 | for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; | |
134 | if (s->l + l + 1 >= s->m) { | |
135 | s->m = s->l + l + 2; | |
136 | kroundup32(s->m); | |
137 | s->s = (char*)realloc(s->s, s->m); | |
138 | } | |
139 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; | |
140 | s->s[s->l] = 0; | |
141 | return 0; | |
142 | } | |
143 | ||
144 | static inline int kputl(long c, kstring_t *s) | |
145 | { | |
146 | char buf[32]; | |
147 | long l, x; | |
148 | if (c == 0) return kputc('0', s); | |
149 | for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; | |
150 | if (c < 0) buf[l++] = '-'; | |
151 | if (s->l + l + 1 >= s->m) { | |
152 | s->m = s->l + l + 2; | |
153 | kroundup32(s->m); | |
154 | s->s = (char*)realloc(s->s, s->m); | |
155 | } | |
156 | for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; | |
157 | s->s[s->l] = 0; | |
158 | return 0; | |
159 | } | |
160 | ||
161 | static inline int *ksplit(kstring_t *s, int delimiter, int *n) | |
162 | { | |
163 | int max = 0, *offsets = 0; | |
164 | *n = ksplit_core(s->s, delimiter, &max, &offsets); | |
165 | return offsets; | |
166 | } | |
167 | ||
168 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | #include <stdlib.h> | |
26 | #include <stdint.h> | |
27 | #include <emmintrin.h> | |
28 | #include "ksw.h" | |
29 | ||
/* Branch-prediction hints: with GCC-compatible compilers they wrap
 * __builtin_expect(); elsewhere they expand to the bare expression. */
30 | #ifdef __GNUC__ | |
31 | #define LIKELY(x) __builtin_expect((x),1) | |
32 | #define UNLIKELY(x) __builtin_expect((x),0) | |
33 | #else | |
34 | #define LIKELY(x) (x) | |
35 | #define UNLIKELY(x) (x) | |
36 | #endif | |
37 | ||
/* Default result copied into `r` at the start of ksw_u8()/ksw_i16(): the first
 * field is 0 and the remaining six are -1. kswr_t is declared in ksw.h;
 * presumably the first field is the score and -1 marks "not found"
 * (TODO confirm field order against ksw.h). */
38 | const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; | |
39 | ||
/* Query profile built by ksw_qinit().
 *   qlen, slen   query length and its segmented length (number of __m128i per row)
 *   shift        256 minus the smallest matrix score, kept in 8 bits
 *   mdiff        largest matrix score plus shift
 *   max          largest matrix score
 *   size         bytes per score (1 or 2)
 *   qp           striped score profile, slen vectors per alphabet symbol
 *   H0, H1, E, Hmax  work vectors of slen __m128i each, laid out after qp */
40 | struct _kswq_t { | |
41 | int qlen, slen; | |
42 | uint8_t shift, mdiff, max, size; | |
43 | __m128i *qp, *H0, *H1, *E, *Hmax; | |
44 | }; | |
45 | ||
46 | /** | |
47 | * Initialize the query data structure | |
48 | * | |
49 | * @param size Number of bytes used to store a score; valid valures are 1 or 2 | |
50 | * @param qlen Length of the query sequence | |
51 | * @param query Query sequence | |
52 | * @param m Size of the alphabet | |
53 | * @param mat Scoring matrix in a one-dimension array | |
54 | * | |
55 | * @return Query data structure | |
56 | */ | |
57 | kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) | |
58 | { | |
59 | kswq_t *q; | |
60 | int slen, a, tmp, p; | |
61 | ||
62 | size = size > 1? 2 : 1; | |
63 | p = 8 * (3 - size); // # values per __m128i | |
64 | slen = (qlen + p - 1) / p; // segmented length | |
65 | q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory | |
66 | q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory | |
67 | q->H0 = q->qp + slen * m; | |
68 | q->H1 = q->H0 + slen; | |
69 | q->E = q->H1 + slen; | |
70 | q->Hmax = q->E + slen; | |
71 | q->slen = slen; q->qlen = qlen; q->size = size; | |
72 | // compute shift | |
73 | tmp = m * m; | |
74 | for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score | |
75 | if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; | |
76 | if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; | |
77 | } | |
78 | q->max = q->mdiff; | |
79 | q->shift = 256 - q->shift; // NB: q->shift is uint8_t | |
80 | q->mdiff += q->shift; // this is the difference between the min and max scores | |
81 | // An example: p=8, qlen=19, slen=3 and segmentation: | |
82 | // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} | |
83 | if (size == 1) { | |
84 | int8_t *t = (int8_t*)q->qp; | |
85 | for (a = 0; a < m; ++a) { | |
86 | int i, k, nlen = slen * p; | |
87 | const int8_t *ma = mat + a * m; | |
88 | for (i = 0; i < slen; ++i) | |
89 | for (k = i; k < nlen; k += slen) // p iterations | |
90 | *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; | |
91 | } | |
92 | } else { | |
93 | int16_t *t = (int16_t*)q->qp; | |
94 | for (a = 0; a < m; ++a) { | |
95 | int i, k, nlen = slen * p; | |
96 | const int8_t *ma = mat + a * m; | |
97 | for (i = 0; i < slen; ++i) | |
98 | for (k = i; k < nlen; k += slen) // p iterations | |
99 | *t++ = (k >= qlen? 0 : ma[query[k]]); | |
100 | } | |
101 | } | |
102 | return q; | |
103 | } | |
104 | ||
105 | kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) | |
106 | { | |
107 | int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; | |
108 | uint64_t *b; | |
109 | __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; | |
110 | kswr_t r; | |
111 | ||
112 | #define __max_16(ret, xx) do { \ | |
113 | (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ | |
114 | (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ | |
115 | (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ | |
116 | (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ | |
117 | (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ | |
118 | } while (0) | |
119 | ||
120 | // initialization | |
121 | r = g_defr; | |
122 | minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; | |
123 | endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; | |
124 | m_b = n_b = 0; b = 0; | |
125 | zero = _mm_set1_epi32(0); | |
126 | gapoe = _mm_set1_epi8(_gapo + _gape); | |
127 | gape = _mm_set1_epi8(_gape); | |
128 | shift = _mm_set1_epi8(q->shift); | |
129 | H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; | |
130 | slen = q->slen; | |
131 | for (i = 0; i < slen; ++i) { | |
132 | _mm_store_si128(E + i, zero); | |
133 | _mm_store_si128(H0 + i, zero); | |
134 | _mm_store_si128(Hmax + i, zero); | |
135 | } | |
136 | // the core loop | |
137 | for (i = 0; i < tlen; ++i) { | |
138 | int j, k, cmp, imax; | |
139 | __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector | |
140 | h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example | |
141 | h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian | |
142 | for (j = 0; LIKELY(j < slen); ++j) { | |
143 | /* SW cells are computed in the following order: | |
144 | * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} | |
145 | * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} | |
146 | * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} | |
147 | */ | |
148 | // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) | |
149 | h = _mm_adds_epu8(h, _mm_load_si128(S + j)); | |
150 | h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) | |
151 | e = _mm_load_si128(E + j); // e=E'(i,j) | |
152 | h = _mm_max_epu8(h, e); | |
153 | h = _mm_max_epu8(h, f); // h=H'(i,j) | |
154 | max = _mm_max_epu8(max, h); // set max | |
155 | _mm_store_si128(H1 + j, h); // save to H'(i,j) | |
156 | // now compute E'(i+1,j) | |
157 | h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo | |
158 | e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape | |
159 | e = _mm_max_epu8(e, h); // e=E'(i+1,j) | |
160 | _mm_store_si128(E + j, e); // save to E'(i+1,j) | |
161 | // now compute F'(i,j+1) | |
162 | f = _mm_subs_epu8(f, gape); | |
163 | f = _mm_max_epu8(f, h); | |
164 | // get H'(i-1,j) and prepare for the next j | |
165 | h = _mm_load_si128(H0 + j); // h=H'(i-1,j) | |
166 | } | |
167 | // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion | |
168 | for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max | |
169 | f = _mm_slli_si128(f, 1); | |
170 | for (j = 0; LIKELY(j < slen); ++j) { | |
171 | h = _mm_load_si128(H1 + j); | |
172 | h = _mm_max_epu8(h, f); // h=H'(i,j) | |
173 | _mm_store_si128(H1 + j, h); | |
174 | h = _mm_subs_epu8(h, gapoe); | |
175 | f = _mm_subs_epu8(f, gape); | |
176 | cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); | |
177 | if (UNLIKELY(cmp == 0xffff)) goto end_loop16; | |
178 | } | |
179 | } | |
180 | end_loop16: | |
181 | //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); | |
182 | __max_16(imax, max); // imax is the maximum number in max | |
183 | if (imax >= minsc) { // write the b array; this condition adds branching unfornately | |
184 | if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append | |
185 | if (n_b == m_b) { | |
186 | m_b = m_b? m_b<<1 : 8; | |
187 | b = (uint64_t*)realloc(b, 8 * m_b); | |
188 | } | |
189 | b[n_b++] = (uint64_t)imax<<32 | i; | |
190 | } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last | |
191 | } | |
192 | if (imax > gmax) { | |
193 | gmax = imax; te = i; // te is the end position on the target | |
194 | for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector | |
195 | _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); | |
196 | if (gmax + q->shift >= 255 || gmax >= endsc) break; | |
197 | } | |
198 | S = H1; H1 = H0; H0 = S; // swap H0 and H1 | |
199 | } | |
200 | r.score = gmax + q->shift < 255? gmax : 255; | |
201 | r.te = te; | |
202 | if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score | |
203 | int max = -1, low, high, qlen = slen * 16; | |
204 | uint8_t *t = (uint8_t*)Hmax; | |
205 | for (i = 0; i < qlen; ++i, ++t) | |
206 | if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; | |
207 | //printf("%d,%d\n", max, gmax); | |
208 | if (b) { | |
209 | i = (r.score + q->max - 1) / q->max; | |
210 | low = te - i; high = te + i; | |
211 | for (i = 0; i < n_b; ++i) { | |
212 | int e = (int32_t)b[i]; | |
213 | if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) | |
214 | r.score2 = b[i]>>32, r.te2 = e; | |
215 | } | |
216 | } | |
217 | } | |
218 | free(b); | |
219 | return r; | |
220 | } | |
221 | ||
222 | kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) | |
223 | { | |
224 | int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; | |
225 | uint64_t *b; | |
226 | __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; | |
227 | kswr_t r; | |
228 | ||
229 | #define __max_8(ret, xx) do { \ | |
230 | (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ | |
231 | (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ | |
232 | (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ | |
233 | (ret) = _mm_extract_epi16((xx), 0); \ | |
234 | } while (0) | |
235 | ||
236 | // initialization | |
237 | r = g_defr; | |
238 | minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; | |
239 | endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; | |
240 | m_b = n_b = 0; b = 0; | |
241 | zero = _mm_set1_epi32(0); | |
242 | gapoe = _mm_set1_epi16(_gapo + _gape); | |
243 | gape = _mm_set1_epi16(_gape); | |
244 | H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; | |
245 | slen = q->slen; | |
246 | for (i = 0; i < slen; ++i) { | |
247 | _mm_store_si128(E + i, zero); | |
248 | _mm_store_si128(H0 + i, zero); | |
249 | _mm_store_si128(Hmax + i, zero); | |
250 | } | |
251 | // the core loop | |
252 | for (i = 0; i < tlen; ++i) { | |
253 | int j, k, imax; | |
254 | __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector | |
255 | h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example | |
256 | h = _mm_slli_si128(h, 2); | |
257 | for (j = 0; LIKELY(j < slen); ++j) { | |
258 | h = _mm_adds_epi16(h, *S++); | |
259 | e = _mm_load_si128(E + j); | |
260 | h = _mm_max_epi16(h, e); | |
261 | h = _mm_max_epi16(h, f); | |
262 | max = _mm_max_epi16(max, h); | |
263 | _mm_store_si128(H1 + j, h); | |
264 | h = _mm_subs_epu16(h, gapoe); | |
265 | e = _mm_subs_epu16(e, gape); | |
266 | e = _mm_max_epi16(e, h); | |
267 | _mm_store_si128(E + j, e); | |
268 | f = _mm_subs_epu16(f, gape); | |
269 | f = _mm_max_epi16(f, h); | |
270 | h = _mm_load_si128(H0 + j); | |
271 | } | |
272 | for (k = 0; LIKELY(k < 16); ++k) { | |
273 | f = _mm_slli_si128(f, 2); | |
274 | for (j = 0; LIKELY(j < slen); ++j) { | |
275 | h = _mm_load_si128(H1 + j); | |
276 | h = _mm_max_epi16(h, f); | |
277 | _mm_store_si128(H1 + j, h); | |
278 | h = _mm_subs_epu16(h, gapoe); | |
279 | f = _mm_subs_epu16(f, gape); | |
280 | if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; | |
281 | } | |
282 | } | |
283 | end_loop8: | |
284 | __max_8(imax, max); | |
285 | if (imax >= minsc) { | |
286 | if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { | |
287 | if (n_b == m_b) { | |
288 | m_b = m_b? m_b<<1 : 8; | |
289 | b = (uint64_t*)realloc(b, 8 * m_b); | |
290 | } | |
291 | b[n_b++] = (uint64_t)imax<<32 | i; | |
292 | } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last | |
293 | } | |
294 | if (imax > gmax) { | |
295 | gmax = imax; te = i; | |
296 | for (j = 0; LIKELY(j < slen); ++j) | |
297 | _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); | |
298 | if (gmax >= endsc) break; | |
299 | } | |
300 | S = H1; H1 = H0; H0 = S; | |
301 | } | |
302 | r.score = gmax; r.te = te; | |
303 | { | |
304 | int max = -1, low, high, qlen = slen * 8; | |
305 | uint16_t *t = (uint16_t*)Hmax; | |
306 | for (i = 0, r.qe = -1; i < qlen; ++i, ++t) | |
307 | if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; | |
308 | if (b) { | |
309 | i = (r.score + q->max - 1) / q->max; | |
310 | low = te - i; high = te + i; | |
311 | for (i = 0; i < n_b; ++i) { | |
312 | int e = (int32_t)b[i]; | |
313 | if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) | |
314 | r.score2 = b[i]>>32, r.te2 = e; | |
315 | } | |
316 | } | |
317 | } | |
318 | free(b); | |
319 | return r; | |
320 | } | |
321 | ||
/* Reverse the first l bytes of s in place; nothing happens when l < 2. */
static void revseq(int l, uint8_t *s)
{
	int lo, hi;
	if (l < 2) return;
	lo = 0;
	hi = l - 1;
	while (lo < hi) {
		uint8_t tmp = s[lo];
		s[lo] = s[hi];
		s[hi] = tmp;
		++lo;
		--hi;
	}
}
328 | ||
329 | kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) | |
330 | { | |
331 | int size; | |
332 | kswq_t *q; | |
333 | kswr_t r, rr; | |
334 | kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int); | |
335 | ||
336 | q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); | |
337 | if (qry && *qry == 0) *qry = q; | |
338 | func = q->size == 2? ksw_i16 : ksw_u8; | |
339 | size = q->size; | |
340 | r = func(q, tlen, target, gapo, gape, xtra); | |
341 | if (qry == 0) free(q); | |
342 | if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; | |
343 | revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end | |
344 | q = ksw_qinit(size, r.qe + 1, query, m, mat); | |
345 | rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); | |
346 | revseq(r.qe + 1, query); revseq(r.te + 1, target); | |
347 | free(q); | |
348 | if (r.score == rr.score) | |
349 | r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; | |
350 | return r; | |
351 | } |
0 | #ifndef __AC_KSW_H | |
1 | #define __AC_KSW_H | |
2 | ||
3 | #include <stdint.h> | |
4 | ||
5 | #define KSW_XBYTE 0x10000 | |
6 | #define KSW_XSTOP 0x20000 | |
7 | #define KSW_XSUBO 0x40000 | |
8 | #define KSW_XSTART 0x80000 | |
9 | ||
10 | struct _kswq_t; | |
11 | typedef struct _kswq_t kswq_t; | |
12 | ||
/* Result of one alignment; see ksw_align() for which fields get filled. */
typedef struct {
	int score;  /* best score */
	int te;     /* target end of the best hit */
	int qe;     /* query end of the best hit */
	int score2; /* second best score */
	int te2;    /* target end of the second best hit */
	int tb;     /* target start */
	int qb;     /* query start */
} kswr_t;
19 | ||
20 | #ifdef __cplusplus | |
21 | extern "C" { | |
22 | #endif | |
23 | ||
24 | /** | |
25 | * Aligning two sequences | |
26 | * | |
27 | * @param qlen length of the query sequence (typically <tlen) | |
28 | * @param query query sequence with 0 <= query[i] < m | |
29 | * @param tlen length of the target sequence | |
30 | * @param target target sequence | |
31 | * @param m number of residue types | |
32 | * @param mat m*m scoring matrix in a one-dimensional array | |
33 | * @param gapo gap open penalty; a gap of length l costs "-(gapo+l*gape)" | |
34 | * @param gape gap extension penalty | |
35 | * @param xtra extra information (see below) | |
36 | * @param qry query profile (see below) | |
37 | * | |
38 | * @return alignment information in a struct; unset values are set to -1 | |
39 | * | |
40 | * When xtra==0, ksw_align() uses a signed two-byte integer to store a | |
41 | * score and only finds the best score and the end positions. The 2nd best | |
42 | * score or the start positions are not attempted. The default behavior can | |
43 | * be tuned by setting KSW_X* flags: | |
44 | * | |
45 | * KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs, | |
46 | * kswr_t::score will be set to 255 | |
47 | * | |
48 | * KSW_XSUBO: track the 2nd best score and the ending position on the | |
49 | * target if the 2nd best is higher than (xtra&0xffff) | |
50 | * | |
51 | * KSW_XSTOP: stop if the maximum score is above (xtra&0xffff) | |
52 | * | |
53 | * KSW_XSTART: find the start positions | |
54 | * | |
55 | * When *qry==NULL, ksw_align() will compute and allocate the query profile | |
56 | * and when the function returns, *qry will point to the profile, which can | |
57 | * be deallocated simply by free(). If one query is aligned against multiple | |
58 | * target sequences, *qry should be set to NULL during the first call and | |
59 | * freed after the last call. Note that qry can equal 0. In this case, the | |
60 | * query profile will be deallocated in ksw_align(). | |
61 | */ | |
62 | kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry); | |
63 | ||
64 | kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat); | |
65 | ||
66 | #ifdef __cplusplus | |
67 | } | |
68 | #endif | |
69 | ||
70 | #endif |
0 | #include <pthread.h> | |
1 | #include <stdlib.h> | |
2 | #include <limits.h> | |
3 | ||
4 | /************ | |
5 | * kt_for() * | |
6 | ************/ | |
7 | ||
struct kt_for_t;

/* Per-worker state. */
typedef struct {
	struct kt_for_t *t; // shared job description
	long i;             // next item this worker will claim; advanced by n_threads each time
} ktf_worker_t;

/* One kt_for() job shared by all workers. */
typedef struct kt_for_t {
	int n_threads;                // number of worker slots in w[]
	long n;                       // number of items; valid indices are [0, n)
	ktf_worker_t *w;              // n_threads worker slots
	void (*func)(void*,long,int); // callback: (data, item index, worker index)
	void *data;                   // passed through to func unchanged
} kt_for_t;

/*
 * Claim one item from the worker slot that is furthest behind.
 * Returns the claimed item index, or -1 when no work is left.
 */
static inline long steal_work(kt_for_t *t)
{
	int i, min_i = -1;
	long k, min = LONG_MAX;
	for (i = 0; i < t->n_threads; ++i)
		if (min > t->w[i].i) min = t->w[i].i, min_i = i;
	if (min_i < 0) return -1; // no slot to steal from; never index w[-1]
	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
	return k >= t->n? -1 : k;
}

/* Thread entry point: drain this worker's own stride, then help the others. */
static void *ktf_worker(void *data)
{
	ktf_worker_t *w = (ktf_worker_t*)data;
	long i;
	for (;;) {
		i = __sync_fetch_and_add(&w->i, w->t->n_threads);
		if (i >= w->t->n) break;
		w->t->func(w->t->data, i, w - w->t->w);
	}
	while ((i = steal_work(w->t)) >= 0)
		w->t->func(w->t->data, i, w - w->t->w);
	return 0;
}

/**
 * Call func(data, i, tid) once for every i in [0, n).
 *
 * With n_threads > 1 the items are spread over up to n_threads pthreads and
 * tid is the index of the worker slot that ran the item. Otherwise, or when no
 * worker thread could be set up, the items run in order on the calling thread
 * with tid 0. Slots whose thread failed to start are drained by the threads
 * that did start, through steal_work().
 */
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
{
	int n_started = 0;
	if (n_threads > 1) {
		int i;
		kt_for_t t;
		pthread_t *tid;
		t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
		// heap instead of alloca(): bounded stack use and a detectable failure
		t.w = (ktf_worker_t*)malloc((size_t)n_threads * sizeof(ktf_worker_t));
		tid = (pthread_t*)malloc((size_t)n_threads * sizeof(pthread_t));
		if (t.w != 0 && tid != 0) {
			for (i = 0; i < n_threads; ++i)
				t.w[i].t = &t, t.w[i].i = i;
			for (i = 0; i < n_threads; ++i) {
				if (pthread_create(&tid[i], 0, ktf_worker, &t.w[i]) != 0) break;
				++n_started;
			}
			// join only what was really started
			for (i = 0; i < n_started; ++i) pthread_join(tid[i], 0);
		}
		free(tid);
		free(t.w);
	}
	if (n_started == 0) { // single-threaded request, or threading unavailable
		long j;
		for (j = 0; j < n; ++j) func(data, j, 0);
	}
}
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* | |
26 | An example: | |
27 | ||
28 | #include "kvec.h" | |
29 | int main() { | |
30 | kvec_t(int) array; | |
31 | kv_init(array); | |
32 | kv_push(int, array, 10); // append | |
33 | kv_a(int, array, 20) = 5; // dynamic | |
34 | kv_A(array, 20) = 4; // static | |
35 | kv_destroy(array); | |
36 | return 0; | |
37 | } | |
38 | */ | |
39 | ||
40 | /* | |
41 | 2008-09-22 (0.1.0): | |
42 | ||
43 | * The initial version. | |
44 | ||
45 | */ | |
46 | ||
47 | #ifndef AC_KVEC_H | |
48 | #define AC_KVEC_H | |
49 | ||
50 | #include <stdlib.h> | |
51 | ||
52 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
53 | ||
54 | #define kvec_t(type) struct { size_t n, m; type *a; } | |
55 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) | |
56 | #define kv_destroy(v) free((v).a) | |
57 | #define kv_A(v, i) ((v).a[(i)]) | |
58 | #define kv_pop(v) ((v).a[--(v).n]) | |
59 | #define kv_size(v) ((v).n) | |
60 | #define kv_max(v) ((v).m) | |
61 | ||
62 | #define kv_resize(type, v, s) do { \ | |
63 | if ((v).m < (s)) { \ | |
64 | (v).m = (s); \ | |
65 | kv_roundup32((v).m); \ | |
66 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ | |
67 | } \ | |
68 | } while (0) | |
69 | ||
70 | #define kv_copy(type, v1, v0) do { \ | |
71 | if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ | |
72 | (v1).n = (v0).n; \ | |
73 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ | |
74 | } while (0) \ | |
75 | ||
76 | #define kv_push(type, v, x) do { \ | |
77 | if ((v).n == (v).m) { \ | |
78 | (v).m = (v).m? (v).m<<1 : 2; \ | |
79 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ | |
80 | } \ | |
81 | (v).a[(v).n++] = (x); \ | |
82 | } while (0) | |
83 | ||
84 | #define kv_pushp(type, v, p) do { \ | |
85 | if ((v).n == (v).m) { \ | |
86 | (v).m = (v).m? (v).m<<1 : 2; \ | |
87 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ | |
88 | } \ | |
89 | *(p) = &(v).a[(v).n++]; \ | |
90 | } while (0) | |
91 | ||
92 | #define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ | |
93 | ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ | |
94 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ | |
95 | : (v).n <= (size_t)(i)? (v).n = (i) \ | |
96 | : 0), (v).a[(i)] | |
97 | ||
98 | #define kv_reverse(type, v, start) do { \ | |
99 | if ((v).m > 0 && (v).n > (start)) { \ | |
100 | size_t __i, __end = (v).n - (start); \ | |
101 | type *__a = (v).a + (start); \ | |
102 | for (__i = 0; __i < __end>>1; ++__i) { \ | |
103 | type __t = __a[__end - 1 - __i]; \ | |
104 | __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \ | |
105 | } \ | |
106 | } \ | |
107 | } while (0) | |
108 | ||
109 | #endif |
0 | /* remaining problems: | |
1 | ||
2 | 1. multiedges due to tandem repeats | |
3 | */ | |
4 | ||
5 | #include <math.h> | |
6 | #include <zlib.h> | |
7 | #include <stdio.h> | |
8 | #include <assert.h> | |
9 | #include "mag.h" | |
10 | #include "kvec.h" | |
11 | #include "internal.h" | |
12 | #include "kseq.h" | |
13 | KSEQ_DECLARE(gzFile) | |
14 | ||
15 | #include "khash.h" | |
16 | KHASH_INIT2(64,, khint64_t, uint64_t, 1, kh_int64_hash_func, kh_int64_hash_equal) | |
17 | ||
18 | typedef khash_t(64) hash64_t; | |
19 | ||
// ascending x; equal x is ordered by larger y first
#define ku128_xlt(lhs, rhs) ((lhs).x < (rhs).x || ((lhs).x == (rhs).x && (lhs).y > (rhs).y))
// descending y, with y compared as a signed 64-bit value
#define ku128_ylt(lhs, rhs) ((int64_t)(lhs).y > (int64_t)(rhs).y)
22 | #include "ksort.h" | |
23 | KSORT_INIT(128x, ku128_t, ku128_xlt) | |
24 | KSORT_INIT(128y, ku128_t, ku128_ylt) | |
25 | KSORT_INIT_GENERIC(uint64_t) | |
26 | ||
// flag an arc as deleted: x becomes the sentinel (uint64_t)-2 and y is cleared
#define edge_mark_del(arc_) ((arc_).x = (uint64_t)-2, (arc_).y = 0)
// an arc counts as deleted when x is the sentinel or when y is zero
#define edge_is_del(arc_) ((arc_).x == (uint64_t)-2 || (arc_).y == 0)
29 | ||
30 | static int fm_verbose = 1; | |
31 | ||
32 | /********************* | |
33 | * Vector operations * | |
34 | *********************/ | |
35 | ||
36 | static inline void v128_clean(ku128_v *r) | |
37 | { | |
38 | int i, j; | |
39 | for (i = j = 0; i < r->n; ++i) | |
40 | if (!edge_is_del(r->a[i])) { // keep this arc | |
41 | if (j != i) r->a[j++] = r->a[i]; | |
42 | else ++j; | |
43 | } | |
44 | r->n = j; | |
45 | } | |
46 | ||
47 | void mag_v128_clean(ku128_v *r) | |
48 | { | |
49 | v128_clean(r); | |
50 | } | |
51 | ||
52 | static inline void v128_rmdup(ku128_v *r) | |
53 | { | |
54 | int l, cnt; | |
55 | uint64_t x; | |
56 | if (r->n > 1) ks_introsort(128x, r->n, r->a); | |
57 | for (l = cnt = 0; l < r->n; ++l) // jump to the first node to be retained | |
58 | if (edge_is_del(r->a[l])) ++cnt; | |
59 | else break; | |
60 | if (l == r->n) { // no good arcs | |
61 | r->n = 0; | |
62 | return; | |
63 | } | |
64 | x = r->a[l].x; | |
65 | for (++l; l < r->n; ++l) { // mark duplicated node | |
66 | if (edge_is_del(r->a[l]) || r->a[l].x == x) | |
67 | edge_mark_del(r->a[l]), ++cnt; | |
68 | else x = r->a[l].x; | |
69 | } | |
70 | if (cnt) v128_clean(r); | |
71 | } | |
72 | ||
73 | static inline void v128_cap(ku128_v *r, int max) | |
74 | { | |
75 | int i, thres; | |
76 | if (r->n <= max) return; | |
77 | ks_introsort(128y, r->n, r->a); | |
78 | thres = r->a[max].y; | |
79 | for (i = 0; i < r->n; ++i) | |
80 | if (r->a[i].y == thres) break; | |
81 | r->n = i; | |
82 | } | |
83 | ||
84 | /************************************************* | |
85 | * Mapping between vertex id and interval end id * | |
86 | *************************************************/ | |
87 | ||
88 | void mag_g_build_hash(mag_t *g) | |
89 | { | |
90 | long i; | |
91 | int j, ret; | |
92 | hash64_t *h; | |
93 | h = kh_init(64); | |
94 | for (i = 0; i < g->v.n; ++i) { | |
95 | const magv_t *p = &g->v.a[i]; | |
96 | for (j = 0; j < 2; ++j) { | |
97 | khint_t k = kh_put(64, h, p->k[j], &ret); | |
98 | if (ret == 0) { | |
99 | if (fm_verbose >= 2) | |
100 | fprintf(stderr, "[W::%s] terminal %ld is duplicated.\n", __func__, (long)p->k[j]); | |
101 | kh_val(h, k) = (uint64_t)-1; | |
102 | } else kh_val(h, k) = i<<1|j; | |
103 | } | |
104 | } | |
105 | g->h = h; | |
106 | } | |
107 | ||
108 | static inline uint64_t tid2idd(hash64_t *h, uint64_t tid) | |
109 | { | |
110 | khint_t k = kh_get(64, h, tid); | |
111 | assert(k != kh_end(h)); | |
112 | return kh_val(h, k); | |
113 | } | |
114 | ||
/* Exported version of tid2idd(); h is the graph's terminal hash table. */
uint64_t mag_tid2idd(void *h, uint64_t tid)
{
	return tid2idd(h, tid);
}
119 | ||
120 | void mag_g_amend(mag_t *g) | |
121 | { | |
122 | int i, j, l, ll; | |
123 | for (i = 0; i < g->v.n; ++i) { | |
124 | magv_t *p = &g->v.a[i]; | |
125 | ku128_v *r; | |
126 | for (j = 0; j < 2; ++j) { | |
127 | for (l = 0; l < p->nei[j].n; ++l) { | |
128 | khint_t k; | |
129 | uint64_t z, x = p->nei[j].a[l].x; | |
130 | k = kh_get(64, g->h, x); | |
131 | if (k == kh_end((hash64_t*)g->h)) { // neighbor is not in the hash table; likely due to tip removal | |
132 | edge_mark_del(p->nei[j].a[l]); | |
133 | continue; | |
134 | } else z = kh_val((hash64_t*)g->h, k); | |
135 | r = &g->v.a[z>>1].nei[z&1]; | |
136 | for (ll = 0, z = p->k[j]; ll < r->n; ++ll) | |
137 | if (r->a[ll].x == z) break; | |
138 | if (ll == r->n) // not in neighbor's neighor | |
139 | edge_mark_del(p->nei[j].a[l]); | |
140 | } | |
141 | v128_rmdup(&p->nei[j]); | |
142 | } | |
143 | } | |
144 | } | |
145 | ||
146 | /********************************* | |
147 | * Graph I/O initialization etc. * | |
148 | *********************************/ | |
149 | ||
150 | void mag_v_write(const magv_t *p, kstring_t *out) | |
151 | { | |
152 | int j, k; | |
153 | if (p->len <= 0) return; | |
154 | out->l = 0; | |
155 | kputc('@', out); kputl(p->k[0], out); kputc(':', out); kputl(p->k[1], out); | |
156 | kputc('\t', out); kputw(p->nsr, out); | |
157 | for (j = 0; j < 2; ++j) { | |
158 | const ku128_v *r = &p->nei[j]; | |
159 | kputc('\t', out); | |
160 | for (k = 0; k < r->n; ++k) { | |
161 | if (edge_is_del(r->a[k])) continue; | |
162 | kputl(r->a[k].x, out); kputc(',', out); kputw((int32_t)r->a[k].y, out); | |
163 | kputc(';', out); | |
164 | } | |
165 | if (p->nei[j].n == 0) kputc('.', out); | |
166 | } | |
167 | kputc('\n', out); | |
168 | ks_resize(out, out->l + 2 * p->len + 5); | |
169 | for (j = 0; j < p->len; ++j) | |
170 | out->s[out->l++] = "ACGT"[(int)p->seq[j] - 1]; | |
171 | out->s[out->l] = 0; | |
172 | kputsn("\n+\n", 3, out); | |
173 | kputsn(p->cov, p->len, out); | |
174 | kputc('\n', out); | |
175 | } | |
176 | ||
177 | void mag_g_print(const mag_t *g) | |
178 | { | |
179 | int i; | |
180 | kstring_t out; | |
181 | out.l = out.m = 0; out.s = 0; | |
182 | for (i = 0; i < g->v.n; ++i) { | |
183 | if (g->v.a[i].len < 0) continue; | |
184 | mag_v_write(&g->v.a[i], &out); | |
185 | fwrite(out.s, 1, out.l, stdout); | |
186 | } | |
187 | free(out.s); | |
188 | fflush(stdout); | |
189 | } | |
190 | ||
191 | /************************** | |
192 | * Basic graph operations * | |
193 | **************************/ | |
194 | ||
195 | void mag_v_destroy(magv_t *v) | |
196 | { | |
197 | free(v->nei[0].a); free(v->nei[1].a); | |
198 | free(v->seq); free(v->cov); | |
199 | memset(v, 0, sizeof(magv_t)); | |
200 | v->len = -1; | |
201 | } | |
202 | ||
203 | void mag_g_destroy(mag_t *g) | |
204 | { | |
205 | int i; | |
206 | kh_destroy(64, g->h); | |
207 | for (i = 0; i < g->v.n; ++i) | |
208 | mag_v_destroy(&g->v.a[i]); | |
209 | free(g->v.a); | |
210 | free(g); | |
211 | } | |
212 | ||
213 | void mag_v_copy_to_empty(magv_t *dst, const magv_t *src) // NB: memory leak if dst is allocated | |
214 | { | |
215 | memcpy(dst, src, sizeof(magv_t)); | |
216 | dst->max_len = dst->len + 1; | |
217 | kroundup32(dst->max_len); | |
218 | dst->seq = calloc(dst->max_len, 1); memcpy(dst->seq, src->seq, src->len); | |
219 | dst->cov = calloc(dst->max_len, 1); memcpy(dst->cov, src->cov, src->len); | |
220 | kv_init(dst->nei[0]); kv_copy(ku128_t, dst->nei[0], src->nei[0]); | |
221 | kv_init(dst->nei[1]); kv_copy(ku128_t, dst->nei[1], src->nei[1]); | |
222 | } | |
223 | ||
224 | void mag_eh_add(mag_t *g, uint64_t u, uint64_t v, int ovlp) // add v to u | |
225 | { | |
226 | ku128_v *r; | |
227 | ku128_t *q; | |
228 | uint64_t idd; | |
229 | int i; | |
230 | if ((int64_t)u < 0) return; | |
231 | idd = tid2idd(g->h, u); | |
232 | r = &g->v.a[idd>>1].nei[idd&1]; | |
233 | for (i = 0; i < r->n; ++i) // no multi-edges | |
234 | if (r->a[i].x == v) return; | |
235 | kv_pushp(ku128_t, *r, &q); | |
236 | q->x = v; q->y = ovlp; | |
237 | } | |
238 | ||
239 | void mag_eh_markdel(mag_t *g, uint64_t u, uint64_t v) // mark deletion of v from u | |
240 | { | |
241 | int i; | |
242 | uint64_t idd; | |
243 | if ((int64_t)u < 0) return; | |
244 | idd = tid2idd(g->h, u); | |
245 | ku128_v *r = &g->v.a[idd>>1].nei[idd&1]; | |
246 | for (i = 0; i < r->n; ++i) | |
247 | if (r->a[i].x == v) edge_mark_del(r->a[i]); | |
248 | } | |
249 | ||
250 | void mag_v_del(mag_t *g, magv_t *p) | |
251 | { | |
252 | int i, j; | |
253 | khint_t k; | |
254 | if (p->len < 0) return; | |
255 | for (i = 0; i < 2; ++i) { | |
256 | ku128_v *r = &p->nei[i]; | |
257 | for (j = 0; j < r->n; ++j) | |
258 | if (!edge_is_del(r->a[j]) && r->a[j].x != p->k[0] && r->a[j].x != p->k[1]) | |
259 | mag_eh_markdel(g, r->a[j].x, p->k[i]); | |
260 | } | |
261 | for (i = 0; i < 2; ++i) { | |
262 | k = kh_get(64, g->h, p->k[i]); | |
263 | kh_del(64, g->h, k); | |
264 | } | |
265 | mag_v_destroy(p); | |
266 | } | |
267 | ||
268 | void mag_v_transdel(mag_t *g, magv_t *p, int min_ovlp) | |
269 | { | |
270 | if (p->nei[0].n && p->nei[1].n) { | |
271 | int i, j, ovlp; | |
272 | for (i = 0; i < p->nei[0].n; ++i) { | |
273 | if (edge_is_del(p->nei[0].a[i]) || p->nei[0].a[i].x == p->k[0] || p->nei[0].a[i].x == p->k[1]) continue; // due to p->p loop | |
274 | for (j = 0; j < p->nei[1].n; ++j) { | |
275 | if (edge_is_del(p->nei[1].a[j]) || p->nei[1].a[j].x == p->k[0] || p->nei[1].a[j].x == p->k[1]) continue; | |
276 | ovlp = (int)(p->nei[0].a[i].y + p->nei[1].a[j].y) - p->len; | |
277 | if (ovlp >= min_ovlp) { | |
278 | mag_eh_add(g, p->nei[0].a[i].x, p->nei[1].a[j].x, ovlp); | |
279 | mag_eh_add(g, p->nei[1].a[j].x, p->nei[0].a[i].x, ovlp); | |
280 | } | |
281 | } | |
282 | } | |
283 | } | |
284 | mag_v_del(g, p); | |
285 | } | |
286 | ||
287 | void mag_v_flip(mag_t *g, magv_t *p) | |
288 | { | |
289 | ku128_v t; | |
290 | khint_t k; | |
291 | hash64_t *h = (hash64_t*)g->h; | |
292 | ||
293 | seq_revcomp6(p->len, (uint8_t*)p->seq); | |
294 | seq_reverse(p->len, (uint8_t*)p->cov); | |
295 | p->k[0] ^= p->k[1]; p->k[1] ^= p->k[0]; p->k[0] ^= p->k[1]; | |
296 | t = p->nei[0]; p->nei[0] = p->nei[1]; p->nei[1] = t; | |
297 | k = kh_get(64, h, p->k[0]); | |
298 | assert(k != kh_end(h)); | |
299 | kh_val(h, k) ^= 1; | |
300 | k = kh_get(64, h, p->k[1]); | |
301 | assert(k != kh_end(h)); | |
302 | kh_val(h, k) ^= 1; | |
303 | } | |
304 | ||
305 | /********************* | |
306 | * Unambiguous merge * | |
307 | *********************/ | |
308 | ||
/*
 * Try to merge the single right-hand neighbor q of p into p.
 *
 * Returns 0 when the merge was performed; p then holds the joined sequence
 * and coverage, takes over q's right terminal and arc list, and q is
 * destroyed. Otherwise nothing is changed and the return value says why:
 *   -1  p does not have exactly one right neighbor
 *   -2  the neighbor id is negative as a signed value (deleted)
 *   -5  the overlap is shorter than min_merge_len
 *   -3  the neighbor is p itself
 *   -4  the matching end of q does not have exactly one arc
 */
int mag_vh_merge_try(mag_t *g, magv_t *p, int min_merge_len) // merge p's neighbor to the right-end of p
{
	magv_t *q;
	khint_t kp, kq;
	int i, j, new_l;
	hash64_t *h = (hash64_t*)g->h;

	// check if an unambiguous merge can be performed
	if (p->nei[1].n != 1) return -1; // multiple or no neighbor; do not merge
	if ((int64_t)p->nei[1].a[0].x < 0) return -2; // deleted neighbor
	if ((int)p->nei[1].a[0].y < min_merge_len) return -5;
	kq = kh_get(64, g->h, p->nei[1].a[0].x);
	assert(kq != kh_end(h)); // otherwise the neighbor is non-existent
	q = &g->v.a[kh_val((hash64_t*)g->h, kq)>>1];
	if (p == q) return -3; // we have a loop p->p. We cannot merge in this case
	if (q->nei[kh_val(h, kq)&1].n != 1) return -4; // the neighbor q has multiple neighbors. cannot be an unambiguous merge

	// we can perform a merge; do further consistency check (mostly check bugs)
	if (kh_val(h, kq)&1) mag_v_flip(g, q); // a "><" bidirectional arc; flip q
	kp = kh_get(64, g->h, p->k[1]); assert(kp != kh_end(h)); // get the iterator to p
	kh_del(64, g->h, kp); kh_del(64, g->h, kq); // remove the two ends of the arc in the hash table
	assert(p->k[1] == q->nei[0].a[0].x && q->k[0] == p->nei[1].a[0].x); // otherwise inconsistent topology
	assert(p->nei[1].a[0].y == q->nei[0].a[0].y); // the overlap length must be the same
	assert(p->len >= p->nei[1].a[0].y && q->len >= p->nei[1].a[0].y); // and the overlap is shorter than both vertices

	// update the read count and sequence length
	p->nsr += q->nsr;
	new_l = p->len + q->len - p->nei[1].a[0].y;
	if (new_l + 1 > p->max_len) { // grow p->seq and p->cov to hold the merged length plus the terminator
		p->max_len = new_l + 1;
		kroundup32(p->max_len);
		p->seq = realloc(p->seq, p->max_len);
		p->cov = realloc(p->cov, p->max_len);
	}
	// merge seq and cov
	// i starts where the overlap begins in p; j walks all of q
	for (i = p->len - p->nei[1].a[0].y, j = 0; j < q->len; ++i, ++j) { // write seq and cov
		p->seq[i] = q->seq[j];
		if (i < p->len) {
			// inside the overlap: add q's coverage (stored with an offset of 33), saturating at 126
			if ((int)p->cov[i] + (q->cov[j] - 33) > 126) p->cov[i] = 126;
			else p->cov[i] += q->cov[j] - 33;
		} else p->cov[i] = q->cov[j]; // past the old end of p: plain copy
	}
	p->seq[new_l] = p->cov[new_l] = 0;
	p->len = new_l;
	// merge neighbors
	free(p->nei[1].a);
	p->nei[1] = q->nei[1]; p->k[1] = q->k[1];
	q->nei[1].a = 0; // to avoid freeing p->nei[1] by mag_v_destroy() below
	// update the hash table for the right end of p
	kp = kh_get(64, g->h, p->k[1]);
	assert(kp != kh_end((hash64_t*)g->h));
	kh_val(h, kp) = (p - g->v.a)<<1 | 1;
	// clean up q
	mag_v_destroy(q);
	return 0;
}
365 | ||
366 | void mag_g_merge(mag_t *g, int rmdup, int min_merge_len) | |
367 | { | |
368 | int i; | |
369 | uint64_t n = 0; | |
370 | for (i = 0; i < g->v.n; ++i) { // remove multiedges; FIXME: should we do that? | |
371 | if (rmdup) { | |
372 | v128_rmdup(&g->v.a[i].nei[0]); | |
373 | v128_rmdup(&g->v.a[i].nei[1]); | |
374 | } else { | |
375 | v128_clean(&g->v.a[i].nei[0]); | |
376 | v128_clean(&g->v.a[i].nei[1]); | |
377 | } | |
378 | } | |
379 | for (i = 0; i < g->v.n; ++i) { | |
380 | magv_t *p = &g->v.a[i]; | |
381 | if (p->len < 0) continue; | |
382 | while (mag_vh_merge_try(g, p, min_merge_len) == 0) ++n; | |
383 | mag_v_flip(g, p); | |
384 | while (mag_vh_merge_try(g, p, min_merge_len) == 0) ++n; | |
385 | } | |
386 | if (fm_verbose >= 3) | |
387 | fprintf(stderr, "[M::%s] unambiguously merged %ld pairs of vertices\n", __func__, (long)n); | |
388 | } | |
389 | ||
390 | /***************************** | |
391 | * Easy graph simplification * | |
392 | *****************************/ | |
393 | ||
394 | typedef magv_t *magv_p; | |
395 | ||
396 | #define mag_vlt1(a, b) ((a)->nsr < (b)->nsr || ((a)->nsr == (b)->nsr && (a)->len < (b)->len)) | |
397 | KSORT_INIT(vlt1, magv_p, mag_vlt1) | |
398 | ||
399 | #define mag_vlt2(a, b) ((a)->nei[0].n + (a)->nei[1].n < (b)->nei[0].n + (b)->nei[1].n) | |
400 | KSORT_INIT(vlt2, magv_p, mag_vlt2) | |
401 | ||
402 | int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr) | |
403 | { | |
404 | int i; | |
405 | kvec_t(magv_p) a = {0,0,0}; | |
406 | ||
407 | for (i = 0; i < g->v.n; ++i) { | |
408 | magv_t *p = &g->v.a[i]; | |
409 | if (p->len < 0 || (p->nei[0].n > 0 && p->nei[1].n > 0)) continue; | |
410 | if (p->len >= min_len || p->nsr >= min_nsr) continue; | |
411 | kv_push(magv_p, a, p); | |
412 | } | |
413 | ks_introsort(vlt1, a.n, a.a); | |
414 | for (i = 0; i < a.n; ++i) mag_v_del(g, a.a[i]); | |
415 | free(a.a); | |
416 | if (fm_verbose >= 3) | |
417 | fprintf(stderr, "[M::%s] removed %ld tips (min_len=%d, min_nsr=%d)\n", __func__, a.n, min_len, min_nsr); | |
418 | return a.n; | |
419 | } | |
420 | ||
421 | int mag_g_rm_vint(mag_t *g, int min_len, int min_nsr, int min_ovlp) | |
422 | { | |
423 | int i; | |
424 | kvec_t(magv_p) a = {0,0,0}; | |
425 | ||
426 | for (i = 0; i < g->v.n; ++i) { | |
427 | magv_t *p = &g->v.a[i]; | |
428 | if (p->len >= 0 && p->len < min_len && p->nsr < min_nsr) | |
429 | kv_push(magv_p, a, p); | |
430 | } | |
431 | ks_introsort(vlt1, a.n, a.a); | |
432 | for (i = 0; i < a.n; ++i) mag_v_transdel(g, a.a[i], min_ovlp); | |
433 | free(a.a); | |
434 | if (fm_verbose >= 3) | |
435 | fprintf(stderr, "[M::%s] removed %ld internal vertices (min_len=%d, min_nsr=%d)\n", __func__, a.n, min_len, min_nsr); | |
436 | return a.n; | |
437 | } | |
438 | ||
439 | void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr) | |
440 | { | |
441 | int i, j, k; | |
442 | kvec_t(magv_p) a = {0,0,0}; | |
443 | uint64_t n_marked = 0; | |
444 | ||
445 | for (i = 0; i < g->v.n; ++i) { | |
446 | magv_t *p = &g->v.a[i]; | |
447 | if (p->len < 0) continue; | |
448 | if ((p->nei[0].n == 0 || p->nei[1].n == 0) && p->len < min_len && p->nsr < min_nsr) | |
449 | continue; // skip tips | |
450 | kv_push(magv_p, a, p); | |
451 | } | |
452 | ks_introsort(vlt1, a.n, a.a); | |
453 | ||
454 | for (i = a.n - 1; i >= 0; --i) { | |
455 | magv_t *p = a.a[i]; | |
456 | for (j = 0; j < 2; ++j) { | |
457 | ku128_v *r = &p->nei[j]; | |
458 | int max_ovlp = min_ovlp, max_k = -1; | |
459 | if (r->n == 0) continue; // no overlapping reads | |
460 | for (k = 0; k < r->n; ++k) // get the max overlap length | |
461 | if (max_ovlp < r->a[k].y) | |
462 | max_ovlp = r->a[k].y, max_k = k; | |
463 | if (max_k >= 0) { // test if max_k is a tip | |
464 | uint64_t x = tid2idd(g->h, r->a[max_k].x); | |
465 | magv_t *q = &g->v.a[x>>1]; | |
466 | if (q->len >= 0 && (q->nei[0].n == 0 || q->nei[1].n == 0) && q->len < min_len && q->nsr < min_nsr) | |
467 | max_ovlp = min_ovlp; | |
468 | } | |
469 | for (k = 0; k < r->n; ++k) { | |
470 | if (edge_is_del(r->a[k])) continue; | |
471 | if (r->a[k].y < min_ovlp || (double)r->a[k].y / max_ovlp < min_ratio) { | |
472 | mag_eh_markdel(g, r->a[k].x, p->k[j]); // FIXME: should we check if r->a[k] is p itself? | |
473 | edge_mark_del(r->a[k]); | |
474 | ++n_marked; | |
475 | } | |
476 | } | |
477 | } | |
478 | } | |
479 | free(a.a); | |
480 | if (fm_verbose >= 3) | |
481 | fprintf(stderr, "[M::%s] removed %ld edges\n", __func__, (long)n_marked); | |
482 | } | |
483 | ||
484 | /********************************************* | |
485 | * A-statistics and simplistic flow analysis * | |
486 | *********************************************/ | |
487 | ||
488 | #define A_THRES 20. | |
489 | #define A_MIN_SUPP 5 | |
490 | ||
/*
 * Estimate the average "read distance" of the graph: the summed (adjusted)
 * vertex length divided by the summed number of supporting reads.
 *
 * Vertices are keyed by nsr (high 32 bits) and index (low 32 bits), sorted,
 * and walked from the end of the sorted array until at least half of all
 * supporting reads have been accumulated.  Two passes are made: the first
 * uses every vertex (rdist is still -1), the second skips vertices whose
 * A statistic, computed from the first estimate, is below A_THRES.
 *
 * Returns the estimate from the second pass.
 *
 * NOTE(review): tmp2 (mean overlap with the first neighbour on each side) is
 * computed but never read; the length correction subtracts tmp1 (the number
 * of non-empty sides, 0..2) instead.  Confirm whether tmp2 was intended.
 * NOTE(review): the calloc() result is not checked, and sum_n may be 0 for an
 * empty graph, in which case the division yields a non-finite value.
 */
491 | double mag_cal_rdist(mag_t *g) | |
492 | { | |
493 | magv_v *v = &g->v; | |
494 | int j; | |
495 | uint64_t *srt; | |
496 | double rdist = -1.; | |
497 | int64_t i, sum_n_all, sum_n, sum_l; | |
498 | ||
499 | srt = calloc(v->n, 8); | |
// build the sort keys (nsr<<32 | index) and the total read count
500 | for (i = 0, sum_n_all = 0; i < v->n; ++i) { | |
501 | srt[i] = (uint64_t)v->a[i].nsr<<32 | i; | |
502 | sum_n_all += v->a[i].nsr; | |
503 | } | |
504 | ks_introsort_uint64_t(v->n, srt); | |
505 | ||
// pass 0: plain estimate; pass 1: estimate restricted by the A statistic
506 | for (j = 0; j < 2; ++j) { | |
507 | sum_n = sum_l = 0; | |
508 | for (i = v->n - 1; i >= 0; --i) { | |
// the low 32 bits of the key are the vertex index
509 | const magv_t *p = &v->a[srt[i]<<32>>32]; | |
510 | int tmp1, tmp2; | |
511 | tmp1 = tmp2 = 0; | |
512 | if (p->nei[0].n) ++tmp1, tmp2 += p->nei[0].a[0].y; | |
513 | if (p->nei[1].n) ++tmp1, tmp2 += p->nei[1].a[0].y; | |
514 | if (tmp1) tmp2 /= tmp1; | |
515 | if (rdist > 0.) { | |
516 | double A = (p->len - tmp1) / rdist - p->nsr * M_LN2; | |
517 | if (A < A_THRES) continue; | |
518 | } | |
519 | sum_n += p->nsr; | |
520 | sum_l += p->len - tmp1; | |
// stop once half of all supporting reads are covered
521 | if (sum_n >= sum_n_all * 0.5) break; | |
522 | } | |
523 | rdist = (double)sum_l / sum_n; | |
524 | } | |
525 | if (fm_verbose >= 3) { | |
526 | fprintf(stderr, "[M::%s] average read distance %.3f.\n", __func__, rdist); | |
527 | fprintf(stderr, "[M::%s] approximate genome size: %.0f (inaccurate!)\n", __func__, rdist * sum_n_all); | |
528 | } | |
529 | ||
530 | free(srt); | |
531 | return rdist; | |
532 | } | |
533 | ||
534 | /************** | |
535 | * Key portal * | |
536 | **************/ | |
537 | ||
538 | void mag_init_opt(magopt_t *o) | |
539 | { | |
540 | memset(o, 0, sizeof(magopt_t)); | |
541 | o->trim_len = 0; | |
542 | o->trim_depth = 6; | |
543 | ||
544 | o->min_elen = 300; | |
545 | o->min_ovlp = 0; | |
546 | o->min_merge_len = 0; | |
547 | o->min_ensr = 4; | |
548 | o->min_insr = 3; | |
549 | o->min_dratio1 = 0.7; | |
550 | ||
551 | o->max_bcov = 10.; | |
552 | o->max_bfrac = 0.15; | |
553 | o->max_bvtx = 64; | |
554 | o->max_bdist = 512; | |
555 | } | |
556 | ||
/*
 * Run the graph-cleaning pipeline on g using the thresholds in opt.
 *
 * The steps are strictly ordered: every removal pass is followed by a merge
 * pass so that later passes see the simplified graph.  g->min_ovlp is raised
 * to opt->min_ovlp first if it is smaller.
 */
557 | void mag_g_clean(mag_t *g, const magopt_t *opt) | |
558 | { | |
559 | int j; | |
560 | ||
561 | if (g->min_ovlp < opt->min_ovlp) g->min_ovlp = opt->min_ovlp; | |
// remove vertices via mag_g_rm_vext with an increasing min_nsr threshold (2..min_ensr)
562 | for (j = 2; j <= opt->min_ensr; ++j) | |
563 | mag_g_rm_vext(g, opt->min_elen, j); | |
564 | mag_g_merge(g, 0, opt->min_merge_len); | |
565 | mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr); | |
566 | mag_g_merge(g, 1, opt->min_merge_len); | |
// second round of the same increasing-threshold removal
567 | for (j = 2; j <= opt->min_ensr; ++j) | |
568 | mag_g_rm_vext(g, opt->min_elen, j); | |
569 | mag_g_merge(g, 0, opt->min_merge_len); | |
// optional passes controlled by opt->flag
570 | if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen); | |
571 | if (!(opt->flag & MAG_F_NO_SIMPL)) mag_g_simplify_bubble(g, opt->max_bvtx, opt->max_bdist); | |
572 | mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->flag & MAG_F_AGGRESSIVE); | |
573 | mag_g_rm_vint(g, opt->min_elen, opt->min_insr, g->min_ovlp); | |
574 | mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr); | |
575 | mag_g_merge(g, 1, opt->min_merge_len); | |
576 | mag_g_rm_vext(g, opt->min_elen, opt->min_ensr); | |
577 | mag_g_merge(g, 0, opt->min_merge_len); | |
578 | if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen); | |
579 | mag_g_rm_vext(g, opt->min_elen, opt->min_ensr); | |
580 | mag_g_merge(g, 0, opt->min_merge_len); | |
581 | } | |
582 | ||
/*
 * Trim low-coverage bases from the open (neighbour-less) end(s) of vertex v.
 *
 * - A vertex with neighbours on both sides is left untouched.
 * - A vertex with no neighbours at all and len < 3*trim_len is deleted.
 * - Otherwise, for each side without neighbours, up to tl[side] bases whose
 *   coverage value (cov[i] - 33) is below trim_depth are removed.  tl[side]
 *   is capped both by trim_len and by len minus the largest overlap recorded
 *   on the opposite side.
 *
 * NOTE(review): the "- 33" presumably undoes an ASCII offset in the cov
 * encoding -- confirm.  seq/cov are not re-terminated here after trimming.
 */
583 | void mag_v_trim_open(mag_t *g, magv_t *v, int trim_len, int trim_depth) | |
584 | { | |
585 | int i, j, tl[2]; | |
586 | if (v->nei[0].n > 0 && v->nei[1].n > 0) return; // no open end; do nothing | |
587 | if (v->nei[0].n == 0 && v->nei[1].n == 0 && v->len < trim_len * 3) { // disconnected short vertex | |
588 | mag_v_del(g, v); | |
589 | return; | |
590 | } | |
// tl[j]: maximum trim on side j, limited by the largest overlap on the other side
591 | for (j = 0; j < 2; ++j) { | |
592 | ku128_v *r = &v->nei[!j]; | |
593 | int max_ovlp = 0; | |
594 | for (i = 0; i < r->n; ++i) | |
595 | max_ovlp = max_ovlp > r->a[i].y? max_ovlp : r->a[i].y; | |
596 | tl[j] = v->len - max_ovlp < trim_len? v->len - max_ovlp : trim_len; | |
597 | } | |
// open side 0: drop the leading low-coverage run and shift seq/cov down
598 | if (v->nei[0].n == 0) { | |
599 | for (i = 0; i < tl[0] && v->cov[i] - 33 < trim_depth; ++i); | |
600 | tl[0] = i; | |
601 | v->len -= i; | |
602 | memmove(v->seq, v->seq + tl[0], v->len); | |
603 | memmove(v->cov, v->cov + tl[0], v->len); | |
604 | } | |
// open side 1: drop the trailing low-coverage run by shortening len only
605 | if (v->nei[1].n == 0) { | |
606 | for (i = v->len - 1; i >= v->len - tl[1] && v->cov[i] - 33 < trim_depth; --i); | |
607 | tl[1] = v->len - 1 - i; | |
608 | v->len -= tl[1]; | |
609 | } | |
610 | } | |
611 | ||
612 | void mag_g_trim_open(mag_t *g, const magopt_t *opt) | |
613 | { | |
614 | int i; | |
615 | if (opt->trim_len == 0) return; | |
616 | for (i = 0; i < g->v.n; ++i) | |
617 | mag_v_trim_open(g, &g->v.a[i], opt->trim_len, opt->trim_depth); | |
618 | } |
0 | #ifndef FM_MOG_H | |
1 | #define FM_MOG_H | |
2 | ||
3 | #include <stdint.h> | |
4 | #include <stdlib.h> | |
5 | #include "kstring.h" | |
6 | #include "fml.h" | |
7 | ||
/* Generic integer containers; guarded so that another header may define them first. */
8 | #ifndef KINT_DEF | |
9 | #define KINT_DEF | |
/* pair of 64-bit integers */
10 | typedef struct { uint64_t x, y; } ku128_t; | |
/* growable arrays: n = number of elements, a = storage; m is presumably the allocated capacity */
11 | typedef struct { size_t n, m; uint64_t *a; } ku64_v; | |
12 | typedef struct { size_t n, m; ku128_t *a; } ku128_v; | |
13 | #endif | |
14 | ||
/*
 * One vertex of the assembly graph.  nei[0] and nei[1] hold the edges on the
 * two sides of the vertex; within an edge, y is used as the overlap length.
 */
15 | typedef struct { | |
16 | int len, nsr; // length; number supporting reads | |
17 | uint32_t max_len;// allocated seq/cov size | |
18 | uint64_t k[2]; // bi-interval | |
19 | ku128_v nei[2]; // neighbors | |
20 | char *seq, *cov; // sequence and coverage | |
21 | void *ptr; // additional information | |
22 | } magv_t; | |
23 | ||
/* growable array of vertices: n = number of slots in use, a = storage */
24 | typedef struct { size_t n, m; magv_t *a; } magv_v; | |
25 | ||
/* The assembly graph: the vertex array plus graph-wide values. */
26 | typedef struct mag_t { | |
27 | magv_v v; | |
28 | float rdist; // read distance | |
29 | int min_ovlp; // minimum overlap seen from the graph | |
// opaque handle; presumably the hash built by mag_g_build_hash() -- confirm
30 | void *h; | |
31 | } mag_t; | |
32 | ||
33 | struct mogb_aux; | |
34 | typedef struct mogb_aux mogb_aux_t; | |
35 | ||
36 | #ifdef __cplusplus | |
37 | extern "C" { | |
38 | #endif | |
39 | ||
/* Zero *o and fill in the default cleaning parameters. */
40 | void mag_init_opt(magopt_t *o); | |
/* Run the whole removal/merge cleaning pipeline on g with the thresholds in opt. */
41 | void mag_g_clean(mag_t *g, const magopt_t *opt); | |
42 | ||
/* Whole-graph operations (names starting with mag_g_). */
43 | void mag_g_destroy(mag_t *g); | |
44 | void mag_g_amend(mag_t *g); | |
45 | void mag_g_build_hash(mag_t *g); | |
46 | void mag_g_print(const mag_t *g); | |
47 | int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr); | |
48 | void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr); | |
49 | void mag_g_merge(mag_t *g, int rmdup, int min_merge_len); | |
50 | void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist); | |
51 | void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive); | |
52 | void mag_g_pop_open(mag_t *g, int min_elen); | |
/* Trim low-coverage open ends of every vertex; does nothing when opt->trim_len == 0. */
53 | void mag_g_trim_open(mag_t *g, const magopt_t *opt); | |
54 | ||
/* Single-vertex operations (names starting with mag_v_). */
55 | void mag_v_copy_to_empty(magv_t *dst, const magv_t *src); // NB: memory leak if dst is allocated | |
56 | void mag_v_del(mag_t *g, magv_t *p); | |
57 | void mag_v_write(const magv_t *p, kstring_t *out); | |
58 | void mag_v_pop_open(mag_t *g, magv_t *p, int min_elen); | |
59 | ||
60 | uint64_t mag_tid2idd(void *h, uint64_t tid); | |
61 | void mag_v128_clean(ku128_v *r); | |
/* Estimate the average read distance of the graph and return it. */
62 | double mag_cal_rdist(mag_t *g); | |
63 | ||
64 | #ifdef __cplusplus | |
65 | } | |
66 | #endif | |
67 | ||
68 | #endif |
0 | #include <assert.h> | |
1 | #include "internal.h" | |
2 | #include "kstring.h" | |
3 | #include "rle.h" | |
4 | #include "mrope.h" | |
5 | #include "rld0.h" | |
6 | #include "mag.h" | |
7 | #include "kvec.h" | |
8 | #include "fml.h" | |
9 | #include "htab.h" | |
10 | ||
/*
 * Byte -> nucleotide code lookup table, indexed by the unsigned byte value:
 * 'A'/'a' -> 1, 'C'/'c' -> 2, 'G'/'g' -> 3, 'T'/'t' -> 4, anything else -> 5.
 * With this encoding a base and its complement always sum to 5.
 */
11 | unsigned char seq_nt6_table[256] = { | |
12 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
13 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
14 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
15 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
16 | 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, | |
17 | 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
18 | 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, | |
19 | 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
20 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
21 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
22 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
23 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
24 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
25 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
26 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, | |
27 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 | |
28 | }; | |
29 | ||
30 | void fml_opt_init(fml_opt_t *opt) | |
31 | { | |
32 | opt->n_threads = 1; | |
33 | opt->ec_k = 0; | |
34 | opt->min_cnt = 4; | |
35 | opt->max_cnt = 8; | |
36 | opt->min_asm_ovlp = 33; | |
37 | opt->min_merge_len = 0; | |
38 | mag_init_opt(&opt->mag_opt); | |
39 | opt->mag_opt.flag = MAG_F_NO_SIMPL; | |
40 | } | |
41 | ||
42 | void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs) | |
43 | { | |
44 | int i, log_len; | |
45 | uint64_t tot_len = 0; | |
46 | if (opt->n_threads < 1) opt->n_threads = 1; | |
47 | for (i = 0; i < n_seqs; ++i) tot_len += seqs[i].l_seq; // compute total length | |
48 | for (log_len = 10; log_len < 32; ++log_len) // compute ceil(log2(tot_len)) | |
49 | if (1ULL<<log_len > tot_len) break; | |
50 | if (opt->ec_k == 0) opt->ec_k = (log_len + 12) / 2; | |
51 | if (opt->ec_k%2 == 0) ++opt->ec_k; | |
52 | opt->mag_opt.min_elen = (int)((double)tot_len / n_seqs * 2.5 + .499); | |
53 | } | |
54 | ||
/*
 * Return 1 if the first l codes of s pair up, from the outside in, so that
 * every pair sums to 5, and 0 otherwise.
 * An odd length is never accepted; a length of 0 is accepted.
 */
static inline int is_rev_same(int l, const char *s)
{
	const int half = l >> 1;
	int lo = 0, hi = l - 1;

	if (l & 1) return 0;
	while (lo < half) {
		if (s[lo] + s[hi] != 5) return 0; /* mismatching pair */
		++lo;
		--hi;
	}
	return lo == half;
}
63 | ||
64 | struct rld_t *fml_fmi_gen(int n, bseq1_t *seq, int is_mt) | |
65 | { | |
66 | mrope_t *mr; | |
67 | kstring_t str = {0,0,0}; | |
68 | mritr_t itr; | |
69 | rlditr_t di; | |
70 | const uint8_t *block; | |
71 | rld_t *e = 0; | |
72 | int k; | |
73 | ||
74 | mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO); | |
75 | for (k = 0; k < n; ++k) { | |
76 | int i; | |
77 | bseq1_t *s = &seq[k]; | |
78 | if (s->l_seq == 0) continue; | |
79 | free(s->qual); | |
80 | for (i = 0; i < s->l_seq; ++i) | |
81 | s->seq[i] = seq_nt6_table[(int)s->seq[i]]; | |
82 | for (i = 0; i < s->l_seq; ++i) | |
83 | if (s->seq[i] == 5) break; | |
84 | if (i < s->l_seq) { | |
85 | free(s->seq); | |
86 | continue; | |
87 | } | |
88 | if (is_rev_same(s->l_seq, s->seq)) | |
89 | --s->l_seq, s->seq[s->l_seq] = 0; | |
90 | seq_reverse(s->l_seq, (uint8_t*)s->seq); | |
91 | kputsn(s->seq, s->l_seq + 1, &str); | |
92 | seq_revcomp6(s->l_seq, (uint8_t*)s->seq); | |
93 | kputsn(s->seq, s->l_seq + 1, &str); | |
94 | free(s->seq); | |
95 | } | |
96 | free(seq); | |
97 | mr_insert_multi(mr, str.l, (uint8_t*)str.s, is_mt); | |
98 | free(str.s); | |
99 | ||
100 | e = rld_init(6, 3); | |
101 | rld_itr_init(e, &di, 0); | |
102 | mr_itr_first(mr, &itr, 1); | |
103 | while ((block = mr_itr_next_block(&itr)) != 0) { | |
104 | const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block); | |
105 | while (q < end) { | |
106 | int c = 0; | |
107 | int64_t l; | |
108 | rle_dec1(q, c, l); | |
109 | rld_enc(e, &di, l, c); | |
110 | } | |
111 | } | |
112 | rld_enc_finish(e, &di); | |
113 | ||
114 | mr_destroy(mr); | |
115 | return e; | |
116 | } | |
117 | ||
118 | struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq) | |
119 | { | |
120 | return fml_fmi_gen(n, seq, opt->n_threads > 1? 1 : 0); | |
121 | } | |
122 | ||
123 | void fml_fmi_destroy(rld_t *e) | |
124 | { | |
125 | rld_destroy(e); | |
126 | } | |
127 | ||
128 | void fml_mag_clean(const fml_opt_t *opt, struct mag_t *g) | |
129 | { | |
130 | magopt_t o = opt->mag_opt; | |
131 | o.min_merge_len = opt->min_merge_len; | |
132 | mag_g_merge(g, 1, opt->min_merge_len); | |
133 | mag_g_clean(g, &o); | |
134 | mag_g_trim_open(g, &o); | |
135 | } | |
136 | ||
/* Release an assembly graph; simply forwards to mag_g_destroy(). */
void fml_mag_destroy(struct mag_t *g)
{
	mag_g_destroy(g);
}
141 | ||
142 | #include "khash.h" | |
143 | KHASH_DECLARE(64, uint64_t, uint64_t) | |
144 | ||
/* An edge counts as deleted when its x field is (uint64_t)-2 or its y field (overlap length) is 0. */
145 | #define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0) // from mag.c | |
146 | ||
/*
 * Convert an assembly graph into an array of unitigs and consume the graph.
 *
 * @param g  graph to convert; its hash, vertex array and g itself are freed
 * @param n  receives the number of unitigs returned
 * @return   calloc'ed array of *n unitigs
 *
 * Live vertices (len >= 0) are numbered consecutively.  Each unitig takes
 * over the vertex's seq and cov buffers (no copy); seq is rewritten from
 * nucleotide codes to the letters "$ACGTN" and both buffers are
 * 0-terminated at index len.  Non-deleted edges become fml_ovlp_t records,
 * side 0 first and then side 1, with tid = (target unitig index << 1 | end).
 *
 * NOTE(review): vertices with len < 0 are skipped before any free(), so
 * their buffers are presumably released when the vertex is deleted --
 * confirm.  The calloc() results are not checked.
 */
147 | fml_utg_t *fml_mag2utg(struct mag_t *g, int *n) | |
148 | { | |
149 | size_t i, j; | |
150 | fml_utg_t *utg; | |
151 | khash_t(64) *h; | |
152 | khint_t k; | |
153 | ||
// pass 1: map both k[] values of each live vertex to (new index << 1 | end)
154 | h = kh_init(64); | |
155 | for (i = j = 0; i < g->v.n; ++i) { | |
156 | int absent; | |
157 | magv_t *p = &g->v.a[i]; | |
158 | if (p->len < 0) continue; | |
159 | k = kh_put(64, h, p->k[0], &absent); | |
160 | kh_val(h, k) = j<<1 | 0; | |
161 | k = kh_put(64, h, p->k[1], &absent); | |
162 | kh_val(h, k) = j<<1 | 1; | |
163 | ++j; | |
164 | } | |
165 | *n = j; | |
// the graph's own hash is no longer needed
166 | kh_destroy(64, g->h); | |
167 | ||
// pass 2: fill one unitig per live vertex
168 | utg = (fml_utg_t*)calloc(*n, sizeof(fml_utg_t)); | |
169 | for (i = j = 0; i < g->v.n; ++i) { | |
170 | magv_t *p = &g->v.a[i]; | |
171 | fml_utg_t *q; | |
172 | int from, a, b; | |
173 | if (p->len < 0) continue; | |
174 | q = &utg[j++]; | |
175 | q->len = p->len, q->nsr = p->nsr; | |
// ownership of the buffers moves to the unitig
176 | q->seq = p->seq, q->cov = p->cov; | |
177 | for (a = 0; a < q->len; ++a) | |
178 | q->seq[a] = "$ACGTN"[(int)q->seq[a]]; | |
179 | q->seq[q->len] = q->cov[q->len] = 0; | |
// count the surviving edges on each side
180 | for (from = 0; from < 2; ++from) { | |
181 | ku128_v *r = &p->nei[from]; | |
182 | for (b = q->n_ovlp[from] = 0; b < r->n; ++b) | |
183 | if (!edge_is_del(r->a[b])) ++q->n_ovlp[from]; | |
184 | } | |
// copy the surviving edges, translating the target id through h
185 | q->ovlp = (fml_ovlp_t*)calloc(q->n_ovlp[0] + q->n_ovlp[1], sizeof(fml_ovlp_t)); | |
186 | for (from = a = 0; from < 2; ++from) { | |
187 | ku128_v *r = &p->nei[from]; | |
188 | for (b = 0; b < r->n; ++b) { | |
189 | ku128_t *s = &r->a[b]; | |
190 | fml_ovlp_t *t; | |
191 | if (edge_is_del(*s)) continue; | |
192 | t = &q->ovlp[a++]; | |
193 | k = kh_get(64, h, s->x); | |
194 | assert(k != kh_end(h)); | |
195 | t->tid = kh_val(h, k); | |
196 | t->len = s->y; | |
197 | t->from = from; | |
198 | } | |
199 | free(p->nei[from].a); | |
200 | } | |
201 | } | |
202 | kh_destroy(64, h); | |
203 | free(g->v.a); | |
204 | free(g); | |
205 | return utg; | |
206 | } | |
207 | ||
208 | void fml_utg_print(int n, const fml_utg_t *utg) | |
209 | { | |
210 | int i, j, l; | |
211 | kstring_t out = {0,0,0}; | |
212 | for (i = 0; i < n; ++i) { | |
213 | const fml_utg_t *u = &utg[i]; | |
214 | out.l = 0; | |
215 | kputc('@', &out); kputw(i<<1|0, &out); kputc(':', &out); kputw(i<<1|1, &out); | |
216 | kputc('\t', &out); kputw(u->nsr, &out); | |
217 | kputc('\t', &out); | |
218 | for (j = 0; j < u->n_ovlp[0]; ++j) { | |
219 | kputw(u->ovlp[j].tid, &out); kputc(',', &out); | |
220 | kputw(u->ovlp[j].len, &out); kputc(';', &out); | |
221 | } | |
222 | if (u->n_ovlp[0] == 0) kputc('.', &out); | |
223 | kputc('\t', &out); | |
224 | for (; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) { | |
225 | kputw(u->ovlp[j].tid, &out); kputc(',', &out); | |
226 | kputw(u->ovlp[j].len, &out); kputc(';', &out); | |
227 | } | |
228 | if (u->n_ovlp[1] == 0) kputc('.', &out); | |
229 | kputc('\n', &out); | |
230 | l = out.l; | |
231 | kputsn(u->seq, u->len, &out); | |
232 | kputsn("\n+\n", 3, &out); | |
233 | kputsn(u->cov, u->len, &out); | |
234 | kputc('\n', &out); | |
235 | fputs(out.s, stdout); | |
236 | } | |
237 | free(out.s); | |
238 | } | |
239 | ||
240 | void fml_utg_destroy(int n, fml_utg_t *utg) | |
241 | { | |
242 | int i; | |
243 | for (i = 0; i < n; ++i) { | |
244 | free(utg[i].seq); | |
245 | free(utg[i].cov); | |
246 | free(utg[i].ovlp); | |
247 | } | |
248 | free(utg); | |
249 | } | |
250 | ||
251 | #define MAG_MIN_NSR_COEF .1 | |
252 | ||
253 | fml_utg_t *fml_assemble(const fml_opt_t *opt0, int n_seqs, bseq1_t *seqs, int *n_utg) | |
254 | { | |
255 | rld_t *e; | |
256 | mag_t *g; | |
257 | fml_utg_t *utg; | |
258 | fml_opt_t opt = *opt0; | |
259 | float kcov; | |
260 | ||
261 | fml_opt_adjust(&opt, n_seqs, seqs); | |
262 | if (opt.ec_k >= 0) fml_correct(&opt, n_seqs, seqs); | |
263 | kcov = fml_fltuniq(&opt, n_seqs, seqs); | |
264 | e = fml_seq2fmi(&opt, n_seqs, seqs); | |
265 | g = fml_fmi2mag(&opt, e); | |
266 | opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > kcov * MAG_MIN_NSR_COEF? opt.mag_opt.min_ensr : (int)(kcov * MAG_MIN_NSR_COEF + .499); | |
267 | opt.mag_opt.min_ensr = opt.mag_opt.min_ensr < opt0->max_cnt? opt.mag_opt.min_ensr : opt0->max_cnt; | |
268 | opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > opt0->min_cnt? opt.mag_opt.min_ensr : opt0->min_cnt; | |
269 | opt.mag_opt.min_insr = opt.mag_opt.min_ensr - 1; | |
270 | fml_mag_clean(&opt, g); | |
271 | utg = fml_mag2utg(g, n_utg); | |
272 | return utg; | |
273 | } |
0 | #include <stdlib.h> | |
1 | #include <string.h> | |
2 | #include <assert.h> | |
3 | #include <unistd.h> | |
4 | #include <pthread.h> | |
5 | #include <stdio.h> | |
6 | #include <time.h> | |
7 | #include "mrope.h" | |
8 | ||
9 | /******************************* | |
10 | *** Single-string insertion *** | |
11 | *******************************/ | |
12 | ||
13 | mrope_t *mr_init(int max_nodes, int block_len, int sorting_order) | |
14 | { | |
15 | int a; | |
16 | mrope_t *r; | |
17 | assert(sorting_order >= 0 && sorting_order <= 2); | |
18 | r = calloc(1, sizeof(mrope_t)); | |
19 | r->so = sorting_order; | |
20 | r->thr_min = 1000; | |
21 | for (a = 0; a != 6; ++a) | |
22 | r->r[a] = rope_init(max_nodes, block_len); | |
23 | return r; | |
24 | } | |
25 | ||
26 | void mr_destroy(mrope_t *r) | |
27 | { | |
28 | int a; | |
29 | for (a = 0; a != 6; ++a) | |
30 | if (r->r[a]) rope_destroy(r->r[a]); | |
31 | free(r); | |
32 | } | |
33 | ||
34 | int mr_thr_min(mrope_t *r, int thr_min) | |
35 | { | |
36 | if (thr_min > 0) | |
37 | r->thr_min = thr_min; | |
38 | return r->thr_min; | |
39 | } | |
40 | ||
/*
 * Insert one 0-terminated string of symbol codes into the multi-rope, one
 * symbol at a time.
 *
 * [l,u) is the current insertion interval and b the bucket (the previously
 * inserted symbol, 0 at the start).  In the sorted modes the interval starts
 * as [0, number of sentinels) and is narrowed with rank queries after each
 * symbol; in input order (MR_SO_IO) it starts empty at the end.  In
 * MR_SO_RCLO mode symbols other than 5 are ordered 4,3,2,1 then 0 when the
 * insertion offset is computed.
 *
 * Returns the value of the final rope_insert_run() call that inserts the
 * sentinel (symbol 0).
 */
41 | int64_t mr_insert1(mrope_t *r, const uint8_t *str) | |
42 | { | |
43 | int64_t tl[6], tu[6], l, u; | |
44 | const uint8_t *p; | |
45 | int b, is_srt = (r->so != MR_SO_IO), is_comp = (r->so == MR_SO_RCLO); | |
// u = total number of sentinels over all buckets
46 | for (u = 0, b = 0; b != 6; ++b) u += r->r[b]->c[0]; | |
47 | l = is_srt? 0 : u; | |
48 | for (p = str, b = 0; *p; b = *p++) { | |
49 | int a; | |
50 | if (l != u) { | |
51 | int64_t cnt = 0; | |
52 | rope_rank2a(r->r[b], l, u, tl, tu); | |
// skip the symbols in [l,u) that sort before *p
53 | if (is_comp && *p != 5) { | |
54 | for (a = 4; a > *p; --a) l += tu[a] - tl[a]; | |
55 | l += tu[0] - tl[0]; | |
56 | } else for (a = 0; a < *p; ++a) l += tu[a] - tl[a]; | |
57 | rope_insert_run(r->r[b], l, *p, 1, 0); | |
// add the counts of *p in the buckets before b to get the next interval
58 | while (--b >= 0) cnt += r->r[b]->c[*p]; | |
59 | l = cnt + tl[*p]; u = cnt + tu[*p]; | |
60 | } else { | |
// empty interval: the insertion point is already determined
61 | l = rope_insert_run(r->r[b], l, *p, 1, 0); | |
62 | while (--b >= 0) l += r->r[b]->c[*p]; | |
63 | u = l; | |
64 | } | |
65 | } | |
66 | return rope_insert_run(r->r[b], l, 0, 1, 0); | |
67 | } | |
68 | ||
/*
 * Rank query over the whole multi-rope: fill cx[0..5] (and cy[0..5]) with
 * the per-symbol counts for positions x (and y).
 *
 * The six ropes are treated as one concatenated sequence.  The bucket that
 * contains x is located first; c[] accumulates the symbol counts of all
 * buckets before it and z their total length.  If y lies in the same bucket
 * a single rope_rank2a() call serves both positions.  Otherwise cx is
 * computed alone and the search continues forward for the bucket of y.
 *
 * When y < 0 only cx is computed and cy is never written.
 */
69 | void mr_rank2a(const mrope_t *mr, int64_t x, int64_t y, int64_t *cx, int64_t *cy) | |
70 | { | |
71 | int a, b; | |
72 | int64_t z, c[6], l; | |
// 48 == sizeof(int64_t[6])
73 | memset(c, 0, 48); | |
// find the bucket a holding x
74 | for (a = 0, z = 0; a < 6; ++a) { | |
75 | const int64_t *ca = mr->r[a]->c; | |
76 | l = ca[0] + ca[1] + ca[2] + ca[3] + ca[4] + ca[5]; | |
77 | if (z + l >= x) break; | |
78 | for (b = 0; b < 6; ++b) c[b] += ca[b]; | |
79 | z += l; | |
80 | } | |
81 | assert(a != 6); | |
82 | if (y >= 0 && z + l >= y) { // [x,y) is in the same bucket | |
83 | rope_rank2a(mr->r[a], x - z, y - z, cx, cy); | |
84 | for (b = 0; b < 6; ++b) | |
85 | cx[b] += c[b], cy[b] += c[b]; | |
86 | return; | |
87 | } | |
// x and y are in different buckets (or y is not requested)
88 | if (x != z) rope_rank1a(mr->r[a], x - z, cx); | |
89 | else memset(cx, 0, 48); | |
90 | for (b = 0; b < 6; ++b) | |
91 | cx[b] += c[b], c[b] += mr->r[a]->c[b]; | |
92 | if (y < 0) return; | |
// continue from the next bucket to find the one holding y
93 | for (++a, z += l; a < 6; ++a) { | |
94 | const int64_t *ca = mr->r[a]->c; | |
95 | l = ca[0] + ca[1] + ca[2] + ca[3] + ca[4] + ca[5]; | |
96 | if (z + l >= y) break; | |
97 | for (b = 0; b < 6; ++b) c[b] += ca[b]; | |
98 | z += l; | |
99 | } | |
100 | assert(a != 6); | |
// y at the very end of the bucket: use the bucket totals directly
101 | if (y != z + l) rope_rank1a(mr->r[a], y - z, cy); | |
102 | else for (b = 0; b < 6; ++b) cy[b] = mr->r[a]->c[b]; | |
103 | for (b = 0; b < 6; ++b) cy[b] += c[b]; | |
104 | } | |
105 | ||
106 | /********************** | |
107 | *** Mrope iterator *** | |
108 | **********************/ | |
109 | ||
110 | void mr_itr_first(mrope_t *r, mritr_t *i, int to_free) | |
111 | { | |
112 | i->a = 0; i->r = r; i->to_free = to_free; | |
113 | rope_itr_first(i->r->r[0], &i->i); | |
114 | } | |
115 | ||
116 | const uint8_t *mr_itr_next_block(mritr_t *i) | |
117 | { | |
118 | const uint8_t *s; | |
119 | if (i->a >= 6) return 0; | |
120 | while ((s = rope_itr_next_block(&i->i)) == 0) { | |
121 | if (i->to_free) { | |
122 | rope_destroy(i->r->r[i->a]); | |
123 | i->r->r[i->a] = 0; | |
124 | } | |
125 | if (++i->a == 6) return 0; | |
126 | rope_itr_first(i->r->r[i->a], &i->i); | |
127 | } | |
128 | return i->a == 6? 0 : s; | |
129 | } | |
130 | ||
131 | /***************************************** | |
132 | *** Inserting multiple strings in RLO *** | |
133 | *****************************************/ | |
134 | ||
/*
 * Per-string state during multi-string insertion: the current interval
 * [l,u), the symbol c about to be inserted, and p, the read position in the
 * string.  u and c are bit-fields sharing one 64-bit word (61 + 3 bits).
 */
135 | typedef struct { | |
136 | uint64_t l; | |
137 | uint64_t u:61, c:3; | |
138 | const uint8_t *p; | |
139 | } triple64_t; | |
140 | ||
141 | typedef const uint8_t *cstr_t; | |
142 | ||
/* maps codes 1..4 to 5 - c (4..1); other values are returned unchanged */
143 | #define rope_comp6(c) ((c) >= 1 && (c) <= 4? 5 - (c) : (c)) | |
144 | ||
/*
 * Insert the next symbol of each of the m strings in a[] into one rope.
 *
 * For every entry the next symbol is read (advancing a[k].p) and stored in
 * a[k].c.  Consecutive entries with the same upper bound u are processed as
 * one group: the symbols of the group are counted, inserted as runs into the
 * group's interval (sentinels first, then 1..4 in ascending order or, when
 * is_comp is set, in descending order 4..1, and finally symbol 5), and each
 * entry's [l,u) is updated to the interval of its own symbol.
 *
 * NOTE(review): grouping by a[k].u assumes entries with equal intervals are
 * adjacent in a[] -- confirm against the caller's bucket sort.
 */
145 | static void mr_insert_multi_aux(rope_t *rope, int64_t m, triple64_t *a, int is_comp) | |
146 | { | |
147 | int64_t k, beg; | |
148 | rpcache_t cache; | |
149 | memset(&cache, 0, sizeof(rpcache_t)); | |
150 | for (k = 0; k != m; ++k) // set the base to insert | |
151 | a[k].c = *a[k].p++; | |
// [beg,k) is the current group of entries sharing the same u
152 | for (k = 1, beg = 0; k <= m; ++k) { | |
153 | if (k == m || a[k].u != a[k-1].u) { | |
154 | int64_t x, i, l = a[beg].l, u = a[beg].u, tl[6], tu[6], c[6]; | |
155 | int start, end, step, b; | |
156 | if (l == u && k == beg + 1) { // special case; still works without the following block | |
157 | a[beg].l = a[beg].u = rope_insert_run(rope, l, a[beg].c, 1, &cache); | |
158 | beg = k; | |
159 | continue; | |
160 | } else if (l == u) { | |
// empty interval: there are no existing symbols to rank
161 | memset(tl, 0, 48); | |
162 | memset(tu, 0, 48); | |
163 | } else rope_rank2a(rope, l, u, tl, tu); | |
// count the symbols to insert in this group
164 | memset(c, 0, 48); | |
165 | for (i = beg; i < k; ++i) ++c[a[i].c]; | |
166 | // insert sentinel | |
167 | if (c[0]) rope_insert_run(rope, l, 0, c[0], &cache); | |
168 | // insert A/C/G/T | |
// x tracks the insertion offset past what has been placed or skipped so far
169 | x = l + c[0] + (tu[0] - tl[0]); | |
170 | if (is_comp) start = 4, end = 0, step = -1; | |
171 | else start = 1, end = 5, step = 1; | |
172 | for (b = start; b != end; b += step) { | |
173 | int64_t size = tu[b] - tl[b]; | |
174 | if (c[b]) { | |
175 | tl[b] = rope_insert_run(rope, x, b, c[b], &cache); | |
176 | tu[b] = tl[b] + size; | |
177 | } | |
178 | x += c[b] + size; | |
179 | } | |
180 | // insert N | |
181 | if (c[5]) { | |
182 | tu[5] -= tl[5]; | |
183 | tl[5] = rope_insert_run(rope, x, 5, c[5], &cache); | |
184 | tu[5] += tl[5]; | |
185 | } | |
186 | // update a[] | |
187 | for (i = beg; i < k; ++i) { | |
188 | triple64_t *p = &a[i]; | |
189 | p->l = tl[p->c], p->u = tu[p->c]; | |
190 | } | |
191 | beg = k; | |
192 | } | |
193 | } | |
194 | } | |
195 | ||
/*
 * Per-thread work descriptor shared between the master thread and one worker.
 * The master fills a/m, then flips to_run from 0 to 1; the worker flips it
 * back to 0, processes bucket b, and increments *n_fin_workers.
 */
196 | typedef struct { | |
// counter shared by all workers; points at a variable owned by the master
197 | volatile int *n_fin_workers; | |
// 1 = a batch is ready for this worker
198 | volatile int to_run; | |
// set by the master together with the last batch; the worker exits after it
199 | int to_exit; | |
200 | mrope_t *mr; | |
// b: index of the rope this worker inserts into
201 | int b, is_comp; | |
// m entries starting at a form the current batch
202 | int64_t m; | |
203 | triple64_t *a; | |
204 | } worker_t; | |
205 | ||
/*
 * Thread entry point: repeatedly wait for the master's signal, insert the
 * assigned batch into rope w->b, and report completion.
 *
 * Waiting polls to_run with a compare-and-swap (1 -> 0), sleeping 1 ms
 * (1000000 ns) between attempts.  The loop ends after the batch for which
 * the master has set to_exit.
 */
206 | static void *worker(void *data) | |
207 | { | |
208 | worker_t *w = (worker_t*)data; | |
209 | struct timespec req, rem; | |
210 | req.tv_sec = 0; req.tv_nsec = 1000000; | |
211 | do { | |
212 | while (!__sync_bool_compare_and_swap(&w->to_run, 1, 0)) nanosleep(&req, &rem); // wait for the signal from the master thread | |
// an empty batch (m == 0) is only acknowledged
213 | if (w->m) mr_insert_multi_aux(w->mr->r[w->b], w->m, w->a, w->is_comp); | |
214 | __sync_add_and_fetch(w->n_fin_workers, 1); | |
215 | } while (!w->to_exit); | |
216 | return 0; | |
217 | } | |
218 | ||
/*
 * Insert many strings at once.
 *
 * @param mr      multi-rope to insert into
 * @param len     total length of s; must be > 0 and s[len-1] must be 0
 *                (asserted)
 * @param s       concatenation of 0-terminated strings of symbol codes
 * @param is_thr  non-zero to process buckets 1..4 in four worker threads
 *                while the calling thread handles bucket 5
 *
 * The strings are processed column by column.  Each round the pending
 * entries are bucket-sorted by the symbol they just inserted (two arrays,
 * prev and curr, are swapped every round); entries whose symbol was the
 * sentinel are finished and counted in n0.  The next symbol of every
 * remaining entry is then inserted into the rope of its bucket, and the
 * resulting intervals are shifted by the symbol counts of the buckets ahead.
 *
 * Threading is switched off for the rest of the call once the number of
 * unfinished strings drops to mr->thr_min or below; the workers are told to
 * exit with that last batch.
 *
 * NOTE(review): the malloc() results are not checked, and the worker arrays
 * are allocated with alloca().
 */
219 | void mr_insert_multi(mrope_t *mr, int64_t len, const uint8_t *s, int is_thr) | |
220 | { | |
221 | int64_t k, m, n0; | |
222 | int b, is_srt = (mr->so != MR_SO_IO), is_comp = (mr->so == MR_SO_RCLO), stop_thr = 0; | |
223 | volatile int n_fin_workers = 0; | |
224 | triple64_t *a[2], *curr, *prev, *swap; | |
225 | pthread_t *tid = 0; | |
226 | worker_t *w = 0; | |
227 | ||
228 | if (mr->thr_min < 0) mr->thr_min = 0; | |
229 | assert(len > 0 && s[len-1] == 0); | |
230 | { // split into short strings | |
231 | cstr_t p, q, end = s + len; | |
232 | for (p = s, m = 0; p != end; ++p) // count #sentinels | |
233 | if (*p == 0) ++m; | |
234 | curr = a[0] = malloc(m * sizeof(triple64_t)); | |
235 | prev = a[1] = malloc(m * sizeof(triple64_t)); | |
236 | for (p = q = s, k = 0; p != end; ++p) // find the start of each string | |
237 | if (*p == 0) prev[k++].p = q, q = p + 1; | |
238 | } | |
239 | ||
// n0: number of sentinels already stored in the index
240 | for (k = n0 = 0; k < 6; ++k) n0 += mr->r[k]->c[0]; | |
// initial interval: everything (sorted modes) or one slot at the end (input order)
241 | for (k = 0; k != m; ++k) { | |
242 | if (is_srt) prev[k].l = 0, prev[k].u = n0; | |
243 | else prev[k].l = prev[k].u = n0 + k; | |
244 | prev[k].c = 0; | |
245 | } | |
246 | mr_insert_multi_aux(mr->r[0], m, prev, is_comp); // insert the first (actually the last) column | |
247 | ||
// start four workers, one for each of the buckets 1..4
248 | if (is_thr) { | |
249 | tid = alloca(4 * sizeof(pthread_t)); | |
250 | w = alloca(4 * sizeof(worker_t)); | |
251 | memset(w, 0, 4 * sizeof(worker_t)); | |
252 | for (b = 0; b < 4; ++b) { | |
253 | w[b].mr = mr, w[b].b = b + 1, w[b].is_comp = is_comp; | |
254 | w[b].n_fin_workers = &n_fin_workers; | |
255 | } | |
256 | for (b = 0; b < 4; ++b) pthread_create(&tid[b], 0, worker, &w[b]); | |
257 | } | |
258 | ||
259 | n0 = 0; // the number of inserted strings | |
260 | while (m) { | |
261 | int64_t c[6], ac[6]; | |
262 | triple64_t *q[6]; | |
263 | ||
// counting sort of the unfinished entries by their last inserted symbol
264 | memset(c, 0, 48); | |
265 | for (k = n0; k != m; ++k) ++c[prev[k].c]; // counting | |
266 | for (q[0] = curr + n0, b = 1; b < 6; ++b) q[b] = q[b-1] + c[b-1]; | |
267 | if (n0 + c[0] < m) { | |
268 | for (k = n0; k != m; ++k) *q[prev[k].c]++ = prev[k]; // sort | |
269 | for (b = 0; b < 6; ++b) q[b] -= c[b]; | |
270 | } | |
// entries that just inserted their sentinel are done
271 | n0 += c[0]; | |
272 | ||
273 | if (is_thr && !stop_thr) { | |
274 | struct timespec req, rem; | |
275 | req.tv_sec = 0; req.tv_nsec = 1000000; | |
276 | stop_thr = (m - n0 <= mr->thr_min); | |
277 | for (b = 0; b < 4; ++b) { | |
278 | w[b].a = q[b+1], w[b].m = c[b+1]; | |
279 | if (stop_thr) w[b].to_exit = 1; // signal the workers to exit | |
280 | while (!__sync_bool_compare_and_swap(&w[b].to_run, 0, 1)); // signal the workers to start | |
281 | } | |
282 | if (c[5]) mr_insert_multi_aux(mr->r[5], c[5], q[5], is_comp); // the master thread processes the "N" bucket | |
283 | while (!__sync_bool_compare_and_swap(&n_fin_workers, 4, 0)) // wait until all 4 workers finish | |
284 | nanosleep(&req, &rem); | |
285 | if (stop_thr && n0 < m) | |
286 | fprintf(stderr, "[M::%s] Turn off parallelization for this batch as too few strings are left.\n", __func__); | |
287 | } else { | |
// single-threaded path: handle buckets 1..5 in turn
288 | for (b = 1; b < 6; ++b) | |
289 | if (c[b]) mr_insert_multi_aux(mr->r[b], c[b], q[b], is_comp); | |
290 | } | |
291 | if (n0 == m) break; | |
292 | ||
293 | memset(ac, 0, 48); | |
294 | for (b = 1; b < 6; ++b) { // update the intervals to account for buckets ahead | |
295 | int a; | |
296 | for (a = 0; a < 6; ++a) ac[a] += mr->r[b-1]->c[a]; | |
297 | for (k = 0; k < c[b]; ++k) { | |
298 | triple64_t *p = &q[b][k]; | |
299 | p->l += ac[p->c]; p->u += ac[p->c]; | |
300 | } | |
301 | } | |
302 | swap = curr, curr = prev, prev = swap; | |
303 | } | |
304 | if (is_thr) for (b = 0; b < 4; ++b) pthread_join(tid[b], 0); | |
305 | free(a[0]); free(a[1]); | |
306 | } | |
0 | #ifndef MROPE_H_ | |
1 | #define MROPE_H_ | |
2 | ||
3 | #include "rope.h" | |
4 | ||
5 | #define MR_SO_IO 0 | |
6 | #define MR_SO_RLO 1 | |
7 | #define MR_SO_RCLO 2 | |
8 | ||
/* A multi-rope: six ropes, one bucket per symbol code 0..5. */
9 | typedef struct { | |
10 | uint8_t so; // sorting order | |
11 | int thr_min; // when there are fewer sequences than this, disable multi-threading | |
12 | rope_t *r[6]; | |
13 | } mrope_t; // multi-rope | |
14 | ||
/* Iterator over the blocks of a multi-rope. */
15 | typedef struct { | |
16 | mrope_t *r; | |
// a: index of the bucket being iterated; to_free: destroy each bucket once exhausted
17 | int a, to_free; | |
// iterator over the current bucket's rope
18 | rpitr_t i; | |
19 | } mritr_t; | |
20 | ||
21 | #ifdef __cplusplus | |
22 | extern "C" { | |
23 | #endif | |
24 | ||
25 | /** | |
26 | * Initiate a multi-rope | |
27 | * | |
28 | * @param max_nodes maximum number of nodes in an internal node; use ROPE_DEF_MAX_NODES (64) if unsure | |
29 | * @param block_len maximum block length in an external node; use ROPE_DEF_BLOCK_LEN (256) if unsure | |
30 | * @param sorting_order the order in which sequences are added; possible values defined by the MR_SO_* macros | |
31 | */ | |
32 | mrope_t *mr_init(int max_nodes, int block_len, int sorting_order); | |
33 | ||
34 | void mr_destroy(mrope_t *r); | |
35 | ||
36 | int mr_thr_min(mrope_t *r, int thr_min); | |
37 | ||
38 | /** | |
39 | * Insert one string into the index | |
40 | * | |
41 | * @param r multi-rope | |
42 | * @param str the *reverse* of the input string (important: it is reversed!) | |
43 | */ | |
44 | int64_t mr_insert1(mrope_t *r, const uint8_t *str); | |
45 | ||
46 | /** | |
47 | * Insert multiple strings | |
48 | * | |
49 | * @param mr multi-rope | |
50 | * @param len total length of $s | |
51 | * @param s concatenated, NUL-terminated (each string ends with a 0 byte), reversed input strings | |
52 | * @param is_thr true to use 5 threads | |
53 | */ | |
54 | void mr_insert_multi(mrope_t *mr, int64_t len, const uint8_t *s, int is_thr); | |
55 | ||
56 | void mr_rank2a(const mrope_t *mr, int64_t x, int64_t y, int64_t *cx, int64_t *cy); | |
57 | #define mr_rank1a(mr, x, cx) mr_rank2a(mr, x, -1, cx, 0) | |
58 | ||
59 | /** | |
60 | * Put the iterator at the start of the index | |
61 | * | |
62 | * @param r multi-rope | |
63 | * @param i iterator to be initialized | |
64 | * @param to_free if true, free visited buckets | |
65 | */ | |
66 | void mr_itr_first(mrope_t *r, mritr_t *i, int to_free); | |
67 | ||
68 | /** | |
69 | * Iterate to the next block | |
70 | * | |
71 | * @param i iterator | |
72 | * | |
73 | * @return pointer to the start of a block; see rle.h for decoding the block | |
74 | */ | |
75 | const uint8_t *mr_itr_next_block(mritr_t *i); | |
76 | ||
77 | #ifdef __cplusplus | |
78 | } | |
79 | #endif | |
80 | ||
81 | static inline int64_t mr_get_c(const mrope_t *mr, int64_t c[6]) | |
82 | { | |
83 | int a, b; | |
84 | int64_t tot = 0; | |
85 | for (a = 0; a < 6; ++a) c[a] = 0; | |
86 | for (a = 0; a < 6; ++a) { | |
87 | for (b = 0; b < 6; ++b) | |
88 | c[b] += mr->r[a]->c[b]; | |
89 | tot += c[b]; | |
90 | } | |
91 | return tot; | |
92 | } | |
93 | ||
94 | static inline int64_t mr_get_ac(const mrope_t *mr, int64_t ac[7]) | |
95 | { | |
96 | int a; | |
97 | int64_t c[6], tot; | |
98 | tot = mr_get_c(mr, c); | |
99 | for (a = 1, ac[0] = 0; a <= 6; ++a) ac[a] = ac[a-1] + c[a-1]; | |
100 | return tot; | |
101 | } | |
102 | ||
103 | static inline int64_t mr_get_tot(const mrope_t *mr) | |
104 | { | |
105 | int a, b; | |
106 | int64_t tot = 0; | |
107 | for (a = 0; a < 6; ++a) | |
108 | for (b = 0; b < 6; ++b) | |
109 | tot += mr->r[a]->c[b]; | |
110 | return tot; | |
111 | } | |
112 | ||
113 | #endif |
0 | #include <stdlib.h> | |
1 | #include <stdint.h> | |
2 | #include <stdio.h> | |
3 | #include <string.h> | |
4 | #include <assert.h> | |
5 | #include <unistd.h> | |
6 | #include <fcntl.h> | |
7 | #include <sys/mman.h> | |
8 | #include "rld0.h" | |
9 | ||
10 | #define RLD_IBITS_PLUS 4 | |
11 | ||
12 | #define rld_file_size(e) ((4 + (e)->asize) * 8 + (e)->n_bytes + 8 * (e)->n_frames * ((e)->asize + 1)) | |
13 | ||
14 | #ifndef xcalloc | |
15 | #define xcalloc(n, s) calloc(n, s) | |
16 | #endif | |
17 | #ifndef xmalloc | |
18 | #define xmalloc(s) malloc(s) | |
19 | #endif | |
20 | ||
21 | /****************** | |
22 | * Delta encoding * | |
23 | ******************/ | |
24 | ||
/*
 * floor(log2(i)) for i = 1..255; entry 0 is -1.
 *
 * The element type is signed char rather than plain char: the signedness of
 * plain char is implementation-defined, and where it is unsigned the -1
 * would be stored as 255.
 */
static const signed char LogTable256[256] = {
#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
	-1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
	LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
	LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
};

/* Position of the highest set bit of a 32-bit value (floor(log2(v))); -1 for v == 0. */
static inline int ilog2_32(uint32_t v)
{
	uint32_t t, tt;
	if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt];
	return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
}

/* Position of the highest set bit of a 64-bit value; -1 for v == 0. */
static inline int ilog2(uint64_t v)
{
	return v>>32? 32 + ilog2_32(v>>32) : ilog2_32(v);
}

/*
 * Build the code word for x and report its width in bits.
 *
 * With y = floor(log2(x)) and z = floor(log2(y + 1)), the width is
 * 2*z + 1 + y bits.  The returned word is x with its top bit cleared,
 * OR-ed with (y + 1) shifted left by y.
 *
 * @param x      value to encode; must be non-zero (x == 0 gives y == -1 and
 *               an invalid shift)
 * @param width  receives the number of bits of the code word
 */
static inline int64_t rld_delta_enc1(uint64_t x, int *width)
{
	int y = ilog2(x);
	int z = ilog2_32(y + 1);
	*width = (z<<1) + 1 + y;
	return (x ^ (uint64_t)1<<y) | (uint64_t)(y+1)<<y;
}
51 | ||
52 | /*********************************** | |
53 | * Initialization and deallocation * | |
54 | ***********************************/ | |
55 | ||
56 | rld_t *rld_init(int asize, int bbits) | |
57 | { | |
58 | rld_t *e; | |
59 | e = xcalloc(1, sizeof(rld_t)); | |
60 | e->n = 1; | |
61 | e->z = xmalloc(sizeof(void*)); | |
62 | e->z[0] = xcalloc(RLD_LSIZE, 8); | |
63 | e->ssize = 1<<bbits; | |
64 | e->cnt = xcalloc(asize + 1, 8); | |
65 | e->mcnt = xcalloc(asize + 1, 8); | |
66 | e->abits = ilog2(asize) + 1; | |
67 | e->asize = asize; | |
68 | e->sbits = bbits; | |
69 | e->asize1 = asize + 1; | |
70 | e->offset0[0] = (e->asize1*16+63)/64; | |
71 | e->offset0[1] = (e->asize1*32+63)/64; | |
72 | e->offset0[2] = e->asize1; | |
73 | return e; | |
74 | } | |
75 | ||
76 | void rld_destroy(rld_t *e) | |
77 | { | |
78 | int i = 0; | |
79 | if (e == 0) return; | |
80 | if (e->mem) { | |
81 | close(e->fd); | |
82 | munmap(e->mem, rld_file_size(e)); | |
83 | } else { | |
84 | for (i = 0; i < e->n; ++i) free(e->z[i]); | |
85 | free(e->frame); | |
86 | } | |
87 | free(e->z); free(e->cnt); free(e->mcnt); free(e); | |
88 | } | |
89 | ||
90 | void rld_itr_init(const rld_t *e, rlditr_t *itr, uint64_t k) | |
91 | { | |
92 | itr->i = e->z + (k >> RLD_LBITS); | |
93 | itr->shead = *itr->i + k%RLD_LSIZE; | |
94 | itr->stail = rld_get_stail(e, itr); | |
95 | itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)]; | |
96 | itr->q = (uint8_t*)itr->p; | |
97 | itr->r = 64; | |
98 | itr->c = -1; | |
99 | itr->l = 0; | |
100 | } | |
101 | ||
102 | /************ | |
103 | * Encoding * | |
104 | ************/ | |
105 | ||
106 | static inline void enc_next_block(rld_t *e, rlditr_t *itr) | |
107 | { | |
108 | int i, type; | |
109 | if (itr->stail + 2 - *itr->i == RLD_LSIZE) { | |
110 | ++e->n; | |
111 | e->z = realloc(e->z, e->n * sizeof(void*)); | |
112 | itr->i = e->z + e->n - 1; | |
113 | itr->shead = *itr->i = xcalloc(RLD_LSIZE, 8); | |
114 | } else itr->shead += e->ssize; | |
115 | if (e->cnt[0] - e->mcnt[0] < 0x4000) { | |
116 | uint16_t *p = (uint16_t*)itr->shead; | |
117 | for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i]; | |
118 | type = 0; | |
119 | } else if (e->cnt[0] - e->mcnt[0] < 0x40000000) { | |
120 | uint32_t *p = (uint32_t*)itr->shead; | |
121 | for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i]; | |
122 | type = 1; | |
123 | } else { | |
124 | uint64_t *p = (uint64_t*)itr->shead; | |
125 | for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i]; | |
126 | type = 2; | |
127 | } | |
128 | *itr->shead |= (uint64_t)type<<62; | |
129 | itr->p = itr->shead + e->offset0[type]; | |
130 | itr->stail = rld_get_stail(e, itr); | |
131 | itr->q = (uint8_t*)itr->p; | |
132 | itr->r = 64; | |
133 | for (i = 0; i <= e->asize; ++i) e->mcnt[i] = e->cnt[i]; | |
134 | } | |
135 | ||
136 | static int rld_enc1(rld_t *e, rlditr_t *itr, int64_t l, uint8_t c) | |
137 | { | |
138 | int w; | |
139 | uint64_t x = rld_delta_enc1(l, &w) << e->abits | c; | |
140 | w += e->abits; | |
141 | if (w >= itr->r && itr->p == itr->stail) enc_next_block(e, itr); | |
142 | if (w > itr->r) { | |
143 | w -= itr->r; | |
144 | *itr->p++ |= x >> w; | |
145 | *itr->p = x << (itr->r = 64 - w); | |
146 | } else itr->r -= w, *itr->p |= x << itr->r; | |
147 | e->cnt[0] += l; | |
148 | e->cnt[c + 1] += l; | |
149 | return 0; | |
150 | } | |
151 | ||
152 | int rld_enc(rld_t *e, rlditr_t *itr, int64_t l, uint8_t c) | |
153 | { | |
154 | if (l == 0) return 0; | |
155 | if (itr->c != c) { | |
156 | if (itr->l) rld_enc1(e, itr, itr->l, itr->c); | |
157 | itr->l = l; itr->c = c; | |
158 | } else itr->l += l; | |
159 | return 0; | |
160 | } | |
161 | ||
162 | void rld_rank_index(rld_t *e) | |
163 | { | |
164 | uint64_t last, n_blks, i, k, *cnt; | |
165 | int j; | |
166 | ||
167 | n_blks = e->n_bytes * 8 / 64 / e->ssize + 1; | |
168 | last = rld_last_blk(e); | |
169 | cnt = alloca(e->asize * 8); | |
170 | e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS; | |
171 | e->n_frames = ((e->mcnt[0] + (1ll<<e->ibits) - 1) >> e->ibits) + 1; | |
172 | e->frame = xcalloc(e->n_frames * e->asize1, 8); | |
173 | e->frame[0] = 0; | |
174 | for (j = 0; j < e->asize; ++j) cnt[j] = 0; | |
175 | for (i = e->ssize, k = 1; i <= last; i += e->ssize) { | |
176 | uint64_t sum, *p = rld_seek_blk(e, i); | |
177 | int type = rld_block_type(*p); | |
178 | if (type == 0) { | |
179 | uint16_t *q = (uint16_t*)p; | |
180 | for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j]; | |
181 | } else if (type == 1) { | |
182 | uint32_t *q = (uint32_t*)p; | |
183 | for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j] & 0x3fffffff; | |
184 | } else { | |
185 | uint64_t *q = (uint64_t*)p; | |
186 | for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j]; | |
187 | } | |
188 | for (j = 0, sum = 0; j < e->asize; ++j) sum += cnt[j]; | |
189 | while (sum >= k<<e->ibits) ++k; | |
190 | if (k < e->n_frames) { | |
191 | uint64_t x = k * e->asize1; | |
192 | e->frame[x] = i; | |
193 | for (j = 0; j < e->asize; ++j) e->frame[x + j + 1] = cnt[j]; | |
194 | } | |
195 | } | |
196 | assert(k >= e->n_frames - 1); | |
197 | for (k = 1; k < e->n_frames; ++k) { // fill zero cells | |
198 | uint64_t x = k * e->asize1; | |
199 | if (e->frame[x] == 0) { | |
200 | for (j = 0; j <= e->asize; ++j) | |
201 | e->frame[x + j] = e->frame[x - e->asize1 + j]; | |
202 | } | |
203 | } | |
204 | } | |
205 | ||
206 | uint64_t rld_enc_finish(rld_t *e, rlditr_t *itr) | |
207 | { | |
208 | int i; | |
209 | if (itr->l) rld_enc1(e, itr, itr->l, itr->c); | |
210 | enc_next_block(e, itr); | |
211 | e->n_bytes = (((uint64_t)(e->n - 1) * RLD_LSIZE) + (itr->p - *itr->i)) * 8; | |
212 | // recompute e->cnt as the accumulative count; e->mcnt[] keeps the marginal counts | |
213 | for (e->cnt[0] = 0, i = 1; i <= e->asize; ++i) e->cnt[i] += e->cnt[i - 1]; | |
214 | rld_rank_index(e); | |
215 | return e->n_bytes; | |
216 | } | |
217 | ||
218 | /***************** | |
219 | * Save and load * | |
220 | *****************/ | |
221 | ||
222 | int rld_dump(const rld_t *e, const char *fn) | |
223 | { | |
224 | uint64_t k = 0; | |
225 | int i; | |
226 | uint32_t a; | |
227 | FILE *fp; | |
228 | fp = strcmp(fn, "-")? fopen(fn, "wb") : fdopen(fileno(stdout), "wb"); | |
229 | if (fp == 0) return -1; | |
230 | a = e->asize<<16 | e->sbits; | |
231 | fwrite("RLD\3", 1, 4, fp); // write magic | |
232 | fwrite(&a, 4, 1, fp); // write sbits and asize | |
233 | fwrite(&k, 8, 1, fp); // preserve 8 bytes for future uses | |
234 | fwrite(&e->n_bytes, 8, 1, fp); // n_bytes can always be divided by 8 | |
235 | fwrite(&e->n_frames, 8, 1, fp); // number of frames | |
236 | fwrite(e->mcnt + 1, 8, e->asize, fp); // write the marginal counts | |
237 | for (i = 0, k = e->n_bytes / 8; i < e->n - 1; ++i, k -= RLD_LSIZE) | |
238 | fwrite(e->z[i], 8, RLD_LSIZE, fp); | |
239 | fwrite(e->z[i], 8, k, fp); | |
240 | fwrite(e->frame, 8 * e->asize1, e->n_frames, fp); | |
241 | fclose(fp); | |
242 | return 0; | |
243 | } | |
244 | ||
245 | static rld_t *rld_restore_header(const char *fn, FILE **_fp) | |
246 | { | |
247 | FILE *fp; | |
248 | rld_t *e; | |
249 | char magic[4]; | |
250 | uint64_t a[3]; | |
251 | int32_t i, x; | |
252 | ||
253 | if (strcmp(fn, "-") == 0) *_fp = fp = stdin; | |
254 | else if ((*_fp = fp = fopen(fn, "rb")) == 0) return 0; | |
255 | fread(magic, 1, 4, fp); | |
256 | if (strncmp(magic, "RLD\3", 4)) return 0; | |
257 | fread(&x, 4, 1, fp); | |
258 | e = rld_init(x>>16, x&0xffff); | |
259 | fread(a, 8, 3, fp); | |
260 | e->n_bytes = a[1]; e->n_frames = a[2]; | |
261 | fread(e->mcnt + 1, 8, e->asize, fp); | |
262 | for (i = 0; i <= e->asize; ++i) e->cnt[i] = e->mcnt[i]; | |
263 | for (i = 1; i <= e->asize; ++i) e->cnt[i] += e->cnt[i - 1]; | |
264 | e->mcnt[0] = e->cnt[e->asize]; | |
265 | return e; | |
266 | } | |
267 | ||
268 | rld_t *rld_restore(const char *fn) | |
269 | { | |
270 | FILE *fp; | |
271 | rld_t *e; | |
272 | uint64_t k, n_blks; | |
273 | int32_t i; | |
274 | ||
275 | if ((e = rld_restore_header(fn, &fp)) == 0) { // then load as plain DNA rle | |
276 | uint8_t *buf; | |
277 | int l; | |
278 | rlditr_t itr; | |
279 | buf = malloc(0x10000); | |
280 | e = rld_init(6, 3); | |
281 | rld_itr_init(e, &itr, 0); | |
282 | while ((l = fread(buf, 1, 0x10000, fp)) != 0) | |
283 | for (i = 0; i < l; ++i) | |
284 | if (buf[i]>>3) rld_enc(e, &itr, buf[i]>>3, buf[i]&7); | |
285 | fclose(fp); | |
286 | free(buf); | |
287 | rld_enc_finish(e, &itr); | |
288 | return e; | |
289 | } | |
290 | if (e->n_bytes / 8 > RLD_LSIZE) { // allocate enough memory | |
291 | e->n = (e->n_bytes / 8 + RLD_LSIZE - 1) / RLD_LSIZE; | |
292 | e->z = realloc(e->z, e->n * sizeof(void*)); | |
293 | for (i = 1; i < e->n; ++i) | |
294 | e->z[i] = xcalloc(RLD_LSIZE, 8); | |
295 | } | |
296 | for (i = 0, k = e->n_bytes / 8; i < e->n - 1; ++i, k -= RLD_LSIZE) | |
297 | fread(e->z[i], 8, RLD_LSIZE, fp); | |
298 | fread(e->z[i], 8, k, fp); | |
299 | e->frame = xmalloc(e->n_frames * e->asize1 * 8); | |
300 | fread(e->frame, 8 * e->asize1, e->n_frames, fp); | |
301 | fclose(fp); | |
302 | n_blks = e->n_bytes * 8 / 64 / e->ssize + 1; | |
303 | e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS; | |
304 | return e; | |
305 | } | |
306 | ||
307 | rld_t *rld_restore_mmap(const char *fn) | |
308 | { | |
309 | FILE *fp; | |
310 | rld_t *e; | |
311 | int i; | |
312 | int64_t n_blks; | |
313 | ||
314 | e = rld_restore_header(fn, &fp); | |
315 | fclose(fp); | |
316 | free(e->z[0]); free(e->z); | |
317 | e->n = (e->n_bytes / 8 + RLD_LSIZE - 1) / RLD_LSIZE; | |
318 | e->z = xcalloc(e->n, sizeof(void*)); | |
319 | e->fd = open(fn, O_RDONLY); | |
320 | e->mem = (uint64_t*)mmap(0, rld_file_size(e), PROT_READ, MAP_PRIVATE, e->fd, 0); | |
321 | for (i = 0; i < e->n; ++i) e->z[i] = e->mem + (4 + e->asize) + (size_t)i * RLD_LSIZE; | |
322 | e->frame = e->mem + (4 + e->asize) + e->n_bytes/8; | |
323 | n_blks = e->n_bytes * 8 / 64 / e->ssize + 1; | |
324 | e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS; | |
325 | return e; | |
326 | } | |
327 | ||
328 | /****************** | |
329 | * Computing rank * | |
330 | ******************/ | |
331 | ||
#ifdef _DNA_ONLY
/*
 * Decode one run assuming 3-bit symbols; stores the symbol in *c and returns
 * the run length. It always reads itr->p[1] when the current word is partly
 * consumed and performs no block-boundary check.
 */
static inline int64_t rld_dec0_fast_dna(const rld_t *e, rlditr_t *itr, int *c)
{ // This is NOT a replacement of rld_dec0(). It does not do boundary check.
	uint64_t x = itr->r == 64? itr->p[0] : itr->p[0] << (64 - itr->r) | itr->p[1] >> itr->r;
	if (x>>63 == 0) {
		int64_t y;
		// w: width of the length-of-length prefix, looked up from the top 5 bits
		int l, w = 0x333333335555779bll>>(x>>59<<2)&0xf;
		l = (x >> (64 - w)) - 1;
		// the implicit leading one must be shifted as a 64-bit value: l may reach 32 or more
		y = x << w >> (64 - l) | (uint64_t)1 << l;
		w += l;
		*c = x << w >> 61;
		w += 3;
		itr->r -= w;
		if (itr->r <= 0) ++itr->p, itr->r += 64;
		return y;
	} else { // a leading 1 bit encodes a run of length one
		*c = x << 1 >> 61;
		itr->r -= 4;
		if (itr->r <= 0) ++itr->p, itr->r += 64;
		return 1;
	}
}
#endif
355 | ||
356 | static inline uint64_t rld_locate_blk(const rld_t *e, rlditr_t *itr, uint64_t k, uint64_t *cnt, uint64_t *sum) | |
357 | { | |
358 | int j; | |
359 | uint64_t c = 0, *q, *z = e->frame + (k>>e->ibits) * e->asize1; | |
360 | itr->i = e->z + (*z>>RLD_LBITS); | |
361 | q = itr->p = *itr->i + (*z&RLD_LMASK); | |
362 | for (j = 1, *sum = 0; j < e->asize1; ++j) *sum += (cnt[j-1] = z[j]); | |
363 | while (1) { // seek to the small block | |
364 | int type; | |
365 | q += e->ssize; | |
366 | if (q - *itr->i == RLD_LSIZE) q = *++itr->i; | |
367 | type = rld_block_type(*q); | |
368 | c = type == 2? *q&0x3fffffffffffffffULL : type == 1? *(uint32_t*)q : *(uint16_t*)q; | |
369 | if (*sum + c > k) break; | |
370 | if (type == 0) { | |
371 | uint16_t *p = (uint16_t*)q + 1; | |
372 | #ifdef _DNA_ONLY | |
373 | cnt[0] += p[0]; cnt[1] += p[1]; cnt[2] += p[2]; cnt[3] += p[3]; cnt[4] += p[4]; cnt[5] += p[5]; | |
374 | #else | |
375 | for (j = 0; j < e->asize; ++j) cnt[j] += p[j]; | |
376 | #endif | |
377 | } else if (type == 1) { | |
378 | uint32_t *p = (uint32_t*)q + 1; | |
379 | for (j = 0; j < e->asize; ++j) cnt[j] += p[j] & 0x3fffffff; | |
380 | } else { | |
381 | uint64_t *p = (uint64_t*)q + 1; | |
382 | for (j = 0; j < e->asize; ++j) cnt[j] += p[j]; | |
383 | } | |
384 | *sum += c; | |
385 | itr->p = q; | |
386 | } | |
387 | itr->shead = itr->p; | |
388 | itr->stail = rld_get_stail(e, itr); | |
389 | itr->p += e->offset0[rld_block_type(*itr->shead)]; | |
390 | itr->q = (uint8_t*)itr->p; | |
391 | itr->r = 64; | |
392 | return c + *sum; | |
393 | } | |
394 | ||
395 | void rld_rank21(const rld_t *e, uint64_t k, uint64_t l, int c, uint64_t *ok, uint64_t *ol) // FIXME: can be faster | |
396 | { | |
397 | *ok = rld_rank11(e, k, c); | |
398 | *ol = rld_rank11(e, l, c); | |
399 | } | |
400 | ||
401 | int rld_rank1a(const rld_t *e, uint64_t k, uint64_t *ok) | |
402 | { | |
403 | uint64_t z, l; | |
404 | int a = -1; | |
405 | rlditr_t itr; | |
406 | if (k == 0) { | |
407 | for (a = 0; a < e->asize; ++a) ok[a] = 0; | |
408 | return -1; | |
409 | } | |
410 | rld_locate_blk(e, &itr, k-1, ok, &z); | |
411 | while (1) { | |
412 | #ifdef _DNA_ONLY | |
413 | l = rld_dec0_fast_dna(e, &itr, &a); | |
414 | #else | |
415 | l = rld_dec0(e, &itr, &a); | |
416 | #endif | |
417 | if (z + l >= k) break; | |
418 | z += l; ok[a] += l; | |
419 | } | |
420 | ok[a] += k - z; | |
421 | return a; | |
422 | } | |
423 | ||
424 | uint64_t rld_rank11(const rld_t *e, uint64_t k, int c) | |
425 | { | |
426 | uint64_t *ok; | |
427 | if (k == (uint64_t)-1) return 0; | |
428 | ok = alloca(e->asize1 * 8); | |
429 | rld_rank1a(e, k, ok); | |
430 | return ok[c]; | |
431 | } | |
432 | ||
433 | void rld_rank2a(const rld_t *e, uint64_t k, uint64_t l, uint64_t *ok, uint64_t *ol) | |
434 | { | |
435 | uint64_t z, y, len; | |
436 | rlditr_t itr; | |
437 | int a = -1; | |
438 | if (k == 0) { | |
439 | for (a = 0; a < e->asize; ++a) ok[a] = 0; | |
440 | rld_rank1a(e, l, ol); | |
441 | return; | |
442 | } | |
443 | y = rld_locate_blk(e, &itr, k-1, ok, &z); // locate the block bracketing k | |
444 | while (1) { // compute ok[] | |
445 | #ifdef _DNA_ONLY | |
446 | len = rld_dec0_fast_dna(e, &itr, &a); | |
447 | #else | |
448 | len = rld_dec0(e, &itr, &a); | |
449 | #endif | |
450 | if (z + len >= k) break; | |
451 | z += len; ok[a] += len; | |
452 | } | |
453 | if (y > l) { // we do not need to decode other blocks | |
454 | int b; | |
455 | for (b = 0; b < e->asize; ++b) ol[b] = ok[b]; // copy ok[] to ol[] | |
456 | ok[a] += k - z; // finalize ok[a] | |
457 | if (z + len < l) { // we need to decode the next run | |
458 | z += len; ol[a] += len; | |
459 | while (1) { | |
460 | len = rld_dec0(e, &itr, &a); | |
461 | if (z + len >= l) break; | |
462 | z += len; ol[a] += len; | |
463 | } | |
464 | } | |
465 | ol[a] += l - z; | |
466 | } else { // we have to decode other blocks | |
467 | ok[a] += k - z; | |
468 | rld_rank1a(e, l, ol); | |
469 | } | |
470 | } | |
471 | ||
472 | int rld_extend(const rld_t *e, const rldintv_t *ik, rldintv_t ok[6], int is_back) | |
473 | { // TODO: this can be accelerated a little by using rld_rank1a() when ik.x[2]==1 | |
474 | uint64_t tk[6], tl[6]; | |
475 | int i; | |
476 | rld_rank2a(e, ik->x[!is_back], ik->x[!is_back] + ik->x[2], tk, tl); | |
477 | for (i = 0; i < 6; ++i) { | |
478 | ok[i].x[!is_back] = e->cnt[i] + tk[i]; | |
479 | ok[i].x[2] = (tl[i] -= tk[i]); | |
480 | } | |
481 | ok[0].x[is_back] = ik->x[is_back]; | |
482 | ok[4].x[is_back] = ok[0].x[is_back] + tl[0]; | |
483 | ok[3].x[is_back] = ok[4].x[is_back] + tl[4]; | |
484 | ok[2].x[is_back] = ok[3].x[is_back] + tl[3]; | |
485 | ok[1].x[is_back] = ok[2].x[is_back] + tl[2]; | |
486 | ok[5].x[is_back] = ok[1].x[is_back] + tl[1]; | |
487 | return 0; | |
488 | } |
0 | #ifndef RLDELTA0_H | |
1 | #define RLDELTA0_H | |
2 | ||
3 | #define _DNA_ONLY | |
4 | ||
5 | #include <stdint.h> | |
6 | #include <stdlib.h> | |
7 | #include <assert.h> | |
8 | #include <stdio.h> | |
9 | ||
/* Large blocks hold 2^RLD_LBITS 64-bit words each. */
#define RLD_LBITS 23
#define RLD_LSIZE (1<<RLD_LBITS)   /* words per large block */
#define RLD_LMASK (RLD_LSIZE - 1)  /* mask extracting the offset inside a large block */
13 | ||
/* Cursor over the encoded stream, used for both encoding and decoding. */
typedef struct {
	int r;           // bits remaining in the current 64-bit word
	int c;           // pending symbol (-1 when none)
	int64_t l;       // pending run length
	uint64_t *p;     // current word
	uint64_t *shead; // first word (header) of the current small block
	uint64_t *stail; // last usable word of the current small block, see rld_get_stail()
	uint64_t **i;    // slot in rld_t::z of the current large block
	uint8_t *q;      // byte view of p at the time the block was entered
} rlditr_t;
20 | ||
/* Run-length-delta encoded sequence together with its rank index. */
typedef struct rld_t {
	// initialized in the constructor
	uint8_t asize;     // alphabet size
	uint8_t asize1;    // asize + 1
	int8_t abits;      // bits required to store a symbol
	int8_t sbits;      // log2 of the small-block size (in 64-bit words)
	int8_t ibits;      // frame sampling shift; modified during indexing, kept here for a better alignment
	int8_t offset0[3]; // header size in 64-bit words: [0] 16-bit, [1] 32-bit, [2] 64-bit counters
	int ssize;         // ssize = 1<<sbits
	// modified during encoding
	int n;             // number of large blocks in z (unchanged in decoding)
	uint64_t n_bytes;  // size of the encoded data in bytes, always a multiple of 8 (unchanged in decoding)
	uint64_t **z;      // the actual data, one pointer per large block (unchanged in decoding)
	uint64_t *cnt;     // after enc_finish: accumulative counts
	uint64_t *mcnt;    // after enc_finish: marginal counts
	// modified during indexing
	uint64_t n_frames; // number of frames in the rank index
	uint64_t *frame;   // n_frames * asize1 cells
	// only used for memory mapped files
	int fd;
	uint64_t *mem;
} rld_t;
41 | ||
/* Suffix-array interval pair used by rld_extend(). */
typedef struct {
	// x[0]: start of the interval, backward; x[1]: start, forward; x[2]: size of the interval
	uint64_t x[3];
	uint64_t info;
} rldintv_t;
46 | ||
47 | #ifdef __cplusplus | |
48 | extern "C" { | |
49 | #endif | |
50 | ||
51 | rld_t *rld_init(int asize, int bbits); | |
52 | void rld_destroy(rld_t *e); | |
53 | int rld_dump(const rld_t *e, const char *fn); | |
54 | rld_t *rld_restore(const char *fn); | |
55 | rld_t *rld_restore_mmap(const char *fn); | |
56 | ||
57 | void rld_itr_init(const rld_t *e, rlditr_t *itr, uint64_t k); | |
58 | int rld_enc(rld_t *e, rlditr_t *itr, int64_t l, uint8_t c); | |
59 | uint64_t rld_enc_finish(rld_t *e, rlditr_t *itr); | |
60 | ||
61 | uint64_t rld_rank11(const rld_t *e, uint64_t k, int c); | |
62 | int rld_rank1a(const rld_t *e, uint64_t k, uint64_t *ok); | |
63 | void rld_rank21(const rld_t *e, uint64_t k, uint64_t l, int c, uint64_t *ok, uint64_t *ol); | |
64 | void rld_rank2a(const rld_t *e, uint64_t k, uint64_t l, uint64_t *ok, uint64_t *ol); | |
65 | ||
66 | int rld_extend(const rld_t *e, const rldintv_t *ik, rldintv_t ok[6], int is_back); | |
67 | ||
68 | #ifdef __cplusplus | |
69 | } | |
70 | #endif | |
71 | ||
/* Word offset of the last small block: the data size in words rounded down to a small-block boundary. */
#define rld_last_blk(e) ((e)->n_bytes>>3>>(e)->sbits<<(e)->sbits)
/* Address of the word at global word offset k. */
#define rld_seek_blk(e, k) ((e)->z[(k)>>RLD_LBITS] + ((k)&RLD_LMASK))
/* Last usable word of the iterator's small block; one word earlier for the final small block of a large block. */
#define rld_get_stail(e, itr) ((itr)->shead + (e)->ssize - ((itr)->shead + (e)->ssize - *(itr)->i == RLD_LSIZE? 2 : 1))

/* Block type stored in the top two bits of a header word. */
#define rld_block_type(x) ((uint64_t)(x)>>62)
77 | ||
78 | static inline int64_t rld_dec0(const rld_t *e, rlditr_t *itr, int *c) | |
79 | { | |
80 | int w; | |
81 | uint64_t x; | |
82 | int64_t l, y = 0; | |
83 | x = itr->p[0] << (64 - itr->r) | (itr->p != itr->stail && itr->r != 64? itr->p[1] >> itr->r : 0); | |
84 | if (x>>63 == 0) { | |
85 | if ((w = 0x333333335555779bll>>(x>>59<<2)&0xf) == 0xb && x>>58 == 0) return 0; | |
86 | l = (x >> (64 - w)) - 1; | |
87 | y = x << w >> (64 - l) | 1u << l; | |
88 | w += l; | |
89 | } else w = y = 1; | |
90 | *c = x << w >> (64 - e->abits); | |
91 | w += e->abits; | |
92 | if (itr->r > w) itr->r -= w; | |
93 | else ++itr->p, itr->r = 64 + itr->r - w; | |
94 | return y; | |
95 | } | |
96 | ||
97 | static inline int64_t rld_dec(const rld_t *e, rlditr_t *itr, int *_c, int is_free) | |
98 | { | |
99 | int64_t l = rld_dec0(e, itr, _c); | |
100 | if (l == 0 || *_c > e->asize) { | |
101 | uint64_t last = rld_last_blk(e); | |
102 | if (itr->p - *itr->i > RLD_LSIZE - e->ssize) { | |
103 | if (is_free) { | |
104 | free(*itr->i); *itr->i = 0; | |
105 | } | |
106 | itr->shead = *++itr->i; | |
107 | } else itr->shead += e->ssize; | |
108 | if (itr->shead == rld_seek_blk(e, last)) return -1; | |
109 | itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)]; | |
110 | itr->q = (uint8_t*)itr->p; | |
111 | itr->stail = rld_get_stail(e, itr); | |
112 | itr->r = 64; | |
113 | return rld_dec0(e, itr, _c); | |
114 | } else return l; | |
115 | } | |
116 | ||
117 | // take k symbols from e0 and write it to e | |
118 | static inline void rld_dec_enc(rld_t *e, rlditr_t *itr, const rld_t *e0, rlditr_t *itr0, int64_t k) | |
119 | { | |
120 | if (itr0->l >= k) { // there are more pending symbols | |
121 | rld_enc(e, itr, k, itr0->c); | |
122 | itr0->l -= k; // l - k symbols remains | |
123 | } else { // use up all pending symbols | |
124 | int c = -1; // to please gcc | |
125 | int64_t l; | |
126 | rld_enc(e, itr, itr0->l, itr0->c); // write all pending symbols | |
127 | k -= itr0->l; | |
128 | for (; k > 0; k -= l) { // we always go into this loop because l0<k | |
129 | l = rld_dec(e0, itr0, &c, 1); | |
130 | rld_enc(e, itr, k < l? k : l, c); | |
131 | } | |
132 | itr0->l = -k; itr0->c = c; | |
133 | } | |
134 | } | |
135 | ||
136 | #endif |
0 | #include <string.h> | |
1 | #include <assert.h> | |
2 | #include <stdlib.h> | |
3 | #include <stdio.h> | |
4 | #include "rle.h" | |
5 | ||
// Indexed by bits 3..5 of a multi-byte head byte. The high nibble holds the run-length
// bits stored in the head byte itself (used when decoding backward); the low nibble
// presumably is the number of continuation bytes (1, 3 or 7) -- TODO confirm
const uint8_t rle_auxtab[8] = {
	0x01, 0x11, 0x21, 0x31, // 2-byte codes
	0x03, 0x13,             // 4-byte codes
	0x07, 0x17              // 8-byte codes
};
7 | ||
8 | // insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase | |
9 | int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]) | |
10 | { | |
11 | uint16_t *nptr = (uint16_t*)block; | |
12 | int diff; | |
13 | ||
14 | block += 2; // skip the first 2 counting bytes | |
15 | if (*nptr == 0) { | |
16 | memset(cnt, 0, 48); | |
17 | diff = rle_enc1(block, a, rl); | |
18 | } else { | |
19 | uint8_t *p, *end = block + *nptr, *q; | |
20 | int64_t pre, z, l = 0, tot, beg_l; | |
21 | int c = -1, n_bytes = 0, n_bytes2, t = 0; | |
22 | uint8_t tmp[24]; | |
23 | beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5]; | |
24 | tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; | |
25 | if (x < beg_l) { | |
26 | beg_l = 0, *beg = 0; | |
27 | memset(bc, 0, 48); | |
28 | } | |
29 | if (x == beg_l) { | |
30 | p = q = block + (*beg); z = beg_l; | |
31 | memcpy(cnt, bc, 48); | |
32 | } else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward | |
33 | z = beg_l; p = block + (*beg); | |
34 | memcpy(cnt, bc, 48); | |
35 | while (z < x) { | |
36 | rle_dec1(p, c, l); | |
37 | z += l; cnt[c] += l; | |
38 | } | |
39 | for (q = p - 1; *q>>6 == 2; --q); | |
40 | } else { // backward | |
41 | memcpy(cnt, ec, 48); | |
42 | z = tot; p = end; | |
43 | while (z >= x) { | |
44 | --p; | |
45 | if (*p>>6 != 2) { | |
46 | l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; | |
47 | z -= l; cnt[*p&7] -= l; | |
48 | l = 0; t = 0; | |
49 | } else { | |
50 | l |= (*p&0x3fL) << t; | |
51 | t += 6; | |
52 | } | |
53 | } | |
54 | q = p; | |
55 | rle_dec1(p, c, l); | |
56 | z += l; cnt[c] += l; | |
57 | } | |
58 | *beg = q - block; | |
59 | memcpy(bc, cnt, 48); | |
60 | bc[c] -= l; | |
61 | n_bytes = p - q; | |
62 | if (x == z && a != c && p < end) { // then try the next run | |
63 | int tc; | |
64 | int64_t tl; | |
65 | q = p; | |
66 | rle_dec1(q, tc, tl); | |
67 | if (a == tc) | |
68 | c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl; | |
69 | } | |
70 | if (z != x) cnt[c] -= z - x; | |
71 | pre = x - (z - l); p -= n_bytes; | |
72 | if (a == c) { // insert to the same run | |
73 | n_bytes2 = rle_enc1(tmp, c, l + rl); | |
74 | } else if (x == z) { // at the end; append to the existing run | |
75 | p += n_bytes; n_bytes = 0; | |
76 | n_bytes2 = rle_enc1(tmp, a, rl); | |
77 | } else { // break the current run | |
78 | n_bytes2 = rle_enc1(tmp, c, pre); | |
79 | n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl); | |
80 | n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre); | |
81 | } | |
82 | if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed | |
83 | memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes); | |
84 | memcpy(p, tmp, n_bytes2); | |
85 | diff = n_bytes2 - n_bytes; | |
86 | } | |
87 | return (*nptr += diff); | |
88 | } | |
89 | ||
// Same as rle_insert_cached() but without a cached position: decoding starts from the block head.
int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6])
{
	int64_t start_cnt[6];
	int start = 0;

	memset(start_cnt, 0, 48);
	return rle_insert_cached(block, x, a, rl, cnt, ec, &start, start_cnt);
}
97 | ||
// Move the second half of $block to $new_block. The cut starts at the middle byte and
// is moved back to the nearest head byte so that no multi-byte run is torn apart.
// Both length prefixes are updated.
void rle_split(uint8_t *block, uint8_t *new_block)
{
	int n_bytes = *(uint16_t*)block;
	uint8_t *data = block + 2;
	uint8_t *stop = data + n_bytes;
	uint8_t *cut = data + (n_bytes>>1);

	while (*cut>>6 == 2) // continuation bytes carry the 10xxxxxx tag
		--cut;
	memcpy(new_block + 2, cut, stop - cut);
	*(uint16_t*)new_block = stop - cut;
	*(uint16_t*)block = cut - block - 2;
}
107 | ||
// Add the length of every run in $block to $cnt, indexed by the run's symbol.
// $cnt is accumulated into, not cleared.
void rle_count(const uint8_t *block, int64_t cnt[6])
{
	const uint8_t *cur = block + 2;
	const uint8_t *stop = cur + *(uint16_t*)block;

	while (cur < stop) {
		int sym;
		int64_t run;
		rle_dec1(cur, sym, run); // advances cur past the run
		cnt[sym] += run;
	}
}
118 | ||
// Print $block to stdout followed by a newline. With $expand every symbol is written
// out individually; otherwise each run is printed as its symbol followed by its length.
void rle_print(const uint8_t *block, int expand)
{
	const uint16_t *len = (const uint16_t*)block;
	const uint8_t *cur = block + 2, *stop = block + 2 + *len;

	while (cur < stop) {
		int sym;
		int64_t run, i;
		rle_dec1(cur, sym, run);
		if (!expand) {
			printf("%c%ld", "$ACGTN"[sym], (long)run);
			continue;
		}
		for (i = 0; i < run; ++i)
			putchar("$ACGTN"[sym]);
	}
	putchar('\n');
}
132 | ||
133 | void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]) | |
134 | { | |
135 | int a; | |
136 | int64_t tot, cnt[6]; | |
137 | const uint8_t *p; | |
138 | ||
139 | y = y >= x? y : x; | |
140 | tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; | |
141 | if (tot == 0) return; | |
142 | if (x <= (tot - y) + (tot>>3)) { | |
143 | int c = 0; | |
144 | int64_t l, z = 0; | |
145 | memset(cnt, 0, 48); | |
146 | p = block + 2; | |
147 | while (z < x) { | |
148 | rle_dec1(p, c, l); | |
149 | z += l; cnt[c] += l; | |
150 | } | |
151 | for (a = 0; a != 6; ++a) cx[a] += cnt[a]; | |
152 | cx[c] -= z - x; | |
153 | if (cy) { | |
154 | while (z < y) { | |
155 | rle_dec1(p, c, l); | |
156 | z += l; cnt[c] += l; | |
157 | } | |
158 | for (a = 0; a != 6; ++a) cy[a] += cnt[a]; | |
159 | cy[c] -= z - y; | |
160 | } | |
161 | } else { | |
162 | #define move_backward(_x) \ | |
163 | while (z >= (_x)) { \ | |
164 | --p; \ | |
165 | if (*p>>6 != 2) { \ | |
166 | l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \ | |
167 | z -= l; cnt[*p&7] -= l; \ | |
168 | l = 0; t = 0; \ | |
169 | } else { \ | |
170 | l |= (*p&0x3fL) << t; \ | |
171 | t += 6; \ | |
172 | } \ | |
173 | } \ | |
174 | ||
175 | int t = 0; | |
176 | int64_t l = 0, z = tot; | |
177 | memcpy(cnt, ec, 48); | |
178 | p = block + 2 + *(const uint16_t*)block; | |
179 | if (cy) { | |
180 | move_backward(y) | |
181 | for (a = 0; a != 6; ++a) cy[a] += cnt[a]; | |
182 | cy[*p&7] += y - z; | |
183 | } | |
184 | move_backward(x) | |
185 | for (a = 0; a != 6; ++a) cx[a] += cnt[a]; | |
186 | cx[*p&7] += x - z; | |
187 | ||
188 | #undef move_backward | |
189 | } | |
190 | } |
0 | #ifndef RLE6_H_ | |
1 | #define RLE6_H_ | |
2 | ||
3 | #include <stdint.h> | |
4 | ||
/* Branch-prediction hint; expands to the bare condition on compilers without __builtin_expect. */
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#else
#define LIKELY(x) (x)
#endif
#ifdef __cplusplus

extern "C" {
#endif

	/* insertion into a block (2-byte length prefix followed by the encoded runs) */
	int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]);
	int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]);

	/* block maintenance and queries */
	void rle_split(uint8_t *block, uint8_t *new_block);
	void rle_count(const uint8_t *block, int64_t cnt[6]);
	void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]);
	/* single-position rank: rle_rank2a() without the second position */
	#define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec)

	void rle_print(const uint8_t *block, int expand);

#ifdef __cplusplus
}
#endif
27 | ||
28 | /****************** | |
29 | *** 43+3 codec *** | |
30 | ******************/ | |
31 | ||
/* Lookup table used when decoding runs backward. */
extern const uint8_t rle_auxtab[8];

#define RLE_MIN_SPACE 18
/* The 2-byte length prefix stored at the head of a block. */
#define rle_nptr(block) ((uint16_t*)(block))
36 | ||
// decode one run (c,l) and move the pointer p
// head byte layouts: 0lllllccc (1 byte), 110llccc (2 bytes), 1110lccc (4 bytes), 1111lccc (8 bytes);
// every continuation byte contributes its low 6 bits
#define rle_dec1(p, c, l) do { \
		(c) = *(p) & 7; \
		if (LIKELY((*(p)&0x80) == 0)) { /* single byte */ \
			(l) = *(p)++ >> 3; \
		} else if (LIKELY(*(p)>>5 == 6)) { /* two bytes */ \
			(l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \
			(p) += 2; \
		} else { /* four or eight bytes, selected by bit 4 of the head */ \
			int n = ((*(p)&0x10) >> 2) + 4; \
			(l) = *(p)++ >> 3 & 1; \
			while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \
		} \
	} while (0)
51 | ||
// Encode one run (symbol $c, length $l) at $p and return the number of bytes written:
// 1 byte for l < 2^4, 2 for l < 2^8, 4 for l < 2^19 and 8 otherwise.
static inline int rle_enc1(uint8_t *p, int c, int64_t l)
{
	int k, shift;

	if (l < 1LL<<4) { // 0lllllccc
		p[0] = l << 3 | c;
		return 1;
	}
	if (l < 1LL<<8) { // 110llccc + one continuation byte
		p[0] = 0xC0 | l >> 6 << 3 | c;
		p[1] = 0x80 | (l & 0x3f);
		return 2;
	}
	if (l < 1LL<<19) { // 1110lccc + three continuation bytes
		p[0] = 0xE0 | l >> 18 << 3 | c;
		p[1] = 0x80 | (l >> 12 & 0x3f);
		p[2] = 0x80 | (l >> 6 & 0x3f);
		p[3] = 0x80 | (l & 0x3f);
		return 4;
	}
	// 1111lccc + seven continuation bytes
	p[0] = 0xF0 | l >> 42 << 3 | c;
	shift = 36;
	for (k = 1; k < 8; ++k) {
		p[k] = 0x80 | (l>>shift & 0x3f);
		shift -= 6;
	}
	return 8;
}
75 | ||
76 | #endif |
0 | #include <stdlib.h> | |
1 | #include <string.h> | |
2 | #include <assert.h> | |
3 | #include <stdio.h> | |
4 | #include <zlib.h> | |
5 | #include "rle.h" | |
6 | #include "rope.h" | |
7 | ||
8 | /******************* | |
9 | *** Memory Pool *** | |
10 | *******************/ | |
11 | ||
#define MP_CHUNK_SIZE 0x100000 // 1MB per chunk

// memory pool for fast and compact memory allocation (no free)
typedef struct {
	int size;      // bytes per element
	int i;         // next unused element in the newest chunk
	int n_elems;   // elements per chunk
	int64_t top;   // index of the newest chunk in mem[]; -1 before the first allocation
	int64_t max;   // capacity of mem[]
	uint8_t **mem; // the chunks
} mempool_t;
19 | ||
20 | static mempool_t *mp_init(int size) | |
21 | { | |
22 | mempool_t *mp; | |
23 | mp = calloc(1, sizeof(mempool_t)); | |
24 | mp->size = size; | |
25 | mp->i = mp->n_elems = MP_CHUNK_SIZE / size; | |
26 | mp->top = -1; | |
27 | return mp; | |
28 | } | |
29 | ||
30 | static void mp_destroy(mempool_t *mp) | |
31 | { | |
32 | int64_t i; | |
33 | for (i = 0; i <= mp->top; ++i) free(mp->mem[i]); | |
34 | free(mp->mem); free(mp); | |
35 | } | |
36 | ||
37 | static inline void *mp_alloc(mempool_t *mp) | |
38 | { | |
39 | if (mp->i == mp->n_elems) { | |
40 | if (++mp->top == mp->max) { | |
41 | mp->max = mp->max? mp->max<<1 : 1; | |
42 | mp->mem = realloc(mp->mem, sizeof(void*) * mp->max); | |
43 | } | |
44 | mp->mem[mp->top] = calloc(mp->n_elems, mp->size); | |
45 | mp->i = 0; | |
46 | } | |
47 | return mp->mem[mp->top] + (mp->i++) * mp->size; | |
48 | } | |
49 | ||
50 | /*************** | |
51 | *** B+ rope *** | |
52 | ***************/ | |
53 | ||
54 | rope_t *rope_init(int max_nodes, int block_len) | |
55 | { | |
56 | rope_t *rope; | |
57 | rope = calloc(1, sizeof(rope_t)); | |
58 | if (block_len < 32) block_len = 32; | |
59 | rope->max_nodes = (max_nodes+ 1)>>1<<1; | |
60 | rope->block_len = (block_len + 7) >> 3 << 3; | |
61 | rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes); | |
62 | rope->leaf = mp_init(rope->block_len); | |
63 | rope->root = mp_alloc(rope->node); | |
64 | rope->root->n = 1; | |
65 | rope->root->is_bottom = 1; | |
66 | rope->root->p = mp_alloc(rope->leaf); | |
67 | return rope; | |
68 | } | |
69 | ||
70 | void rope_destroy(rope_t *rope) | |
71 | { | |
72 | mp_destroy(rope->node); | |
73 | mp_destroy(rope->leaf); | |
74 | free(rope); | |
75 | } | |
76 | ||
/*
 * Split the child of $v in two and insert the new sibling $w right after $v.
 *
 * rope: the rope owning the node/leaf pools
 * u:    first node of the bucket containing $v, or 0 when the root itself must
 *       be split (a new root is then allocated and becomes both $u and $v)
 * v:    the node whose child is split
 *
 * At the bottom level the child is a leaf string and is divided with
 * rle_split()/rle_count(); otherwise the last max_nodes/2 nodes of the child
 * bucket are moved to a freshly allocated bucket. Counts and lengths of $v and
 * $w are adjusted accordingly. Returns $v (the new root when one was added).
 */
77 | static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v) | |
78 | { // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u | |
79 | int j, i = v - u; | |
80 | rpnode_t *w; // $w is the sibling of $v | |
81 | if (u == 0) { // only happens at the root; add a new root | |
// The new bucket comes from mp_alloc(), whose chunks are calloc()'ed, so v->l and is_bottom start at 0.
82 | u = v = mp_alloc(rope->node); | |
83 | v->n = 1; v->p = rope->root; // the new root has the old root as the only child | |
84 | memcpy(v->c, rope->c, 48); | |
85 | for (j = 0; j < 6; ++j) v->l += v->c[j]; | |
86 | rope->root = v; | |
87 | } | |
88 | if (i != u->n - 1) // then make room for a new node | |
89 | memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1)); | |
90 | ++u->n; w = v + 1; | |
91 | memset(w, 0, sizeof(rpnode_t)); | |
92 | w->p = mp_alloc(u->is_bottom? rope->leaf : rope->node); | |
93 | if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node | |
94 | uint8_t *p = (uint8_t*)v->p, *q = (uint8_t*)w->p; | |
95 | rle_split(p, q); | |
96 | rle_count(q, w->c); | |
97 | } else { // $v->p is a node, not a string | |
98 | rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins | |
99 | p->n -= rope->max_nodes>>1; | |
100 | memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1)); | |
101 | q->n = rope->max_nodes>>1; // NB: this line must below memcpy() as $q->n and $q->is_bottom are modified by memcpy() | |
102 | q->is_bottom = p->is_bottom; | |
// accumulate the marginal counts of the moved nodes into $w
103 | for (i = 0; i < q->n; ++i) | |
104 | for (j = 0; j < 6; ++j) | |
105 | w->c[j] += q[i].c[j]; | |
106 | } | |
107 | for (j = 0; j < 6; ++j) // compute $w->l and update $v->c | |
108 | w->l += w->c[j], v->c[j] -= w->c[j]; | |
// NOTE: the statement below adjusts $v->l (the length), although the trailing comment says $v->c
109 | v->l -= w->l; // update $v->c | |
110 | return v; | |
111 | } | |
112 | ||
/*
 * Insert a run of $rl copies of symbol $a after the first $x symbols of $rope.
 *
 * rope:  the rope to modify
 * x:     number of symbols preceding the insertion point
 * a:     symbol to insert; used as an index into the 6-element count arrays
 * rl:    run length
 * cache: optional insertion cache (may be 0); it is reset whenever it refers
 *        to a different leaf block and after the leaf has been split
 *
 * Returns $z: the counts of $a in all nodes skipped on the way down plus
 * cnt[a] as reported by the leaf-level insertion (rank(a, x) per the original
 * comment below).
 */
113 | int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache) | |
114 | { // insert $a after $x symbols in $rope and the returns rank(a, x) | |
115 | rpnode_t *u = 0, *v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket | |
116 | int64_t y = 0, z = 0, cnt[6]; | |
117 | int n_runs; | |
118 | do { // top-down update. Searching and node splitting are done together in one pass. | |
119 | if (p->n == rope->max_nodes) { // node is full; split | |
120 | v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root | |
121 | if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v | |
122 | y += v->l, z += v->c[a], ++v, p = v->p; | |
123 | } | |
124 | u = p; | |
// $y counts the symbols and $z the occurrences of $a in the nodes skipped so far
125 | if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend | |
126 | p += p->n - 1; y += v->l; z += v->c[a]; | |
127 | for (; y >= x; --p) y -= p->l, z -= p->c[a]; | |
128 | ++p; | |
129 | } else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly | |
130 | assert(p - u < u->n); | |
131 | if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split | |
132 | v = p; p = p->p; // descend | |
133 | } while (!u->is_bottom); | |
// here $p is the leaf block, $v the bottom-level node pointing to it and $u the first node of $v's bucket
134 | rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts | |
135 | if (cache) { | |
136 | if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t)); | |
137 | n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc); | |
138 | cache->p = (uint8_t*)p; | |
139 | } else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c); | |
// NOTE(review): cnt[] is not initialised here; presumably rle_insert*() fills it - confirm
140 | z += cnt[a]; | |
141 | v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work | |
// split the leaf once fewer than RLE_MIN_SPACE units remain free in it
142 | if (n_runs + RLE_MIN_SPACE > rope->block_len) { | |
143 | split_node(rope, u, v); | |
144 | if (cache) memset(cache, 0, sizeof(rpcache_t)); | |
145 | } | |
146 | return z; | |
147 | } | |
148 | ||
149 | static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest) | |
150 | { | |
151 | rpnode_t *u, *v = 0, *p = rope->root; | |
152 | int64_t y = 0; | |
153 | int a; | |
154 | ||
155 | memset(cx, 0, 48); | |
156 | do { | |
157 | u = p; | |
158 | if (v && x - y > v->l>>1) { | |
159 | p += p->n - 1; y += v->l; | |
160 | for (a = 0; a != 6; ++a) cx[a] += v->c[a]; | |
161 | for (; y >= x; --p) { | |
162 | y -= p->l; | |
163 | for (a = 0; a != 6; ++a) cx[a] -= p->c[a]; | |
164 | } | |
165 | ++p; | |
166 | } else { | |
167 | for (; y + p->l < x; ++p) { | |
168 | y += p->l; | |
169 | for (a = 0; a != 6; ++a) cx[a] += p->c[a]; | |
170 | } | |
171 | } | |
172 | v = p; p = p->p; | |
173 | } while (!u->is_bottom); | |
174 | *rest = x - y; | |
175 | return v; | |
176 | } | |
177 | ||
178 | void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy) | |
179 | { | |
180 | rpnode_t *v; | |
181 | int64_t rest; | |
182 | v = rope_count_to_leaf(rope, x, cx, &rest); | |
183 | if (y < x || cy == 0) { | |
184 | rle_rank1a((const uint8_t*)v->p, rest, cx, v->c); | |
185 | } else if (rest + (y - x) <= v->l) { | |
186 | memcpy(cy, cx, 48); | |
187 | rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c); | |
188 | } else { | |
189 | rle_rank1a((const uint8_t*)v->p, rest, cx, v->c); | |
190 | v = rope_count_to_leaf(rope, y, cy, &rest); | |
191 | rle_rank1a((const uint8_t*)v->p, rest, cy, v->c); | |
192 | } | |
193 | } | |
194 | ||
195 | /********************* | |
196 | *** Rope iterator *** | |
197 | *********************/ | |
198 | ||
199 | void rope_itr_first(const rope_t *rope, rpitr_t *i) | |
200 | { | |
201 | memset(i, 0, sizeof(rpitr_t)); | |
202 | i->rope = rope; | |
203 | for (i->pa[i->d] = rope->root; !i->pa[i->d]->is_bottom;) // descend to the leftmost leaf | |
204 | ++i->d, i->pa[i->d] = i->pa[i->d - 1]->p; | |
205 | } | |
206 | ||
207 | const uint8_t *rope_itr_next_block(rpitr_t *i) | |
208 | { | |
209 | const uint8_t *ret; | |
210 | assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall | |
211 | if (i->d < 0) return 0; | |
212 | ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p; | |
213 | while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking | |
214 | if (i->d >= 0) | |
215 | while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf | |
216 | ++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p; | |
217 | return ret; | |
218 | } |
0 | #ifndef ROPE_H_ | |
1 | #define ROPE_H_ | |
2 | ||
3 | #include <stdint.h> | |
4 | #include <stdio.h> | |
5 | ||
6 | #define ROPE_MAX_DEPTH 80 | |
7 | #define ROPE_DEF_MAX_NODES 64 | |
8 | #define ROPE_DEF_BLOCK_LEN 512 | |
9 | ||
/*
 * One node of the B+ tree. Nodes are stored in buckets (arrays of up to
 * max_nodes nodes); $l is the number of symbols below the node and $n the
 * number of nodes in use in the bucket. The 9-bit width of $n bounds the
 * bucket size that can be represented.
 */
10 | typedef struct rpnode_s { | |
11 | struct rpnode_s *p; // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs) | |
12 | uint64_t l:54, n:9, is_bottom:1; // $n and $is_bottom are only set for the first node in a bucket | |
13 | int64_t c[6]; // marginal counts | |
14 | } rpnode_t; | |
15 | ||
/*
 * The rope itself: configuration, total symbol counts, the root bucket and
 * the two memory pools (created by rope_init(), released by rope_destroy()).
 */
16 | typedef struct { | |
17 | int32_t max_nodes, block_len; // both MUST BE even numbers | |
18 | int64_t c[6]; // marginal counts | |
19 | rpnode_t *root; | |
20 | void *node, *leaf; // memory pool | |
21 | } rope_t; | |
22 | ||
/*
 * Iterator over the leaf blocks of a rope; set up with rope_itr_first() and
 * advanced with rope_itr_next_block(). $d becomes negative when iteration ends.
 */
23 | typedef struct { | |
24 | const rope_t *rope; // the rope | |
25 | const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes | |
26 | int ia[ROPE_MAX_DEPTH]; // index in the parent nodes | |
27 | int d; // the current depth in the B+-tree | |
28 | } rpitr_t; | |
29 | ||
/*
 * Optional cache passed to rope_insert_run(). $p records the leaf block the
 * cache belongs to (the cache is zeroed when another block is hit); $beg and
 * $bc are handed to rle_insert_cached() and are opaque to the rope code.
 */
30 | typedef struct { | |
31 | int beg; | |
32 | int64_t bc[6]; | |
33 | uint8_t *p; | |
34 | } rpcache_t; | |
35 | ||
36 | #ifdef __cplusplus | |
37 | extern "C" { | |
38 | #endif | |
39 | ||
// Create a rope; block_len is raised to at least 32 and rounded up to a multiple of 8, max_nodes is rounded up to even.
40 | rope_t *rope_init(int max_nodes, int block_len); | |
// Free the rope and both of its memory pools.
41 | void rope_destroy(rope_t *rope); | |
// Insert $rl copies of symbol $a after $x symbols; $cache may be 0. Returns the rank of $a at $x.
42 | int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache); | |
// Fill cx[6] (and cy[6] when cy != 0 and y >= x) with the symbol counts at positions $x and $y.
43 | void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy); | |
// Single-position variant: passes y = -1 and cy = 0 so only $x is evaluated.
44 | #define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0) | |
45 | ||
// Position the iterator at the leftmost leaf block.
46 | void rope_itr_first(const rope_t *rope, rpitr_t *i); | |
// Return the current leaf block and advance; returns 0 when no block is left.
47 | const uint8_t *rope_itr_next_block(rpitr_t *i); | |
48 | ||
49 | #ifdef __cplusplus | |
50 | } | |
51 | #endif | |
52 | ||
53 | #endif |
Binary diff not shown
0 | #include <assert.h> | |
1 | #include <string.h> | |
2 | #include <math.h> | |
3 | #include "kvec.h" | |
4 | #include "kstring.h" | |
5 | #include "rld0.h" | |
6 | #include "mag.h" | |
7 | #include "internal.h" | |
8 | ||
9 | /****************** | |
10 | *** From fermi *** | |
11 | ******************/ | |
12 | ||
// Vector types in the {n, m, a} layout used with the kv_*() macros below (presumably n = used, m = allocated - confirm against kvec.h).
13 | typedef struct { size_t n, m; int32_t *a; } fm32s_v; | |
14 | typedef struct { size_t n, m; rldintv_t *a; } rldintv_v; | |
15 | ||
// Zero-terminated list of strides: fml_fmi2mag_core() takes the first entry that does not divide e->mcnt[1],
// and worker() visits seed (prime * i) % e->mcnt[1] for task i.
16 | static uint64_t utg_primes[] = { 123457, 234571, 345679, 456791, 567899, 0 }; | |
17 | ||
// Maps symbols 1..4 to 5 - a and leaves every other value unchanged. NB: evaluates its argument more than once.
18 | #define fm6_comp(a) ((a) >= 1 && (a) <= 4? 5 - (a) : (a)) | |
// Initialise interval $ik for the single symbol $c from the cumulative counts e->cnt[]:
// x[0] = cnt[c], x[2] = cnt[c+1] - cnt[c], x[1] = cnt[fm6_comp(c)], info = 0.
19 | #define fm6_set_intv(e, c, ik) ((ik).x[0] = (e)->cnt[(int)(c)], (ik).x[2] = (e)->cnt[(int)(c)+1] - (e)->cnt[(int)(c)], (ik).x[1] = (e)->cnt[fm6_comp(c)], (ik).info = 0) | |
20 | ||
21 | int rld_extend0(const rld_t *e, const rldintv_t *ik, rldintv_t *ok0, int is_back) | |
22 | { // FIXME: this can be accelerated a little by using rld_rank1a() when ik.x[2]==1 | |
23 | uint64_t tk[6], tl[6]; | |
24 | rld_rank2a(e, ik->x[!is_back], ik->x[!is_back] + ik->x[2], tk, tl); | |
25 | ok0->x[!is_back] = tk[0]; | |
26 | ok0->x[is_back] = ik->x[is_back]; | |
27 | ok0->x[2] = tl[0] - tk[0]; | |
28 | return 0; | |
29 | } | |
30 | ||
/*
 * Retrieve the sequence reached from index position $x.
 *
 * e:         the index
 * x:         starting position in the index
 * s:         output string; reset, then one symbol appended per step until
 *            symbol 0 is met (symbols are appended in retrieval order; unitig1()
 *            reverses the string afterwards)
 * k2:        output interval of the retrieved sequence, finally extended by
 *            symbol 0 on both sides
 * contained: output flags: bit 1 set if the interval shrinks on the backward
 *            sentinel extension, bit 2 if it shrinks on the forward one
 *
 * Returns the index position reached when symbol 0 was met.
 * NOTE(review): if the very first symbol is 0 the loop exits before *k2 is
 * ever written, and *k2 is then read below - presumably this cannot happen for
 * a valid index; confirm.
 */
31 | uint64_t fm6_retrieve(const rld_t *e, uint64_t x, kstring_t *s, rldintv_t *k2, int *contained) | |
32 | { | |
33 | uint64_t k = x, ok[6]; | |
34 | rldintv_t ok2[6]; | |
35 | s->l = 0; *contained = 0; | |
36 | while (1) { | |
37 | int c = rld_rank1a(e, k + 1, ok); | |
38 | k = e->cnt[c] + ok[c] - 1; | |
39 | if (c == 0) break; | |
40 | if (s->l > 0) { | |
// an interval of size 1 can be tracked by position alone, which avoids a full extension
41 | if (k2->x[2] == 1) k2->x[0] = k; | |
42 | else { | |
43 | rld_extend(e, k2, ok2, 1); | |
44 | *k2 = ok2[c]; | |
45 | } | |
46 | } else fm6_set_intv(e, c, *k2); | |
47 | kputc(c, s); | |
48 | } | |
49 | if (k2->x[2] != 1) { | |
50 | rld_extend(e, k2, ok2, 1); | |
51 | if (ok2[0].x[2] != k2->x[2]) *contained |= 1; // left contained | |
52 | *k2 = ok2[0]; | |
53 | } else k2->x[0] = k; | |
54 | rld_extend(e, k2, ok2, 0); | |
55 | if (ok2[0].x[2] != k2->x[2]) *contained |= 2; // right contained | |
56 | *k2 = ok2[0]; | |
57 | return k; | |
58 | } | |
59 | ||
60 | /***************** | |
61 | *** Main body *** | |
62 | *****************/ | |
63 | ||
// Orders intervals by ascending .info; KSORT_INIT instantiates the sort routines
// under the name "infocmp" (used as ks_introsort(infocmp, ...) in fm6_get_nei()).
64 | #define info_lt(a, b) ((a).info < (b).info) | |
65 | ||
66 | #include "ksort.h" | |
67 | KSORT_INIT(infocmp, rldintv_t, info_lt) | |
68 | ||
/*
 * Atomically set bit number $x in the bit array $bits (64 bits per word):
 * word index is x / 64, bit index within the word is x % 64.
 */
static inline void set_bit(uint64_t *bits, uint64_t x)
{
	const uint64_t mask = 1LLU << (x & 0x3f);
	__sync_fetch_and_or(&bits[x >> 6], mask);
}
75 | ||
76 | static inline void set_bits(uint64_t *bits, const rldintv_t *p) | |
77 | { | |
78 | uint64_t k; | |
79 | for (k = 0; k < p->x[2]; ++k) { | |
80 | set_bit(bits, p->x[0] + k); | |
81 | set_bit(bits, p->x[1] + k); | |
82 | } | |
83 | } | |
84 | ||
/*
 * Starting from seq[j], extend an interval one symbol at a time and collect
 * the intervals at which a sentinel extension (ok[0]) is non-empty.
 *
 * e:            the index
 * len, seq:     the sequence and its length
 * min:          intervals are only collected once the depth reaches $min
 * j:            starting offset in $seq
 * at5:          non-zero: walk towards $len using complemented symbols;
 *               zero: walk towards the start of $seq
 * p:            output vector (emptied first); .info of each element holds the
 *               offset of the last symbol included in that interval
 * inc_sentinel: non-zero stores the sentinel-extended interval ok[0], zero
 *               stores the interval before the sentinel extension
 *
 * Returns the last interval reached (where extension stopped or the end of
 * the sequence was hit).
 */
85 | static rldintv_t overlap_intv(const rld_t *e, int len, const uint8_t *seq, int min, int j, int at5, rldintv_v *p, int inc_sentinel) | |
86 | { // requirement: seq[j] matches the end of a read | |
87 | int c, depth, dir, end; | |
88 | rldintv_t ik, ok[6]; | |
89 | p->n = 0; | |
90 | dir = at5? 1 : -1; // at5 is true iff we start from the 5'-end of a read | |
91 | end = at5? len : -1; | |
92 | c = seq[j]; | |
93 | fm6_set_intv(e, c, ik); | |
94 | for (depth = 1, j += dir; j != end; j += dir, ++depth) { | |
95 | c = at5? fm6_comp(seq[j]) : seq[j]; | |
96 | rld_extend(e, &ik, ok, !at5); | |
97 | if (!ok[c].x[2]) break; // cannot be extended | |
98 | if (depth >= min && ok[0].x[2]) { | |
99 | if (inc_sentinel) { | |
100 | ok[0].info = j - dir; | |
101 | kv_push(rldintv_t, *p, ok[0]); | |
102 | } else { | |
103 | ik.info = j - dir; | |
104 | kv_push(rldintv_t, *p, ik); | |
105 | } | |
106 | } | |
107 | ik = ok[c]; | |
108 | } | |
109 | kv_reverse(rldintv_t, *p, 0); // reverse the array such that the smallest interval comes first | |
110 | return ik; | |
111 | } | |
112 | ||
/*
 * Per-thread working state of the unitig construction.
 */
113 | typedef struct { | |
// the index being traversed (shared, read-only)
114 | const rld_t *e; | |
// minimum overlap length to consider / minimum overlap length to merge two reads (see unitig_unidir())
115 | int min_match, min_merge_len; | |
// a[0]/a[1]: the prev/curr interval arrays swapped by fm6_get_nei(); nei: neighbours found by the last try_right()
116 | rldintv_v a[2], nei; | |
// category of each interval in the "prev" array (see fm6_get_nei())
117 | fm32s_v cat; | |
// bit arrays indexed by index position; both point to arrays shared between threads (set up in fml_fmi2mag_core())
118 | uint64_t *used, *bend; | |
// scratch string used by check_left()
119 | kstring_t str; | |
// NOTE(review): not referenced by the code visible in this file section
120 | uint64_t n, sum, sum2, unpaired; | |
121 | } aux_t; | |
122 | ||
123 | int fm6_is_contained(const rld_t *e, int min_match, const kstring_t *s, rldintv_t *intv, rldintv_v *ovlp) | |
124 | { // for s is a sequence in e, test if s is contained in other sequences in e; return intervals right overlapping with s | |
125 | rldintv_t ik, ok[6]; | |
126 | int ret = 0; | |
127 | assert(s->l > min_match); | |
128 | ovlp->n = 0; | |
129 | ik = overlap_intv(e, s->l, (uint8_t*)s->s, min_match, s->l - 1, 0, ovlp, 0); | |
130 | rld_extend(e, &ik, ok, 1); assert(ok[0].x[2]); | |
131 | if (ik.x[2] != ok[0].x[2]) ret = -1; // the sequence is left contained | |
132 | ik = ok[0]; | |
133 | rld_extend(e, &ik, ok, 0); assert(ok[0].x[2]); | |
134 | if (ik.x[2] != ok[0].x[2]) ret = -1; // the sequence is right contained | |
135 | *intv = ok[0]; | |
136 | return ret; | |
137 | } | |
138 | ||
/*
 * Find the reads that overlap the 3'-end of $s and are not contained in it.
 *
 * e:          the index
 * min_match:  minimum overlap length
 * beg:        only the part of $s starting at offset $beg is searched for overlaps
 * s:          in: the current sequence; out: possibly extended by the symbols of a
 *             unique neighbour (restored to its original length if there are several)
 * nei:        out: one interval per neighbour; the low 32 bits of .info hold the overlap length
 * prev, curr: temporary interval arrays, swapped on every round; $prev may be pre-filled
 *             by fm6_is_contained()
 * cat:        temporary category array parallel to $prev
 * used:       optional bit array (may be 0); reads found to be contained are marked in it
 *
 * Returns -1 if there is no overlap, otherwise the offset in $s at which the
 * first neighbour starts.
 */
139 | int fm6_get_nei(const rld_t *e, int min_match, int beg, kstring_t *s, rldintv_v *nei, // input and output variables | |
140 | rldintv_v *prev, rldintv_v *curr, fm32s_v *cat, // temporary arrays | |
141 | uint64_t *used) // optional info | |
142 | { | |
143 | int ori_l = s->l, j, i, c, rbeg, is_forked = 0; | |
144 | rldintv_v *swap; | |
145 | rldintv_t ok[6], ok0; | |
146 | ||
147 | curr->n = nei->n = cat->n = 0; | |
148 | if (prev->n == 0) { // when this routine is called for the seed, prev may filled by fm6_is_contained() | |
149 | overlap_intv(e, s->l - beg, (uint8_t*)s->s + beg, min_match, s->l - beg - 1, 0, prev, 0); | |
150 | if (prev->n == 0) return -1; // no overlap | |
// offsets returned by overlap_intv() are relative to s->s + beg; make them relative to s->s
151 | for (j = 0; j < prev->n; ++j) prev->a[j].info += beg; | |
152 | } | |
153 | kv_resize(int, *cat, prev->m); | |
154 | for (j = 0; j < prev->n; ++j) cat->a[j] = 0; // only one interval; all point to 0 | |
// each round extends all live intervals by one symbol to the right
155 | while (prev->n) { | |
156 | for (j = 0, curr->n = 0; j < prev->n; ++j) { | |
157 | rldintv_t *p = &prev->a[j]; | |
158 | if (cat->a[j] < 0) continue; | |
159 | rld_extend(e, p, ok, 0); // forward extension | |
160 | if (ok[0].x[2] && ori_l != s->l) { // some (partial) reads end here | |
161 | rld_extend0(e, &ok[0], &ok0, 1); // backward extension to look for sentinels | |
162 | if (ok0.x[2]) { // the match is bounded by sentinels - a full-length match | |
163 | if (ok[0].x[2] == p->x[2] && p->x[2] == ok0.x[2]) { // never consider a read contained in another read | |
164 | int cat0 = cat->a[j]; // a category approximately corresponds to one neighbor, though not always | |
165 | assert(j == 0 || cat->a[j] > cat->a[j-1]); // otherwise not irreducible | |
166 | ok0.info = ori_l - (p->info&0xffffffffU); | |
167 | for (i = j; i < prev->n && cat->a[i] == cat0; ++i) cat->a[i] = -1; // mask out other intervals of the same cat | |
168 | kv_push(rldintv_t, *nei, ok0); // keep in the neighbor vector | |
169 | continue; // no need to go through for(c); do NOT set "used" as this neighbor may be rejected later | |
170 | } else if (used) set_bits(used, &ok0); // the read is contained in another read; mark it as used | |
171 | } | |
172 | } // ~if(ok[0].x[2]) | |
173 | if (cat->a[j] < 0) continue; // no need to proceed if we have finished this path | |
174 | for (c = 1; c < 5; ++c) // collect extensible intervals | |
175 | if (ok[c].x[2]) { | |
176 | rld_extend0(e, &ok[c], &ok0, 1); | |
177 | if (ok0.x[2]) { // do not extend intervals whose left end is not bounded by a sentinel | |
// bits 32..35 of .info carry the extending symbol; bits 36 and up (the category) are inherited from $p
178 | ok[c].info = (p->info&0xfffffff0ffffffffLLU) | (uint64_t)c<<32; | |
179 | kv_push(rldintv_t, *curr, ok[c]); | |
180 | } | |
181 | } | |
182 | } // ~for(j) | |
183 | if (curr->n) { // update category | |
184 | uint32_t last, cat0; | |
185 | kv_resize(int, *cat, curr->m); | |
// the symbol of the first collected interval is appended to $s (complemented)
186 | c = curr->a[0].info>>32&0xf; | |
187 | kputc(fm6_comp(c), s); | |
// sorting by .info groups intervals by (category, symbol) held in the high 32 bits
188 | ks_introsort(infocmp, curr->n, curr->a); | |
189 | last = curr->a[0].info >> 32; | |
190 | cat->a[0] = 0; | |
191 | curr->a[0].info &= 0xffffffff; | |
192 | for (j = 1, cat0 = 0; j < curr->n; ++j) { // this loop recalculate cat | |
193 | if (curr->a[j].info>>32 != last) | |
194 | last = curr->a[j].info>>32, cat0 = j; | |
195 | cat->a[j] = cat0; | |
196 | curr->a[j].info = (curr->a[j].info&0xffffffff) | (uint64_t)cat0<<36; | |
197 | } | |
198 | if (cat0 != 0) is_forked = 1; | |
199 | } | |
200 | swap = curr; curr = prev; prev = swap; // swap curr and prev | |
201 | } // ~while(prev->n) | |
202 | if (nei->n == 0) return -1; // no overlap | |
203 | rbeg = ori_l - (uint32_t)nei->a[0].info; | |
204 | if (nei->n == 1 && is_forked) { // this may happen if there are contained reads; fix this | |
// re-walk the overlap, then rebuild the appended part of $s by following only the
// extensions whose interval encloses the single neighbour
205 | fm6_set_intv(e, 0, ok0); | |
206 | for (i = rbeg; i < ori_l; ++i) { | |
207 | rld_extend(e, &ok0, ok, 0); | |
208 | ok0 = ok[fm6_comp(s->s[i])]; | |
209 | } | |
210 | for (i = ori_l; i < s->l; ++i) { | |
211 | int c0 = -1; | |
212 | rld_extend(e, &ok0, ok, 0); | |
213 | for (c = 1, j = 0; c < 5; ++c) | |
214 | if (ok[c].x[2] && ok[c].x[0] <= nei->a[0].x[0] && ok[c].x[0] + ok[c].x[2] >= nei->a[0].x[0] + nei->a[0].x[2]) | |
215 | ++j, c0 = c; | |
216 | if (j == 0 && ok[0].x[2]) break; | |
217 | assert(j == 1); | |
218 | s->s[i] = fm6_comp(c0); | |
219 | ok0 = ok[c0]; | |
220 | } | |
221 | s->l = i; s->s[s->l] = 0; | |
222 | } | |
// with several neighbours the extension is ambiguous: drop the appended symbols
223 | if (nei->n > 1) s->l = ori_l, s->s[s->l] = 0; | |
224 | return rbeg; | |
225 | } | |
226 | ||
// Convenience wrapper: run fm6_get_nei() on $s from offset $beg using the buffers held in $a
// (a->a[0] as prev, a->a[1] as curr). Results are left in a->nei; returns what fm6_get_nei() returns.
227 | static int try_right(aux_t *a, int beg, kstring_t *s) | |
228 | { | |
229 | return fm6_get_nei(a->e, a->min_match, beg, s, &a->nei, &a->a[0], &a->a[1], &a->cat, a->used); | |
230 | } | |
231 | ||
/*
 * Quick test for a backward bifurcation: starting from the intervals found at
 * offset $rbeg of $s, extend backwards symbol by symbol down to offset $beg.
 *
 * Returns -1 as soon as an interval loses members to a symbol other than the
 * sentinel or s->s[i] (a potential fork), 0 if none is found. Reads ending
 * along the way are marked in a->used, which is dereferenced unconditionally
 * here. Overwrites a->a[0] and a->a[1].
 */
232 | static int check_left_simple(aux_t *a, int beg, int rbeg, const kstring_t *s) | |
233 | { | |
234 | rldintv_t ok[6]; | |
235 | rldintv_v *prev = &a->a[0], *curr = &a->a[1], *swap; | |
236 | int i, j; | |
237 | ||
238 | overlap_intv(a->e, s->l, (uint8_t*)s->s, a->min_match, rbeg, 1, prev, 1); | |
239 | for (i = rbeg - 1; i >= beg; --i) { | |
240 | for (j = 0, curr->n = 0; j < prev->n; ++j) { | |
241 | rldintv_t *p = &prev->a[j]; | |
242 | rld_extend(a->e, p, ok, 1); | |
243 | if (ok[0].x[2]) set_bits(a->used, &ok[0]); // some reads end here; they must be contained in a longer read | |
244 | if (ok[0].x[2] + ok[(int)s->s[i]].x[2] != p->x[2]) return -1; // potential backward bifurcation | |
245 | kv_push(rldintv_t, *curr, ok[(int)s->s[i]]); | |
246 | } | |
247 | swap = curr; curr = prev; prev = swap; | |
248 | } // ~for(i) | |
249 | return 0; | |
250 | } | |
251 | ||
/*
 * Test whether the single neighbour in a->nei has a backward bifurcation.
 *
 * First runs check_left_simple(); if that reports a potential fork, builds in
 * a->str the complemented reverse of s[rbeg..] and looks for its neighbours
 * with try_right(): more than one neighbour confirms the fork.
 *
 * Returns 0 if there is no backward bifurcation, -1 otherwise. a->nei is
 * restored before returning; a->a[0], a->a[1] and a->str are overwritten.
 */
252 | static int check_left(aux_t *a, int beg, int rbeg, const kstring_t *s) | |
253 | { | |
254 | int i, ret; | |
255 | rldintv_t tmp; | |
256 | assert(a->nei.n == 1); | |
257 | ret = check_left_simple(a, beg, rbeg, s); | |
258 | if (ret == 0) return 0; | |
259 | // when ret<0, the back fork may be caused by a contained read. we have to do more to confirm this. | |
260 | tmp = a->nei.a[0]; // backup the neighbour as it will be overwritten by try_right() | |
261 | a->a[0].n = a->a[1].n = a->nei.n = 0; | |
262 | ks_resize(&a->str, s->l - rbeg + 1); | |
263 | for (i = s->l - 1, a->str.l = 0; i >= rbeg; --i) | |
264 | a->str.s[a->str.l++] = fm6_comp(s->s[i]); | |
265 | a->str.s[a->str.l] = 0; | |
266 | try_right(a, 0, &a->str); | |
267 | assert(a->nei.n >= 1); | |
268 | ret = a->nei.n > 1? -1 : 0; | |
269 | a->nei.n = 1; a->nei.a[0] = tmp; // recover the original neighbour | |
270 | return ret; | |
271 | } | |
272 | ||
/*
 * Extend the unitig in $s in one direction for as long as the extension is unambiguous.
 *
 * a:       working state; a->nei holds the neighbours of the last attempted extension on return
 * s:       in/out: unitig sequence, extended in place
 * cov:     in/out: coverage string parallel to $s; new positions start at '"' and each
 *          additional covering read increments the character, saturating at '~'
 * beg0:    offset in $s from which overlaps are first searched
 * k0:      index position used to detect a loop back to the start (a>>b>>c>>a)
 * end:     in/out: index position identifying the current end of the unitig
 * is_loop: out: set to 1 when the extension comes back to $k0
 *
 * Extension stops at a forward or backward bifurcation (recorded in a->bend), on a
 * loop, or when the overlap is shorter than a->min_merge_len. Returns the number of
 * reads merged.
 */
273 | static int unitig_unidir(aux_t *a, kstring_t *s, kstring_t *cov, int beg0, uint64_t k0, uint64_t *end, int *is_loop) | |
274 | { | |
275 | int i, beg = beg0, rbeg, ori_l = s->l, n_reads = 0; | |
276 | *is_loop = 0; | |
277 | while ((rbeg = try_right(a, beg, s)) >= 0) { // loop if there is at least one overlap | |
278 | uint64_t k; | |
279 | if (a->nei.n > 1) { // forward bifurcation | |
280 | set_bit(a->bend, *end); | |
281 | break; | |
282 | } | |
283 | if ((k = a->nei.a[0].x[0]) == *end) break; // a loop like b>>c>>a><a; keep the link but stop extension | |
// a->bend is read non-atomically here while other threads may set bits in it
284 | if (((a->bend[k>>6]>>(k&0x3f)&1) || check_left(a, beg, rbeg, s) < 0)) { // backward bifurcation | |
285 | set_bit(a->bend, k); | |
286 | break; | |
287 | } | |
288 | if (k == k0) { // a loop like a>>b>>c>>a | |
289 | *is_loop = 1; | |
290 | break; | |
291 | } | |
292 | if (a->nei.a[0].x[1] == *end) { // a loop like b>>c>>a>>a; cut the last link | |
293 | a->nei.n = 0; | |
294 | break; | |
295 | } | |
296 | if ((int)a->nei.a[0].info < a->min_merge_len) break; // the overlap is not long enough | |
297 | *end = a->nei.a[0].x[1]; | |
298 | set_bits(a->used, &a->nei.a[0]); // successful extension | |
299 | ++n_reads; | |
300 | if (cov->m < s->m) ks_resize(cov, s->m); | |
301 | cov->l = s->l; cov->s[cov->l] = 0; | |
302 | for (i = rbeg; i < ori_l; ++i) // update the coverage string | |
303 | if (cov->s[i] != '~') ++cov->s[i]; | |
304 | for (i = ori_l; i < s->l; ++i) cov->s[i] = '"'; | |
305 | beg = rbeg; ori_l = s->l; a->a[0].n = a->a[1].n = 0; // prepare for the next round of loop | |
306 | } | |
// discard whatever the last, rejected try_right() appended to $s
307 | cov->l = s->l = ori_l; s->s[ori_l] = cov->s[ori_l] = 0; | |
308 | return n_reads; | |
309 | } | |
310 | ||
311 | static void copy_nei(ku128_v *dst, const rldintv_v *src) | |
312 | { | |
313 | int i; | |
314 | for (i = 0; i < src->n; ++i) { | |
315 | ku128_t z; | |
316 | z.x = src->a[i].x[0]; z.y = src->a[i].info; | |
317 | kv_push(ku128_t, *dst, z); | |
318 | } | |
319 | } | |
320 | ||
/*
 * Build the unitig that contains the read at index position $seed.
 *
 * a:       per-thread working state
 * seed:    index position of the seed read
 * s, cov:  out: unitig sequence and its coverage string
 * end:     out: index positions identifying the two ends of the unitig
 * nei:     out: neighbours at each end (emptied first)
 * n_reads: out: number of reads merged into the unitig
 *
 * Returns 0 on success, -1 if the seed is not longer than a->min_match, -2 if the
 * seed is already marked in a->used, -3 if it is contained in another read or is a
 * duplicate other than the first copy.
 */
321 | static int unitig1(aux_t *a, int64_t seed, kstring_t *s, kstring_t *cov, uint64_t end[2], ku128_v nei[2], int *n_reads) | |
322 | { | |
323 | rldintv_t intv0; | |
324 | int seed_len, ret, is_loop, contained; | |
325 | int64_t k; | |
326 | size_t i; | |
327 | ||
328 | *n_reads = nei[0].n = nei[1].n = 0; | |
329 | if (a->used[seed>>6]>>(seed&0x3f)&1) return -2; // used | |
330 | // retrieve the sequence pointed by seed | |
331 | k = fm6_retrieve(a->e, seed, s, &intv0, &contained); | |
332 | seq_reverse(s->l, (uint8_t*)s->s); | |
333 | seed_len = s->l; | |
334 | // check contained status | |
335 | if (intv0.x[2] > 1 && k != intv0.x[0]) return -3; // duplicated, but not the first | |
336 | set_bits(a->used, &intv0); | |
337 | if (contained) return -3; // contained | |
338 | // check length, containment and if used before | |
339 | if (s->l <= a->min_match) return -1; // too short | |
// this call also fills a->a[0] with the seed's overlaps; $ret itself is not read afterwards
340 | ret = fm6_is_contained(a->e, a->min_match, s, &intv0, &a->a[0]); | |
341 | *n_reads = 1; | |
342 | // initialize the coverage string | |
343 | if (cov->m < s->m) ks_resize(cov, s->m); | |
344 | cov->l = s->l; cov->s[cov->l] = 0; | |
345 | for (i = 0; i < cov->l; ++i) cov->s[i] = '"'; | |
346 | // left-wards extension | |
347 | end[0] = intv0.x[1]; end[1] = intv0.x[0]; | |
348 | if (a->a[0].n) { // no need to extend to the right if there is no overlap | |
349 | *n_reads += unitig_unidir(a, s, cov, 0, intv0.x[0], &end[0], &is_loop); | |
350 | copy_nei(&nei[0], &a->nei); | |
351 | if (is_loop) { | |
// circular unitig: link the other end to end[0] and skip the second extension
352 | ku128_t z; | |
353 | z.x = end[0]; z.y = a->nei.a[0].info; | |
354 | kv_push(ku128_t, nei[1], z); | |
355 | return 0; | |
356 | } | |
357 | } | |
358 | // right-wards extension | |
359 | a->a[0].n = a->a[1].n = a->nei.n = 0; | |
360 | seq_revcomp6(s->l, (uint8_t*)s->s); // reverse complement for extension in the other direction | |
361 | seq_reverse(cov->l, (uint8_t*)cov->s); // reverse the coverage | |
362 | *n_reads += unitig_unidir(a, s, cov, s->l - seed_len, intv0.x[1], &end[1], &is_loop); | |
363 | copy_nei(&nei[1], &a->nei); | |
364 | return 0; | |
365 | } | |
366 | ||
/*
 * Data private to one worker thread.
 */
367 | typedef struct { | |
// allocated size of z.seq and z.cov (grown in worker())
368 | long max_l; | |
// unitig construction state
369 | aux_t a; | |
// sequence and coverage buffers filled by unitig1()
370 | kstring_t str, cov; | |
// scratch vertex that each accepted unitig is copied from
371 | magv_t z; | |
// vertices collected by this thread; merged into the graph by fml_fmi2mag_core()
372 | magv_v v; | |
373 | } thrdat_t; | |
374 | ||
/*
 * Data shared by all worker threads.
 */
375 | typedef struct { | |
// prime: stride used to permute the seed order; used/bend/visited: bit arrays of e->mcnt[1] bits
376 | uint64_t prime, *used, *bend, *visited; | |
377 | const rld_t *e; | |
// one thrdat_t per thread, indexed by thread id
378 | thrdat_t *d; | |
379 | } worker_t; | |
380 | ||
/*
 * kt_for() callback: build the unitig seeded at position (prime * _i) % e->mcnt[1]
 * and, if one is produced, append a copy to the calling thread's vertex list.
 *
 * data: the shared worker_t
 * _i:   task number
 * tid:  thread id, used to select the per-thread data
 *
 * A unitig is dropped if either of its end positions is already marked in the shared
 * "visited" bit array, i.e. it was already produced from another seed.
 * NOTE(review): the realloc() results below are not checked.
 */
381 | static void worker(void *data, long _i, int tid) | |
382 | { | |
383 | worker_t *w = (worker_t*)data; | |
384 | thrdat_t *d = &w->d[tid]; | |
385 | uint64_t i = (w->prime * _i) % w->e->mcnt[1]; | |
386 | if (unitig1(&d->a, i, &d->str, &d->cov, d->z.k, d->z.nei, &d->z.nsr) >= 0) { // then we keep the unitig | |
387 | uint64_t *p[2], x[2]; | |
388 | magv_t *q; | |
389 | p[0] = w->visited + (d->z.k[0]>>6); x[0] = 1LLU<<(d->z.k[0]&0x3f); | |
390 | p[1] = w->visited + (d->z.k[1]>>6); x[1] = 1LLU<<(d->z.k[1]&0x3f); | |
// atomically claim both ends; bail out if either was already claimed
391 | if ((__sync_fetch_and_or(p[0], x[0])&x[0]) || (__sync_fetch_and_or(p[1], x[1])&x[1])) return; | |
392 | d->z.len = d->str.l; | |
393 | if (d->max_l < d->str.m) { | |
394 | d->max_l = d->str.m; | |
395 | d->z.seq = realloc(d->z.seq, d->max_l); | |
396 | d->z.cov = realloc(d->z.cov, d->max_l); | |
397 | } | |
// the coverage copy includes its terminating NUL; the sequence copy does not
398 | memcpy(d->z.seq, d->str.s, d->z.len); | |
399 | memcpy(d->z.cov, d->cov.s, d->z.len + 1); | |
400 | kv_pushp(magv_t, d->v, &q); | |
401 | mag_v_copy_to_empty(q, &d->z); | |
402 | } | |
403 | } | |
404 | ||
/*
 * Build the unitig graph from index $e using $n_threads threads.
 *
 * e:             the index; not freed here
 * min_match:     minimum overlap length to consider
 * min_merge_len: minimum overlap length required to merge two reads into one unitig
 * n_threads:     number of worker threads; e->mcnt[1] must be at least 2 * n_threads (asserted)
 *
 * Every position in [0, e->mcnt[1]) is tried as a seed, in an order permuted by a
 * prime stride. Per-thread vertex lists are then concatenated into one graph, which
 * is post-processed with mag_g_build_hash(), mag_g_amend() and mag_cal_rdist().
 * Returns the new graph.
 * NOTE(review): none of the calloc() results are checked.
 */
405 | mag_t *fml_fmi2mag_core(const rld_t *e, int min_match, int min_merge_len, int n_threads) | |
406 | { | |
407 | extern void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); | |
408 | worker_t w; | |
409 | int j; | |
410 | mag_t *g; | |
411 | ||
// three bit arrays with one bit per index position, shared by all threads
412 | w.used = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8); | |
413 | w.bend = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8); | |
414 | w.visited = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8); | |
415 | w.e = e; | |
416 | assert(e->mcnt[1] >= n_threads * 2); | |
417 | w.d = calloc(n_threads, sizeof(thrdat_t)); | |
418 | w.prime = 0; | |
// pick the first prime that does not divide e->mcnt[1], so that (prime * i) % mcnt[1] visits every position once
419 | for (j = 0; utg_primes[j] > 0; ++j) | |
420 | if (e->mcnt[1] % utg_primes[j] != 0) { | |
421 | w.prime = utg_primes[j]; | |
422 | break; | |
423 | } | |
424 | assert(w.prime); | |
425 | for (j = 0; j < n_threads; ++j) { | |
426 | w.d[j].a.e = e; w.d[j].a.min_match = min_match; w.d[j].a.min_merge_len = min_merge_len; | |
427 | w.d[j].a.used = w.used; w.d[j].a.bend = w.bend; | |
428 | } | |
429 | kt_for(n_threads, worker, &w, e->mcnt[1]); | |
430 | g = (mag_t*)calloc(1, sizeof(mag_t)); | |
431 | for (j = 0; j < n_threads; ++j) { | |
// shallow move: the vertices' inner buffers now belong to $g, only the per-thread array itself is freed
432 | kv_resize(magv_t, g->v, g->v.n + w.d[j].v.n); | |
433 | memcpy(g->v.a + g->v.n, w.d[j].v.a, w.d[j].v.n * sizeof(magv_t)); | |
434 | g->v.n += w.d[j].v.n; | |
435 | free(w.d[j].v.a); | |
436 | free(w.d[j].a.a[0].a); free(w.d[j].a.a[1].a); free(w.d[j].a.nei.a); free(w.d[j].a.cat.a); | |
437 | free(w.d[j].z.nei[0].a); free(w.d[j].z.nei[1].a); free(w.d[j].z.seq); free(w.d[j].z.cov); | |
438 | free(w.d[j].a.str.s); free(w.d[j].str.s); free(w.d[j].cov.s); | |
439 | } | |
440 | free(w.d); free(w.used); free(w.bend); free(w.visited); | |
441 | ||
442 | mag_g_build_hash(g); | |
443 | mag_g_amend(g); | |
444 | g->rdist = mag_cal_rdist(g); | |
445 | return g; | |
446 | } | |
447 | ||
448 | mag_t *fml_fmi2mag(const fml_opt_t *opt, rld_t *e) | |
449 | { | |
450 | mag_t *g; | |
451 | g = fml_fmi2mag_core(e, opt->min_asm_ovlp, opt->min_merge_len, opt->n_threads); | |
452 | rld_destroy(e); | |
453 | return g; | |
454 | } |
0 | The MIT License | |
1 | ||
2 | Copyright (c) 2015 Broad Institute | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. |
0 | ## Introduction | |
1 | ||
2 | Minimap is an *experimental* tool to efficiently find multiple approximate | |
3 | mapping positions between two sets of long sequences, such as between reads and | |
4 | reference genomes, between genomes and between long noisy reads. By default, it | |
5 | is tuned to have high sensitivity to 2kb matches around 20% divergence but with | |
6 | low specificity. Minimap does not generate alignments as of now and because of | |
7 | this, it is usually tens of times faster than mainstream *aligners*. With four | |
8 | CPU cores, minimap can map 1.6Gbp PacBio reads to human in 2.5 minutes, 1Gbp | |
9 | PacBio E. coli reads to pre-indexed 9.6Gbp bacterial genomes in 3 minutes, to | |
10 | pre-indexed >100Gbp nt database in ~1 hour (of which ~20 minutes are spent on | |
11 | loading index from the network filesystem; peak RAM: 10GB), map 2800 bacteria | |
12 | to themselves in 1 hour, and map 1Gbp E. coli reads against themselves in a | |
13 | couple of minutes. | |
14 | ||
15 | Minimap does not replace mainstream aligners, but it can be useful when you | |
16 | want to quickly identify long approximate matches at moderate divergence among | |
17 | a huge collection of sequences. For this task, it is much faster than most | |
18 | existing tools. | |
19 | ||
20 | ## Usage | |
21 | ||
22 | * Map two sets of long sequences: | |
23 | ```sh | |
24 | minimap target.fa.gz query.fa.gz > out.mini | |
25 | ``` | |
26 | The output is TAB-delimited with each line consisting of query name, length, | |
27 | 0-based start, end, strand, target name, length, start, end, the number of | |
28 | matching bases, the number of co-linear minimizers in the match and the | |
29 | fraction of matching bases. | |
30 | ||
31 | * All-vs-all PacBio read self-mapping for [miniasm][miniasm]: | |
32 | ```sh | |
33 | minimap -Sw5 -L100 -m0 reads.fa reads.fa | gzip -1 > reads.paf.gz | |
34 | ``` | |
35 | ||
36 | * Prebuild index and then map: | |
37 | ```sh | |
38 | minimap -d target.mmi target.fa.gz | |
39 | minimap -l target.mmi query.fa.gz > out.mini | |
40 | ``` | |
41 | Minimap indexing is very fast (1 minute for human genome; 50 minutes for >100Gbp | |
42 | nt database retrieved on 2015-09-30), but for huge | |
43 | repeatedly used databases, prebuilding index is still preferred. | |
44 | ||
45 | * Map sequences against themselves without diagonal matches: | |
46 | ```sh | |
47 | minimap -S sequences.fa sequences.fa > self-match.mini | |
48 | ``` | |
49 | The output may still contain overlapping matches in repetitive regions. | |
50 | ||
51 | ## Algorithm Overview | |
52 | ||
53 | 1. Indexing. Collect all [(*w*,*k*)-minimizers][mini] in a batch (**-I**=4 | |
54 | billion bp) of target sequences and store them in a hash table. Mark top | |
55 | **-f**=0.1% of most frequent minimizers as repeats. Minimap | |
56 | uses an [invertible hash function][invhash] to avoid taking poly-A as | |
57 | minimizers. | |
58 | ||
59 | 2. For each query, collect all (*w*,*k*)-minimizers and look up the hash table for | |
60 | matches (*q<sub>i</sub>*,*t<sub>i</sub>*,*s<sub>i</sub>*), where | |
61 | *q<sub>i</sub>* is the query position, *t<sub>i</sub>* the target position | |
62 | and *s<sub>i</sub>* indicates whether the minimizer match is on the same | |
63 | strand. | |
64 | ||
65 | 3. For matches on the same strand, sort by {*q<sub>i</sub>*-*t<sub>i</sub>*} | |
66 | and then cluster matches within a **-r**=500bp window. Minimap merges | |
67 | two windows if **-m**=50% of minimizer matches overlap. For matches on different | |
68 | strands, sort {*q<sub>i</sub>*+*t<sub>i</sub>*} and apply a similar | |
69 | clustering procedure. This is inspired by the [Hough transformation][hough]. | |
70 | ||
71 | 4. For each cluster, sort (*q<sub>i</sub>*,*t<sub>i</sub>*) by *q<sub>i</sub>* | |
72 | and solve a [longest increasing subsequence problem][lis] for *t<sub>i</sub>*. This | |
73 | finds the longest co-linear matching chain. Break the chain whenever there | |
74 | is a gap longer than **-g**=10000. | |
75 | ||
76 | 5. Output the start and end of the chain if it contains **-c**=4 or more | |
77 | minimizer matches and the matching length is no less than **-L**=40. | |
78 | ||
79 | 6. Go to 1 and rewind to the first record of query if there are more target | |
80 | sequences; otherwise stop. | |
81 | ||
82 | To increase sensitivity, we may decrease **-w** to index more minimizers; | |
83 | we may also decrease **-k**, though this may greatly impact performance for | |
84 | mammalian genomes. | |
85 | ||
86 | Also note that by default, if the total length of target sequences is less than | |
87 | 4Gbp (1G=1 billion; controlled by **-I**), minimap creates one index and streams | |
88 | all the query sequences in one go. The multiple hits of a query sequence are | |
89 | adjacent to each other in the output. If the total length is greater than | |
90 | 4Gbp, minimap needs to read query sequences multiple times. The multiple hits | |
91 | of a query may not be adjacent. | |
92 | ||
93 | [mini]: http://bioinformatics.oxfordjournals.org/content/20/18/3363.abstract | |
94 | [lis]: https://en.wikipedia.org/wiki/Longest_increasing_subsequence | |
95 | [hough]: https://en.wikipedia.org/wiki/Hough_transform | |
96 | [invhash]: https://gist.github.com/lh3/974ced188be2f90422cc | |
97 | [miniasm]: https://github.com/lh3/miniasm |
0 | #include <zlib.h> | |
1 | #include <stdio.h> | |
2 | #include <stdlib.h> | |
3 | #include <string.h> | |
4 | #include <assert.h> | |
5 | #include "bseq.h" | |
6 | #include "kseq.h" | |
7 | KSEQ_INIT(gzFile, gzread) | |
8 | ||
9 | extern unsigned char seq_nt4_table[256]; | |
10 | ||
/* Reader state behind the opaque bseq_file_t handle. */
struct bseq_file_s {
	int is_eof;   /* set to 1 by bseq_read() when a call yields no record */
	gzFile fp;    /* zlib stream obtained in bseq_open() */
	kseq_t *ks;   /* kseq parser created on top of fp */
};
16 | ||
17 | bseq_file_t *bseq_open(const char *fn) | |
18 | { | |
19 | bseq_file_t *fp; | |
20 | gzFile f; | |
21 | f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); | |
22 | if (f == 0) return 0; | |
23 | fp = (bseq_file_t*)calloc(1, sizeof(bseq_file_t)); | |
24 | fp->fp = f; | |
25 | fp->ks = kseq_init(fp->fp); | |
26 | return fp; | |
27 | } | |
28 | ||
29 | void bseq_close(bseq_file_t *fp) | |
30 | { | |
31 | kseq_destroy(fp->ks); | |
32 | gzclose(fp->fp); | |
33 | free(fp); | |
34 | } | |
35 | ||
36 | bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_) | |
37 | { | |
38 | int size = 0, m, n; | |
39 | bseq1_t *seqs; | |
40 | kseq_t *ks = fp->ks; | |
41 | m = n = 0; seqs = 0; | |
42 | while (kseq_read(ks) >= 0) { | |
43 | bseq1_t *s; | |
44 | assert(ks->seq.l <= INT32_MAX); | |
45 | if (n >= m) { | |
46 | m = m? m<<1 : 256; | |
47 | seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t)); | |
48 | } | |
49 | s = &seqs[n]; | |
50 | s->name = strdup(ks->name.s); | |
51 | s->seq = strdup(ks->seq.s); | |
52 | s->l_seq = ks->seq.l; | |
53 | size += seqs[n++].l_seq; | |
54 | if (size >= chunk_size) break; | |
55 | } | |
56 | if (n == 0) fp->is_eof = 1; | |
57 | *n_ = n; | |
58 | return seqs; | |
59 | } | |
60 | ||
61 | int bseq_eof(bseq_file_t *fp) | |
62 | { | |
63 | return fp->is_eof; | |
64 | } |
0 | #ifndef MM_BSEQ_H | |
1 | #define MM_BSEQ_H | |
2 | ||
3 | #include <stdint.h> | |
4 | ||
5 | struct bseq_file_s; | |
6 | typedef struct bseq_file_s bseq_file_t; | |
7 | ||
/* One sequence record as returned by bseq_read(). */
typedef struct {
	int l_seq, rid;    /* l_seq: sequence length; rid: record id, not set by bseq_read() */
	char *name, *seq;  /* heap-allocated copies of the record name and sequence */
} bseq1_t;
12 | ||
/* Open fn for reading; a NULL fn or "-" selects stdin. Returns 0 if the stream cannot be opened. */
bseq_file_t *bseq_open(const char *fn);
/* Destroy the parser, close the stream and free the handle. */
void bseq_close(bseq_file_t *fp);
/* Read records until their summed length reaches chunk_size or input ends;
 * stores the record count in *n_ and returns the array (0 if nothing was read). */
bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_);
/* Non-zero once a bseq_read() call has returned no record. */
int bseq_eof(bseq_file_t *fp);
17 | ||
18 | #endif |
0 | // To compile: | |
1 | // gcc -g -O2 example.c libminimap.a -lz | |
2 | ||
3 | #include <stdlib.h> | |
4 | #include <assert.h> | |
5 | #include <stdio.h> | |
6 | #include <zlib.h> | |
7 | #include "minimap.h" | |
8 | #include "kseq.h" | |
9 | KSEQ_INIT(gzFile, gzread) | |
10 | ||
11 | int main(int argc, char *argv[]) | |
12 | { | |
13 | if (argc < 3) { | |
14 | fprintf(stderr, "Usage: minimap-lite <target.fa> <query.fa>\n"); | |
15 | return 1; | |
16 | } | |
17 | ||
18 | // open query file for reading; you may use your favorite FASTA/Q parser | |
19 | gzFile f = gzopen(argv[2], "r"); | |
20 | assert(f); | |
21 | kseq_t *ks = kseq_init(f); | |
22 | ||
23 | // create index for target; we are creating one index for all target sequence | |
24 | int n_threads = 4, w = 10, k = 15; | |
25 | mm_idx_t *mi = mm_idx_build(argv[1], w, k, n_threads); | |
26 | assert(mi); | |
27 | ||
28 | // mapping | |
29 | mm_mapopt_t opt; | |
30 | mm_mapopt_init(&opt); // initialize mapping parameters | |
31 | mm_tbuf_t *tbuf = mm_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread | |
32 | while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence | |
33 | const mm_reg1_t *reg; | |
34 | int j, n_reg; | |
35 | // get all hits for the query | |
36 | reg = mm_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &opt, 0); | |
37 | // traverse hits and print them out | |
38 | for (j = 0; j < n_reg; ++j) { | |
39 | const mm_reg1_t *r = ®[j]; | |
40 | printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]); | |
41 | printf("%s\t%d\t%d\t%d\t%d\t%d\n", mi->name[r->rid], mi->len[r->rid], r->rs, r->re, r->len, r->cnt); | |
42 | } | |
43 | } | |
44 | mm_tbuf_destroy(tbuf); | |
45 | ||
46 | // deallocate index and close the query file | |
47 | mm_idx_destroy(mi); | |
48 | kseq_destroy(ks); | |
49 | gzclose(f); | |
50 | return 0; | |
51 | } |
0 | #include <stdlib.h> | |
1 | #include <assert.h> | |
2 | #include <stdio.h> | |
3 | #include "minimap.h" | |
4 | #include "kvec.h" | |
5 | #include "khash.h" | |
6 | ||
/*
 * Hash table mapping a minimizer key to its occurrences within one bucket.
 * Bit 0 of the stored key is a flag rather than part of the key's identity:
 * both hashing and equality shift it away. worker_post() sets the flag when
 * a minimizer occurs exactly once, in which case the value holds that single
 * entry directly; otherwise the value is (offset<<32 | count) into the
 * bucket's p[] array (see mm_idx_get()).
 */
#define idx_hash(a) ((a)>>1)
#define idx_eq(a, b) ((a)>>1 == (b)>>1)
KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
typedef khash_t(idx) idxhash_t;
11 | ||
12 | void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); | |
13 | ||
14 | mm_idx_t *mm_idx_init(int w, int k, int b) | |
15 | { | |
16 | mm_idx_t *mi; | |
17 | if (k*2 < b) b = k * 2; | |
18 | if (w < 1) w = 1; | |
19 | mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t)); | |
20 | mi->w = w, mi->k = k, mi->b = b; | |
21 | mi->max_occ = UINT32_MAX; | |
22 | mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t)); | |
23 | return mi; | |
24 | } | |
25 | ||
26 | void mm_idx_destroy(mm_idx_t *mi) | |
27 | { | |
28 | int i; | |
29 | if (mi == 0) return; | |
30 | for (i = 0; i < 1<<mi->b; ++i) { | |
31 | free(mi->B[i].p); | |
32 | free(mi->B[i].a.a); | |
33 | kh_destroy(idx, (idxhash_t*)mi->B[i].h); | |
34 | } | |
35 | free(mi->B); | |
36 | if (mi->name) | |
37 | for (i = 0; i < mi->n; ++i) free(mi->name[i]); | |
38 | free(mi->len); free(mi->name); | |
39 | free(mi); | |
40 | } | |
41 | ||
42 | const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n) | |
43 | { | |
44 | int mask = (1<<mi->b) - 1; | |
45 | khint_t k; | |
46 | mm_idx_bucket_t *b = &mi->B[minier&mask]; | |
47 | idxhash_t *h = (idxhash_t*)b->h; | |
48 | *n = 0; | |
49 | if (h == 0) return 0; | |
50 | k = kh_get(idx, h, minier>>mi->b<<1); | |
51 | if (k == kh_end(h)) return 0; | |
52 | if (kh_key(h, k)&1) { | |
53 | *n = 1; | |
54 | return &kh_val(h, k); | |
55 | } else { | |
56 | *n = (uint32_t)kh_val(h, k); | |
57 | return &b->p[kh_val(h, k)>>32]; | |
58 | } | |
59 | } | |
60 | ||
61 | uint32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f) | |
62 | { | |
63 | int i; | |
64 | size_t n = 0; | |
65 | uint32_t thres; | |
66 | khint_t *a, k; | |
67 | if (f <= 0.) return UINT32_MAX; | |
68 | for (i = 0; i < 1<<mi->b; ++i) | |
69 | if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h); | |
70 | a = (uint32_t*)malloc(n * 4); | |
71 | for (i = n = 0; i < 1<<mi->b; ++i) { | |
72 | idxhash_t *h = (idxhash_t*)mi->B[i].h; | |
73 | if (h == 0) continue; | |
74 | for (k = 0; k < kh_end(h); ++k) { | |
75 | if (!kh_exist(h, k)) continue; | |
76 | a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k); | |
77 | } | |
78 | } | |
79 | thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1; | |
80 | free(a); | |
81 | return thres; | |
82 | } | |
83 | ||
84 | void mm_idx_set_max_occ(mm_idx_t *mi, float f) | |
85 | { | |
86 | mi->freq_thres = f; | |
87 | mi->max_occ = mm_idx_cal_max_occ(mi, f); | |
88 | } | |
89 | ||
90 | /********************************* | |
91 | * Sort and generate hash tables * | |
92 | *********************************/ | |
93 | ||
94 | static void worker_post(void *g, long i, int tid) | |
95 | { | |
96 | int j, start_a, start_p, n, n_keys; | |
97 | idxhash_t *h; | |
98 | mm_idx_t *mi = (mm_idx_t*)g; | |
99 | mm_idx_bucket_t *b = &mi->B[i]; | |
100 | if (b->a.n == 0) return; | |
101 | ||
102 | // sort by minimizer | |
103 | radix_sort_128x(b->a.a, b->a.a + b->a.n); | |
104 | ||
105 | // count and preallocate | |
106 | for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) { | |
107 | if (j == b->a.n || b->a.a[j].x != b->a.a[j-1].x) { | |
108 | ++n_keys; | |
109 | if (n > 1) b->n += n; | |
110 | n = 1; | |
111 | } else ++n; | |
112 | } | |
113 | h = kh_init(idx); | |
114 | kh_resize(idx, h, n_keys); | |
115 | b->p = (uint64_t*)calloc(b->n, 8); | |
116 | ||
117 | // create the hash table | |
118 | for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) { | |
119 | if (j == b->a.n || b->a.a[j].x != b->a.a[j-1].x) { | |
120 | khint_t itr; | |
121 | int absent; | |
122 | mm128_t *p = &b->a.a[j-1]; | |
123 | itr = kh_put(idx, h, p->x>>mi->b<<1, &absent); | |
124 | assert(absent && j - start_a == n); | |
125 | if (n == 1) { | |
126 | kh_key(h, itr) |= 1; | |
127 | kh_val(h, itr) = p->y; | |
128 | } else { | |
129 | int k; | |
130 | for (k = 0; k < n; ++k) | |
131 | b->p[start_p + k] = b->a.a[start_a + k].y; | |
132 | kh_val(h, itr) = (uint64_t)start_p<<32 | n; | |
133 | start_p += n; | |
134 | } | |
135 | start_a = j, n = 1; | |
136 | } else ++n; | |
137 | } | |
138 | b->h = h; | |
139 | assert(b->n == start_p); | |
140 | ||
141 | // deallocate and clear b->a | |
142 | free(b->a.a); | |
143 | b->a.n = b->a.m = 0, b->a.a = 0; | |
144 | } | |
145 | ||
146 | static void mm_idx_post(mm_idx_t *mi, int n_threads) | |
147 | { | |
148 | kt_for(n_threads, worker_post, mi, 1<<mi->b); | |
149 | } | |
150 | ||
151 | /****************** | |
152 | * Generate index * | |
153 | ******************/ | |
154 | ||
155 | #include <string.h> | |
156 | #include <zlib.h> | |
157 | #include "bseq.h" | |
158 | ||
159 | void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); | |
160 | ||
/* State shared by all steps of the indexing pipeline (see worker_pipeline()). */
typedef struct {
	int tbatch_size, n_processed, keep_name; /* tbatch_size: chunk size passed to bseq_read(); n_processed: running count used to assign record ids; keep_name: non-zero to store sequence names */
	bseq_file_t *fp;                         /* input reader */
	uint64_t ibatch_size, n_read;            /* reading stops once n_read (sum of sequence lengths read) exceeds ibatch_size */
	mm_idx_t *mi;                            /* index under construction */
} pipeline_t;
167 | ||
/* One batch of data handed from step to step of the indexing pipeline. */
typedef struct {
	int n_seq;     /* number of records in seq */
	bseq1_t *seq;  /* records read in step 0; freed at the end of step 1 */
	mm128_v a;     /* minimizers filled in by mm_sketch() in step 1 */
} step_t;
173 | ||
174 | static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a) | |
175 | { | |
176 | int i, mask = (1<<mi->b) - 1; | |
177 | for (i = 0; i < n; ++i) { | |
178 | mm128_v *p = &mi->B[a[i].x&mask].a; | |
179 | kv_push(mm128_t, *p, a[i]); | |
180 | } | |
181 | } | |
182 | ||
183 | static void *worker_pipeline(void *shared, int step, void *in) | |
184 | { | |
185 | int i; | |
186 | pipeline_t *p = (pipeline_t*)shared; | |
187 | if (step == 0) { // step 0: read sequences | |
188 | step_t *s; | |
189 | if (p->n_read > p->ibatch_size) return 0; | |
190 | s = (step_t*)calloc(1, sizeof(step_t)); | |
191 | s->seq = bseq_read(p->fp, p->tbatch_size, &s->n_seq); | |
192 | if (s->seq) { | |
193 | uint32_t old_m = p->mi->n, m, n; | |
194 | assert((uint64_t)p->n_processed + s->n_seq <= INT32_MAX); | |
195 | m = n = p->mi->n + s->n_seq; | |
196 | kroundup32(m); kroundup32(old_m); | |
197 | if (old_m != m) { | |
198 | if (p->keep_name) | |
199 | p->mi->name = (char**)realloc(p->mi->name, m * sizeof(char*)); | |
200 | p->mi->len = (int*)realloc(p->mi->len, m * sizeof(int)); | |
201 | } | |
202 | for (i = 0; i < s->n_seq; ++i) { | |
203 | if (p->keep_name) { | |
204 | assert(strlen(s->seq[i].name) <= 254); | |
205 | p->mi->name[p->mi->n] = strdup(s->seq[i].name); | |
206 | } | |
207 | p->mi->len[p->mi->n++] = s->seq[i].l_seq; | |
208 | s->seq[i].rid = p->n_processed++; | |
209 | p->n_read += s->seq[i].l_seq; | |
210 | } | |
211 | return s; | |
212 | } else free(s); | |
213 | } else if (step == 1) { // step 1: compute sketch | |
214 | step_t *s = (step_t*)in; | |
215 | for (i = 0; i < s->n_seq; ++i) { | |
216 | bseq1_t *t = &s->seq[i]; | |
217 | mm_sketch(t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, &s->a); | |
218 | free(t->seq); free(t->name); | |
219 | } | |
220 | free(s->seq); s->seq = 0; | |
221 | return s; | |
222 | } else if (step == 2) { // dispatch sketch to buckets | |
223 | step_t *s = (step_t*)in; | |
224 | mm_idx_add(p->mi, s->a.n, s->a.a); | |
225 | free(s->a.a); free(s); | |
226 | } | |
227 | return 0; | |
228 | } | |
229 | ||
230 | mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name) | |
231 | { | |
232 | pipeline_t pl; | |
233 | memset(&pl, 0, sizeof(pipeline_t)); | |
234 | pl.tbatch_size = tbatch_size; | |
235 | pl.keep_name = keep_name; | |
236 | pl.ibatch_size = ibatch_size; | |
237 | pl.fp = fp; | |
238 | if (pl.fp == 0) return 0; | |
239 | pl.mi = mm_idx_init(w, k, b); | |
240 | ||
241 | kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3); | |
242 | if (mm_verbose >= 3) | |
243 | fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); | |
244 | ||
245 | mm_idx_post(pl.mi, n_threads); | |
246 | if (mm_verbose >= 3) | |
247 | fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); | |
248 | ||
249 | return pl.mi; | |
250 | } | |
251 | ||
252 | mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads) // a simpler interface | |
253 | { | |
254 | bseq_file_t *fp; | |
255 | mm_idx_t *mi; | |
256 | fp = bseq_open(fn); | |
257 | if (fp == 0) return 0; | |
258 | mi = mm_idx_gen(fp, w, k, MM_IDX_DEF_B, 1<<18, n_threads, UINT64_MAX, 1); | |
259 | mm_idx_set_max_occ(mi, 0.001); | |
260 | bseq_close(fp); | |
261 | return mi; | |
262 | } | |
263 | ||
264 | /************* | |
265 | * index I/O * | |
266 | *************/ | |
267 | ||
268 | #define MM_IDX_MAGIC "MMI\1" | |
269 | ||
270 | void mm_idx_dump(FILE *fp, const mm_idx_t *mi) | |
271 | { | |
272 | uint32_t x[6]; | |
273 | int i; | |
274 | x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n, x[4] = mi->name? 1 : 0, x[5] = mi->max_occ; | |
275 | fwrite(MM_IDX_MAGIC, 1, 4, fp); | |
276 | fwrite(x, 4, 6, fp); | |
277 | fwrite(&mi->freq_thres, sizeof(float), 1, fp); | |
278 | fwrite(mi->len, 4, mi->n, fp); | |
279 | if (mi->name) { | |
280 | for (i = 0; i < mi->n; ++i) { | |
281 | uint8_t l; | |
282 | l = strlen(mi->name[i]); | |
283 | fwrite(&l, 1, 1, fp); | |
284 | fwrite(mi->name[i], 1, l, fp); | |
285 | } | |
286 | } | |
287 | for (i = 0; i < 1<<mi->b; ++i) { | |
288 | mm_idx_bucket_t *b = &mi->B[i]; | |
289 | khint_t k; | |
290 | idxhash_t *h = (idxhash_t*)b->h; | |
291 | uint32_t size = h? h->size : 0; | |
292 | fwrite(&b->n, 4, 1, fp); | |
293 | fwrite(b->p, 8, b->n, fp); | |
294 | fwrite(&size, 4, 1, fp); | |
295 | if (size == 0) continue; | |
296 | for (k = 0; k < kh_end(h); ++k) { | |
297 | uint64_t x[2]; | |
298 | if (!kh_exist(h, k)) continue; | |
299 | x[0] = kh_key(h, k), x[1] = kh_val(h, k); | |
300 | fwrite(x, 8, 2, fp); | |
301 | } | |
302 | } | |
303 | } | |
304 | ||
305 | mm_idx_t *mm_idx_load(FILE *fp) | |
306 | { | |
307 | int i; | |
308 | char magic[4]; | |
309 | uint32_t x[6]; | |
310 | mm_idx_t *mi; | |
311 | if (fread(magic, 1, 4, fp) != 4) return 0; | |
312 | if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0; | |
313 | if (fread(x, 4, 6, fp) != 6) return 0; | |
314 | mi = mm_idx_init(x[0], x[1], x[2]); | |
315 | mi->n = x[3], mi->max_occ = x[5]; | |
316 | fread(&mi->freq_thres, sizeof(float), 1, fp); | |
317 | mi->len = (int32_t*)malloc(mi->n * 4); | |
318 | fread(mi->len, 4, mi->n, fp); | |
319 | if (x[4]) { // has names | |
320 | mi->name = (char**)calloc(mi->n, sizeof(char*)); | |
321 | for (i = 0; i < mi->n; ++i) { | |
322 | uint8_t l; | |
323 | fread(&l, 1, 1, fp); | |
324 | mi->name[i] = (char*)malloc(l + 1); | |
325 | fread(mi->name[i], 1, l, fp); | |
326 | mi->name[i][l] = 0; | |
327 | } | |
328 | } | |
329 | for (i = 0; i < 1<<mi->b; ++i) { | |
330 | mm_idx_bucket_t *b = &mi->B[i]; | |
331 | uint32_t j, size; | |
332 | khint_t k; | |
333 | idxhash_t *h; | |
334 | fread(&b->n, 4, 1, fp); | |
335 | b->p = (uint64_t*)malloc(b->n * 8); | |
336 | fread(b->p, 8, b->n, fp); | |
337 | fread(&size, 4, 1, fp); | |
338 | if (size == 0) continue; | |
339 | b->h = h = kh_init(idx); | |
340 | kh_resize(idx, h, size); | |
341 | for (j = 0; j < size; ++j) { | |
342 | uint64_t x[2]; | |
343 | int absent; | |
344 | fread(x, 8, 2, fp); | |
345 | k = kh_put(idx, h, x[0], &absent); | |
346 | assert(absent); | |
347 | kh_val(h, k) = x[1]; | |
348 | } | |
349 | } | |
350 | return mi; | |
351 | } |
0 | #ifndef __AC_KDQ_H | |
1 | #define __AC_KDQ_H | |
2 | ||
3 | #include <stdlib.h> | |
4 | #include <string.h> | |
5 | ||
/*
 * Declare the deque structure kdq_<type>_t for element type `type`.
 *   front  index of the first element in a[]
 *   bits   log2 of the capacity (the capacity is 1<<bits)
 *   count  number of stored elements
 *   mask   capacity - 1, used to wrap indices
 *   a      element storage
 */
#define __KDQ_TYPE(type) \
	typedef struct { \
		size_t front:58, bits:6, count, mask; \
		type *a; \
	} kdq_##type##_t;
11 | ||
/* Type name and element accessors; none of them checks bounds or emptiness. */
#define kdq_t(type) kdq_##type##_t
#define kdq_size(q) ((q)->count)               /* number of stored elements */
#define kdq_first(q) ((q)->a[(q)->front])       /* element at the front */
#define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask])  /* element at the back */
#define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask])            /* i-th element counted from the front */
17 | ||
/*
 * Define the deque functions for element type `type`; SCOPE is prepended to
 * every definition (e.g. `static inline`).
 *
 *   kdq_init      allocate an empty deque with capacity 4
 *   kdq_destroy   free a deque (NULL is ignored)
 *   kdq_resize    set the capacity to 1<<new_bits, or to the smallest power
 *                 of two above count if that is too small; returns the
 *                 resulting number of bits
 *   kdq_pushp     append a slot at the back and return a pointer to it
 *   kdq_push      append v at the back
 *   kdq_unshiftp  prepend a slot at the front and return a pointer to it
 *   kdq_unshift   prepend v at the front
 *   kdq_pop       remove the back element; returns a pointer to it or 0
 *   kdq_shift     remove the front element; returns a pointer to it or 0
 *
 * Pointers returned by pop/shift refer to storage that later insertions may
 * overwrite or reallocate.
 */
#define __KDQ_IMPL(type, SCOPE) \
	SCOPE kdq_##type##_t *kdq_init_##type() \
	{ \
		kdq_##type##_t *q; \
		q = (kdq_##type##_t*)calloc(1, sizeof(kdq_##type##_t)); \
		q->bits = 2, q->mask = (1ULL<<q->bits) - 1; \
		q->a = (type*)malloc((1<<q->bits) * sizeof(type)); \
		return q; \
	} \
	SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \
	{ \
		if (q == 0) return; \
		free(q->a); free(q); \
	} \
	SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \
	{ \
		size_t new_size = 1ULL<<new_bits, old_size = 1ULL<<q->bits; \
		int old_bits = q->bits; /* remembered: q->bits is overwritten before the shrink test */ \
		if (new_size < q->count) { /* not big enough */ \
			int i; \
			for (i = 0; i < 64; ++i) \
				if (1ULL<<i > q->count) break; \
			new_bits = i, new_size = 1ULL<<new_bits; \
		} \
		if (new_bits == old_bits) return old_bits; /* unchanged */ \
		if (new_bits > old_bits) q->a = (type*)realloc(q->a, new_size * sizeof(type)); \
		if (q->front + q->count <= old_size) { /* unwrapped */ \
			if (q->front + q->count > new_size) { /* only happens for shrinking */ \
				/* slide the whole run to the start so that it, and front, fit the new capacity */ \
				memmove(q->a, q->a + q->front, q->count * sizeof(type)); \
				q->front = 0; \
			} \
		} else { /* wrapped */ \
			memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \
			q->front = new_size - (old_size - q->front); \
		} \
		q->bits = new_bits, q->mask = (1ULL<<q->bits) - 1; \
		if (new_bits < old_bits) { /* release the unused tail; keep the old block if realloc fails */ \
			type *shrunk = (type*)realloc(q->a, new_size * sizeof(type)); \
			if (shrunk) q->a = shrunk; \
		} \
		return q->bits; \
	} \
	SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \
	{ \
		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
		return &q->a[((q->count++) + q->front) & (q)->mask]; \
	} \
	SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \
	{ \
		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
		q->a[((q->count++) + q->front) & (q)->mask] = v; \
	} \
	SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \
	{ \
		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
		++q->count; \
		q->front = q->front? q->front - 1 : (1ULL<<q->bits) - 1; \
		return &q->a[q->front]; \
	} \
	SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \
	{ \
		type *p; \
		p = kdq_unshiftp_##type(q); \
		*p = v; \
	} \
	SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \
	{ \
		return q->count? &q->a[((--q->count) + q->front) & q->mask] : 0; \
	} \
	SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \
	{ \
		type *d = 0; \
		if (q->count == 0) return 0; \
		d = &q->a[q->front++]; \
		q->front &= q->mask; \
		--q->count; \
		return d; \
	}
90 | ||
/* Instantiate both the deque type and its functions, with the given SCOPE
 * prepended to every function definition. */
#define KDQ_INIT2(type, SCOPE) \
	__KDQ_TYPE(type) \
	__KDQ_IMPL(type, SCOPE)

/* klib_unused expands to the "unused" attribute on clang/GCC version 3 or
 * later and to nothing on other compilers. */
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */

/* Usual entry point: instantiate a deque whose functions are static inline
 * and marked klib_unused. */
#define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused)
104 | ||
/* Declare the deque type and the prototypes of its functions without
 * defining them; the signatures match the definitions in __KDQ_IMPL. */
#define KDQ_DECLARE(type) \
	__KDQ_TYPE(type) \
	kdq_##type##_t *kdq_init_##type(); \
	void kdq_destroy_##type(kdq_##type##_t *q); \
	int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \
	type *kdq_pushp_##type(kdq_##type##_t *q); \
	void kdq_push_##type(kdq_##type##_t *q, type v); \
	type *kdq_unshiftp_##type(kdq_##type##_t *q); \
	void kdq_unshift_##type(kdq_##type##_t *q, type v); \
	type *kdq_pop_##type(kdq_##type##_t *q); \
	type *kdq_shift_##type(kdq_##type##_t *q);
116 | ||
/* Generic front ends: each forwards to the type-specific function generated
 * by __KDQ_IMPL for `type`. */
#define kdq_init(type) kdq_init_##type()
#define kdq_destroy(type, q) kdq_destroy_##type(q)
#define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits)
#define kdq_pushp(type, q) kdq_pushp_##type(q)
#define kdq_push(type, q, v) kdq_push_##type(q, v)
#define kdq_pop(type, q) kdq_pop_##type(q)
#define kdq_unshiftp(type, q) kdq_unshiftp_##type(q)
#define kdq_unshift(type, q, v) kdq_unshift_##type(q, v)
#define kdq_shift(type, q) kdq_shift_##type(q)
126 | ||
127 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* | |
26 | An example: | |
27 | ||
28 | #include "khash.h" | |
29 | KHASH_MAP_INIT_INT(32, char) | |
30 | int main() { | |
31 | int ret, is_missing; | |
32 | khiter_t k; | |
33 | khash_t(32) *h = kh_init(32); | |
34 | k = kh_put(32, h, 5, &ret); | |
35 | kh_value(h, k) = 10; | |
36 | k = kh_get(32, h, 10); | |
37 | is_missing = (k == kh_end(h)); | |
38 | k = kh_get(32, h, 5); | |
39 | kh_del(32, h, k); | |
40 | for (k = kh_begin(h); k != kh_end(h); ++k) | |
41 | if (kh_exist(h, k)) kh_value(h, k) = 1; | |
42 | kh_destroy(32, h); | |
43 | return 0; | |
44 | } | |
45 | */ | |
46 | ||
47 | /* | |
48 | 2013-05-02 (0.2.8): | |
49 | ||
50 | * Use quadratic probing. When the capacity is power of 2, stepping function | |
51 | i*(i+1)/2 guarantees to traverse each bucket. It is better than double | |
52 | hashing on cache performance and is more robust than linear probing. | |
53 | ||
54 | In theory, double hashing should be more robust than quadratic probing. | |
55 | However, my implementation is probably not for large hash tables, because | |
56 | the second hash function is closely tied to the first hash function, | |
57 | which reduce the effectiveness of double hashing. | |
58 | ||
59 | Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php | |
60 | ||
61 | 2011-12-29 (0.2.7): | |
62 | ||
63 | * Minor code clean up; no actual effect. | |
64 | ||
65 | 2011-09-16 (0.2.6): | |
66 | ||
67 | * The capacity is a power of 2. This seems to dramatically improve the | |
68 | speed for simple keys. Thank Zilong Tan for the suggestion. Reference: | |
69 | ||
70 | - http://code.google.com/p/ulib/ | |
71 | - http://nothings.org/computer/judy/ | |
72 | ||
73 | * Allow to optionally use linear probing which usually has better | |
74 | performance for random input. Double hashing is still the default as it | |
75 | is more robust to certain non-random input. | |
76 | ||
77 | * Added Wang's integer hash function (not used by default). This hash | |
78 | function is more robust to certain non-random input. | |
79 | ||
80 | 2011-02-14 (0.2.5): | |
81 | ||
82 | * Allow to declare global functions. | |
83 | ||
84 | 2009-09-26 (0.2.4): | |
85 | ||
86 | * Improve portability | |
87 | ||
88 | 2008-09-19 (0.2.3): | |
89 | ||
90 | * Corrected the example | |
91 | * Improved interfaces | |
92 | ||
93 | 2008-09-11 (0.2.2): | |
94 | ||
95 | * Improved speed a little in kh_put() | |
96 | ||
97 | 2008-09-10 (0.2.1): | |
98 | ||
99 | * Added kh_clear() | |
100 | * Fixed a compiling error | |
101 | ||
102 | 2008-09-02 (0.2.0): | |
103 | ||
104 | * Changed to token concatenation which increases flexibility. | |
105 | ||
106 | 2008-08-31 (0.1.2): | |
107 | ||
108 | * Fixed a bug in kh_get(), which has not been tested previously. | |
109 | ||
110 | 2008-08-31 (0.1.1): | |
111 | ||
112 | * Added destructor | |
113 | */ | |
114 | ||
115 | ||
116 | #ifndef __AC_KHASH_H | |
117 | #define __AC_KHASH_H | |
118 | ||
119 | /*! | |
120 | @header | |
121 | ||
122 | Generic hash table library. | |
123 | */ | |
124 | ||
125 | #define AC_VERSION_KHASH_H "0.2.8" | |
126 | ||
127 | #include <stdlib.h> | |
128 | #include <string.h> | |
129 | #include <limits.h> | |
130 | ||
131 | /* compiler specific configuration */ | |
132 | ||
133 | #if UINT_MAX == 0xffffffffu | |
134 | typedef unsigned int khint32_t; | |
135 | #elif ULONG_MAX == 0xffffffffu | |
136 | typedef unsigned long khint32_t; | |
137 | #endif | |
138 | ||
139 | #if ULONG_MAX == ULLONG_MAX | |
140 | typedef unsigned long khint64_t; | |
141 | #else | |
142 | typedef unsigned long long khint64_t; | |
143 | #endif | |
144 | ||
145 | #ifndef kh_inline | |
146 | #ifdef _MSC_VER | |
147 | #define kh_inline __inline | |
148 | #else | |
149 | #define kh_inline inline | |
150 | #endif | |
151 | #endif /* kh_inline */ | |
152 | ||
153 | typedef khint32_t khint_t; | |
154 | typedef khint_t khiter_t; | |
155 | ||
/*
 * Bucket state flags. Every bucket owns two bits in an array of 32-bit
 * words, sixteen buckets per word: bit 1 of the pair means "empty" and
 * bit 0 means "deleted". A freshly initialised table fills the words with
 * 0xaa bytes, i.e. every bucket starts out empty and not deleted.
 *
 * The test macros return a non-zero value (not necessarily 1) when the
 * condition holds. Arguments are fully parenthesised, so any expression may
 * be passed, but `i` is evaluated twice and must have no side effects.
 */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

/* Number of 32-bit flag words needed for m buckets (at least one). */
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
165 | ||
166 | #ifndef kroundup32 | |
167 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
168 | #endif | |
169 | ||
170 | #ifndef kcalloc | |
171 | #define kcalloc(N,Z) calloc(N,Z) | |
172 | #endif | |
173 | #ifndef kmalloc | |
174 | #define kmalloc(Z) malloc(Z) | |
175 | #endif | |
176 | #ifndef krealloc | |
177 | #define krealloc(P,Z) realloc(P,Z) | |
178 | #endif | |
179 | #ifndef kfree | |
180 | #define kfree(P) free(P) | |
181 | #endif | |
182 | ||
183 | static const double __ac_HASH_UPPER = 0.77; | |
184 | ||
/*
 * Declare the hash table structure kh_<name>_t.
 *   n_buckets    capacity of the keys/vals arrays
 *   size         number of stored entries
 *   n_occupied   reset to 0 together with size by kh_clear
 *   upper_bound  NOTE(review): presumably the occupancy limit that triggers
 *                a resize; its use is not visible in this part of the file
 *   flags        two state bits per bucket (see the __ac_* macros)
 *   keys, vals   key and value arrays
 */
#define __KHASH_TYPE(name, khkey_t, khval_t) \
	typedef struct kh_##name##_s { \
		khint_t n_buckets, size, n_occupied, upper_bound; \
		khint32_t *flags; \
		khkey_t *keys; \
		khval_t *vals; \
	} kh_##name##_t;
192 | ||
/* Declare, with external linkage, the functions operating on kh_<name>_t:
 * init, destroy, clear, get, resize, put and del. */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
	extern kh_##name##_t *kh_init_##name(void); \
	extern void kh_destroy_##name(kh_##name##_t *h); \
	extern void kh_clear_##name(kh_##name##_t *h); \
	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
201 | ||
/*
 * __KHASH_IMPL emits the function definitions for one hash-table
 * instantiation: kh_{init,destroy,clear,get,resize,put,del}_<name>.
 *
 *   name          suffix pasted into every generated identifier
 *   SCOPE         storage-class / linkage prefix placed before each function
 *   khkey_t       key type
 *   khval_t       value type (only stored when kh_is_map is non-zero)
 *   kh_is_map     non-zero to maintain the vals array alongside keys
 *   __hash_func   callable mapping a key to a khint_t hash
 *   __hash_equal  callable returning non-zero when two keys are equal
 *
 * Probing in get/put/resize advances by 1, 2, 3, ... slots, masked to the
 * power-of-two bucket count.
 */
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	/* Allocate a zero-filled table header; returns the kcalloc result as is. */ \
	SCOPE kh_##name##_t *kh_init_##name(void) {							\
		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
	}																	\
	/* Release the key, flag and value arrays and the header; no-op for a null h. */ \
	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
	{																	\
		if (h) {														\
			kfree((void *)h->keys); kfree(h->flags);					\
			kfree((void *)h->vals);										\
			kfree(h);													\
		}																\
	}																	\
	/* Reset the flag words to the 0xaa fill used for fresh tables and zero the counters; arrays are kept. */ \
	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
	{																	\
		if (h && h->flags) {											\
			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
			h->size = h->n_occupied = 0;								\
		}																\
	}																	\
	/* Look up key; returns its bucket index, or h->n_buckets when absent (0 for a table with no buckets). */ \
	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key)	\
	{																	\
		if (h->n_buckets) {												\
			khint_t k, i, last, mask, step = 0;							\
			mask = h->n_buckets - 1;									\
			k = __hash_func(key); i = k & mask;							\
			last = i;													\
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				i = (i + (++step)) & mask;								\
				if (i == last) return h->n_buckets;	/* probed the full cycle */ \
			}															\
			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
		} else return 0;												\
	}																	\
	/* Rehash into new_n_buckets (rounded up to a power of two, minimum 4). */ \
	/* Returns 0 on success or when the request is too small to hold the current entries, -1 when an allocation fails. */ \
	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
		khint32_t *new_flags = 0;										\
		khint_t j = 1;													\
		{																\
			kroundup32(new_n_buckets);									\
			if (new_n_buckets < 4) new_n_buckets = 4;					\
			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
			else { /* hash table size to be changed (shrink or expand); rehash */ \
				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (!new_flags) return -1;								\
				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (h->n_buckets < new_n_buckets) {	/* expand */		\
					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (!new_keys) { kfree(new_flags); return -1; }		\
					h->keys = new_keys;									\
					if (kh_is_map) {									\
						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
						if (!new_vals) { kfree(new_flags); return -1; }	\
						h->vals = new_vals;								\
					}													\
				} /* otherwise shrink */								\
			}															\
		}																\
		if (j) { /* rehashing is needed */								\
			for (j = 0; j != h->n_buckets; ++j) {						\
				if (__ac_iseither(h->flags, j) == 0) {					\
					khkey_t key = h->keys[j];							\
					khval_t val;										\
					khint_t new_mask;									\
					new_mask = new_n_buckets - 1;						\
					if (kh_is_map) val = h->vals[j];					\
					__ac_set_isdel_true(h->flags, j);					\
					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
						khint_t k, i, step = 0;							\
						k = __hash_func(key);							\
						i = k & new_mask;								\
						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
						__ac_set_isempty_false(new_flags, i);			\
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
						} else { /* write the element and jump out of the loop */ \
							h->keys[i] = key;							\
							if (kh_is_map) h->vals[i] = val;			\
							break;										\
						}												\
					}													\
				}														\
			}															\
			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
				/* A failed shrinking realloc leaves the old, larger array valid: keep it instead of storing NULL. */ \
				khkey_t *shrunk_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (shrunk_keys) h->keys = shrunk_keys;					\
				if (kh_is_map) {										\
					khval_t *shrunk_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
					if (shrunk_vals) h->vals = shrunk_vals;				\
				}														\
			}															\
			kfree(h->flags); /* free the working space */				\
			h->flags = new_flags;										\
			h->n_buckets = new_n_buckets;								\
			h->n_occupied = h->size;									\
			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
		}																\
		return 0;														\
	}																	\
	/* Insert key. *ret is set to -1 if a resize failed, 0 if the key was already present, */ \
	/* 1 if a never-used bucket was taken, 2 if a deleted bucket was reused. Returns the bucket index */ \
	/* (h->n_buckets on failure). */									\
	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{																	\
		khint_t x;														\
		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
			if (h->n_buckets > (h->size<<1)) {							\
				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
					*ret = -1; return h->n_buckets;						\
				}														\
			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
				*ret = -1; return h->n_buckets;							\
			}															\
		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
		{																\
			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
			else {														\
				last = i;												\
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					if (__ac_isdel(h->flags, i)) site = i;	/* remember a reusable deleted slot */ \
					i = (i + (++step)) & mask;							\
					if (i == last) { x = site; break; }					\
				}														\
				if (x == h->n_buckets) {								\
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i;											\
				}														\
			}															\
		}																\
		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
			h->keys[x] = key;											\
			__ac_set_isboth_false(h->flags, x);							\
			++h->size; ++h->n_occupied;									\
			*ret = 1;													\
		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
			h->keys[x] = key;											\
			__ac_set_isboth_false(h->flags, x);							\
			++h->size;													\
			*ret = 2;													\
		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
		return x;														\
	}																	\
	/* Mark bucket x deleted; ignored when x is the end index or the bucket holds no live entry. */ \
	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
	{																	\
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
			__ac_set_isdel_true(h->flags, x);							\
			--h->size;													\
		}																\
	}
347 | ||
348 | #define KHASH_DECLARE(name, khkey_t, khval_t) \ | |
349 | __KHASH_TYPE(name, khkey_t, khval_t) \ | |
350 | __KHASH_PROTOTYPES(name, khkey_t, khval_t) | |
351 | ||
352 | #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ | |
353 | __KHASH_TYPE(name, khkey_t, khval_t) \ | |
354 | __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) | |
355 | ||
356 | #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ | |
357 | KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) | |
358 | ||
359 | /* --- BEGIN OF HASH FUNCTIONS --- */ | |
360 | ||
361 | /*! @function | |
362 | @abstract Integer hash function | |
363 | @param key The integer [khint32_t] | |
364 | @return The hash value [khint_t] | |
365 | */ | |
366 | #define kh_int_hash_func(key) (khint32_t)(key) | |
367 | /*! @function | |
368 | @abstract Integer comparison function | |
369 | */ | |
370 | #define kh_int_hash_equal(a, b) ((a) == (b)) | |
371 | /*! @function | |
372 | @abstract 64-bit integer hash function | |
373 | @param key The integer [khint64_t] | |
374 | @return The hash value [khint_t] | |
375 | */ | |
376 | #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) | |
377 | /*! @function | |
378 | @abstract 64-bit integer comparison function | |
379 | */ | |
380 | #define kh_int64_hash_equal(a, b) ((a) == (b)) | |
381 | /*! @function | |
382 | @abstract const char* hash function | |
383 | @param s Pointer to a null terminated string | |
384 | @return The hash value | |
385 | */ | |
386 | static kh_inline khint_t __ac_X31_hash_string(const char *s) | |
387 | { | |
388 | khint_t h = (khint_t)*s; | |
389 | if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; | |
390 | return h; | |
391 | } | |
392 | /*! @function | |
393 | @abstract Another interface to const char* hash function | |
394 | @param key Pointer to a null terminated string [const char*] | |
395 | @return The hash value [khint_t] | |
396 | */ | |
397 | #define kh_str_hash_func(key) __ac_X31_hash_string(key) | |
398 | /*! @function | |
399 | @abstract Const char* comparison function | |
400 | */ | |
401 | #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) | |
402 | ||
403 | static kh_inline khint_t __ac_Wang_hash(khint_t key) | |
404 | { | |
405 | key += ~(key << 15); | |
406 | key ^= (key >> 10); | |
407 | key += (key << 3); | |
408 | key ^= (key >> 6); | |
409 | key += ~(key << 11); | |
410 | key ^= (key >> 16); | |
411 | return key; | |
412 | } | |
/* Alternative integer hash: passes the argument through __ac_Wang_hash.
 * The expansion now uses the macro's own parameter (it previously named an
 * unrelated identifier `key`, which only compiled when such a variable was
 * in scope at the call site). */
#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k))
414 | ||
415 | /* --- END OF HASH FUNCTIONS --- */ | |
416 | ||
417 | /* Other convenient macros... */ | |
418 | ||
419 | /*! | |
420 | @abstract Type of the hash table. | |
421 | @param name Name of the hash table [symbol] | |
422 | */ | |
423 | #define khash_t(name) kh_##name##_t | |
424 | ||
425 | /*! @function | |
426 | @abstract Initiate a hash table. | |
427 | @param name Name of the hash table [symbol] | |
428 | @return Pointer to the hash table [khash_t(name)*] | |
429 | */ | |
430 | #define kh_init(name) kh_init_##name() | |
431 | ||
432 | /*! @function | |
433 | @abstract Destroy a hash table. | |
434 | @param name Name of the hash table [symbol] | |
435 | @param h Pointer to the hash table [khash_t(name)*] | |
436 | */ | |
437 | #define kh_destroy(name, h) kh_destroy_##name(h) | |
438 | ||
439 | /*! @function | |
440 | @abstract Reset a hash table without deallocating memory. | |
441 | @param name Name of the hash table [symbol] | |
442 | @param h Pointer to the hash table [khash_t(name)*] | |
443 | */ | |
444 | #define kh_clear(name, h) kh_clear_##name(h) | |
445 | ||
446 | /*! @function | |
447 | @abstract Resize a hash table. | |
448 | @param name Name of the hash table [symbol] | |
449 | @param h Pointer to the hash table [khash_t(name)*] | |
450 | @param s New size [khint_t] | |
451 | */ | |
452 | #define kh_resize(name, h, s) kh_resize_##name(h, s) | |
453 | ||
454 | /*! @function | |
455 | @abstract Insert a key to the hash table. | |
456 | @param name Name of the hash table [symbol] | |
457 | @param h Pointer to the hash table [khash_t(name)*] | |
458 | @param k Key [type of keys] | |
459 | @param r Extra return code: -1 if the operation failed; | |
460 | 0 if the key is present in the hash table; | |
461 | 1 if the bucket is empty (never used); 2 if the element in | |
462 | the bucket has been deleted [int*] | |
463 | @return Iterator to the inserted element [khint_t] | |
464 | */ | |
465 | #define kh_put(name, h, k, r) kh_put_##name(h, k, r) | |
466 | ||
467 | /*! @function | |
468 | @abstract Retrieve a key from the hash table. | |
469 | @param name Name of the hash table [symbol] | |
470 | @param h Pointer to the hash table [khash_t(name)*] | |
471 | @param k Key [type of keys] | |
472 | @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] | |
473 | */ | |
474 | #define kh_get(name, h, k) kh_get_##name(h, k) | |
475 | ||
476 | /*! @function | |
477 | @abstract Remove a key from the hash table. | |
478 | @param name Name of the hash table [symbol] | |
479 | @param h Pointer to the hash table [khash_t(name)*] | |
480 | @param k Iterator to the element to be deleted [khint_t] | |
481 | */ | |
482 | #define kh_del(name, h, k) kh_del_##name(h, k) | |
483 | ||
484 | /*! @function | |
485 | @abstract Test whether a bucket contains data. | |
486 | @param h Pointer to the hash table [khash_t(name)*] | |
487 | @param x Iterator to the bucket [khint_t] | |
488 | @return 1 if containing data; 0 otherwise [int] | |
489 | */ | |
490 | #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) | |
491 | ||
492 | /*! @function | |
493 | @abstract Get key given an iterator | |
494 | @param h Pointer to the hash table [khash_t(name)*] | |
495 | @param x Iterator to the bucket [khint_t] | |
496 | @return Key [type of keys] | |
497 | */ | |
498 | #define kh_key(h, x) ((h)->keys[x]) | |
499 | ||
500 | /*! @function | |
501 | @abstract Get value given an iterator | |
502 | @param h Pointer to the hash table [khash_t(name)*] | |
503 | @param x Iterator to the bucket [khint_t] | |
504 | @return Value [type of values] | |
505 | @discussion For hash sets, calling this results in segfault. | |
506 | */ | |
507 | #define kh_val(h, x) ((h)->vals[x]) | |
508 | ||
509 | /*! @function | |
510 | @abstract Alias of kh_val() | |
511 | */ | |
512 | #define kh_value(h, x) ((h)->vals[x]) | |
513 | ||
514 | /*! @function | |
515 | @abstract Get the start iterator | |
516 | @param h Pointer to the hash table [khash_t(name)*] | |
517 | @return The start iterator [khint_t] | |
518 | */ | |
519 | #define kh_begin(h) (khint_t)(0) | |
520 | ||
521 | /*! @function | |
522 | @abstract Get the end iterator | |
523 | @param h Pointer to the hash table [khash_t(name)*] | |
524 | @return The end iterator [khint_t] | |
525 | */ | |
526 | #define kh_end(h) ((h)->n_buckets) | |
527 | ||
528 | /*! @function | |
529 | @abstract Get the number of elements in the hash table | |
530 | @param h Pointer to the hash table [khash_t(name)*] | |
531 | @return Number of elements in the hash table [khint_t] | |
532 | */ | |
533 | #define kh_size(h) ((h)->size) | |
534 | ||
535 | /*! @function | |
536 | @abstract Get the number of buckets in the hash table | |
537 | @param h Pointer to the hash table [khash_t(name)*] | |
538 | @return Number of buckets in the hash table [khint_t] | |
539 | */ | |
540 | #define kh_n_buckets(h) ((h)->n_buckets) | |
541 | ||
542 | /*! @function | |
543 | @abstract Iterate over the entries in the hash table | |
544 | @param h Pointer to the hash table [khash_t(name)*] | |
545 | @param kvar Variable to which key will be assigned | |
546 | @param vvar Variable to which value will be assigned | |
547 | @param code Block of code to execute | |
548 | */ | |
/* Visit every bucket for which kh_exist() holds, assigning its key and value
 * to kvar / vvar before running code.  The iterator is named __i. */
#define kh_foreach(h, kvar, vvar, code) {							\
	khint_t __i;													\
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {				\
		if (kh_exist(h,__i)) {										\
			(kvar) = kh_key(h,__i);									\
			(vvar) = kh_val(h,__i);									\
			code;													\
		}															\
	} }
556 | ||
557 | /*! @function | |
558 | @abstract Iterate over the values in the hash table | |
559 | @param h Pointer to the hash table [khash_t(name)*] | |
560 | @param vvar Variable to which value will be assigned | |
561 | @param code Block of code to execute | |
562 | */ | |
/* Visit every bucket for which kh_exist() holds, assigning its value to vvar
 * before running code.  The iterator is named __i. */
#define kh_foreach_value(h, vvar, code) {							\
	khint_t __i;													\
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {				\
		if (kh_exist(h,__i)) {										\
			(vvar) = kh_val(h,__i);									\
			code;													\
		}															\
	} }
569 | ||
570 | /* More convenient interfaces */ | |
571 | ||
572 | /*! @function | |
573 | @abstract Instantiate a hash set containing integer keys | |
574 | @param name Name of the hash table [symbol] | |
575 | */ | |
576 | #define KHASH_SET_INIT_INT(name) \ | |
577 | KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) | |
578 | ||
579 | /*! @function | |
580 | @abstract Instantiate a hash map containing integer keys | |
581 | @param name Name of the hash table [symbol] | |
582 | @param khval_t Type of values [type] | |
583 | */ | |
584 | #define KHASH_MAP_INIT_INT(name, khval_t) \ | |
585 | KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) | |
586 | ||
587 | /*! @function | |
588 | @abstract Instantiate a hash map containing 64-bit integer keys | |
589 | @param name Name of the hash table [symbol] | |
590 | */ | |
591 | #define KHASH_SET_INIT_INT64(name) \ | |
592 | KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) | |
593 | ||
594 | /*! @function | |
595 | @abstract Instantiate a hash map containing 64-bit integer keys | |
596 | @param name Name of the hash table [symbol] | |
597 | @param khval_t Type of values [type] | |
598 | */ | |
599 | #define KHASH_MAP_INIT_INT64(name, khval_t) \ | |
600 | KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) | |
601 | ||
602 | typedef const char *kh_cstr_t; | |
603 | /*! @function | |
604 | @abstract Instantiate a hash map containing const char* keys | |
605 | @param name Name of the hash table [symbol] | |
606 | */ | |
607 | #define KHASH_SET_INIT_STR(name) \ | |
608 | KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) | |
609 | ||
610 | /*! @function | |
611 | @abstract Instantiate a hash map containing const char* keys | |
612 | @param name Name of the hash table [symbol] | |
613 | @param khval_t Type of values [type] | |
614 | */ | |
615 | #define KHASH_MAP_INIT_STR(name, khval_t) \ | |
616 | KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) | |
617 | ||
618 | #endif /* __AC_KHASH_H */ |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* Last Modified: 05MAR2012 */ | |
26 | ||
27 | #ifndef AC_KSEQ_H | |
28 | #define AC_KSEQ_H | |
29 | ||
30 | #include <ctype.h> | |
31 | #include <string.h> | |
32 | #include <stdlib.h> | |
33 | ||
34 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r | |
35 | #define KS_SEP_TAB 1 // isspace() && !' ' | |
36 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) | |
37 | #define KS_SEP_MAX 2 | |
38 | ||
39 | #define __KS_TYPE(type_t) \ | |
40 | typedef struct __kstream_t { \ | |
41 | int begin, end; \ | |
42 | int is_eof:2, bufsize:30; \ | |
43 | type_t f; \ | |
44 | unsigned char *buf; \ | |
45 | } kstream_t; | |
46 | ||
47 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) | |
48 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) | |
49 | ||
/*
 * __KS_BASIC emits the stream constructor and destructor.
 *   ks_init(f)     allocates a zeroed kstream_t wrapping handle f with a
 *                  read buffer of __bufsize bytes; returns NULL when either
 *                  allocation fails (nothing is leaked in that case).
 *   ks_destroy(ks) frees the buffer and the stream; a null ks is ignored.
 *                  The wrapped handle f is not touched.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize)								\
	SCOPE kstream_t *ks_init(type_t f)										\
	{																		\
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));			\
		if (!ks) return 0;													\
		ks->buf = (unsigned char*)malloc(__bufsize);						\
		if (!ks->buf) { free(ks); return 0; }								\
		ks->f = f; ks->bufsize = __bufsize;									\
		return ks;															\
	}																		\
	SCOPE void ks_destroy(kstream_t *ks)									\
	{																		\
		if (!ks) return;													\
		free(ks->buf);														\
		free(ks);															\
	}
64 | ||
/*
 * __KS_INLINED emits the two inline helpers built on top of __read.
 *   ks_getc(ks)      returns the next byte as a non-negative int, refilling
 *                    the buffer when it is drained; returns -1 once the
 *                    stream is exhausted.  A zero or negative __read result
 *                    (end of data or read error) is treated as exhaustion,
 *                    so no stale buffer byte is ever returned.
 *   ks_getuntil(...) forwards to ks_getuntil2() with append = 0.
 */
#define __KS_INLINED(__read)												\
	static inline int ks_getc(kstream_t *ks)								\
	{																		\
		if (ks->begin >= ks->end) {											\
			if (ks->is_eof) return -1;										\
			ks->begin = 0;													\
			ks->end = __read(ks->f, ks->buf, ks->bufsize);					\
			if (ks->end < ks->bufsize) ks->is_eof = 1;	/* short read: no more refills */ \
			if (ks->end <= 0) { ks->end = 0; return -1; }					\
		}																	\
		return (int)ks->buf[ks->begin++];									\
	}																		\
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
79 | ||
80 | #ifndef KSTRING_T | |
81 | #define KSTRING_T kstring_t | |
82 | typedef struct __kstring_t { | |
83 | unsigned l, m; | |
84 | char *s; | |
85 | } kstring_t; | |
86 | #endif | |
87 | ||
88 | #ifndef kroundup32 | |
89 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
90 | #endif | |
91 | ||
/*
 * __KS_GETUNTIL emits ks_getuntil2(): copy bytes from the stream into str up
 * to (not including) the next delimiter.
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal byte value
 *              greater than KS_SEP_MAX
 *   str        destination; truncated first unless append is non-zero
 *   dret       if non-null, receives the delimiter byte that ended the field,
 *              or 0 when the data ran out first
 * Returns the resulting length of str, or -1 when the stream was already
 * exhausted on entry.  str is always NUL-terminated on a non-negative return;
 * for KS_SEP_LINE a trailing '\r' is dropped when the length exceeds 1.
 */
#define __KS_GETUNTIL(SCOPE, __read)										\
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{																		\
		if (dret) *dret = 0;												\
		if (!append) str->l = 0;											\
		if (ks->is_eof && ks->begin >= ks->end) return -1;					\
		for (;;) {															\
			int stop, chunk;												\
			if (ks->begin >= ks->end) {	/* buffer drained: refill or stop */ \
				if (ks->is_eof) break;										\
				ks->begin = 0;												\
				ks->end = __read(ks->f, ks->buf, ks->bufsize);				\
				if (ks->end < ks->bufsize) ks->is_eof = 1;					\
				if (ks->end == 0) break;									\
			}																\
			switch (delimiter) {	/* find the first delimiter in the buffered data */ \
			case KS_SEP_LINE:												\
				for (stop = ks->begin; stop < ks->end; ++stop)				\
					if (ks->buf[stop] == '\n') break;						\
				break;														\
			case KS_SEP_SPACE:												\
				for (stop = ks->begin; stop < ks->end; ++stop)				\
					if (isspace(ks->buf[stop])) break;						\
				break;														\
			case KS_SEP_TAB:												\
				for (stop = ks->begin; stop < ks->end; ++stop)				\
					if (isspace(ks->buf[stop]) && ks->buf[stop] != ' ') break; \
				break;														\
			default:														\
				if (delimiter > KS_SEP_MAX) {								\
					for (stop = ks->begin; stop < ks->end; ++stop)			\
						if (ks->buf[stop] == delimiter) break;				\
				} else stop = 0; /* never come to here! */					\
				break;														\
			}																\
			chunk = stop - ks->begin;										\
			if (str->m - str->l < (size_t)(chunk + 1)) {	/* grow, keeping room for the terminator */ \
				str->m = str->l + chunk + 1;								\
				kroundup32(str->m);											\
				str->s = (char*)realloc(str->s, str->m);					\
			}																\
			memcpy(str->s + str->l, ks->buf + ks->begin, chunk);			\
			str->l += chunk;												\
			ks->begin = stop + 1;											\
			if (stop < ks->end) {	/* a delimiter was found in this buffer */ \
				if (dret) *dret = ks->buf[stop];							\
				break;														\
			}																\
		}																	\
		if (str->s == 0) {													\
			str->m = 1;														\
			str->s = (char*)calloc(1, 1);									\
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') { \
			--str->l;														\
		}																	\
		str->s[str->l] = '\0';												\
		return str->l;														\
	}
141 | ||
142 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ | |
143 | __KS_TYPE(type_t) \ | |
144 | __KS_BASIC(SCOPE, type_t, __bufsize) \ | |
145 | __KS_GETUNTIL(SCOPE, __read) \ | |
146 | __KS_INLINED(__read) | |
147 | ||
148 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) | |
149 | ||
150 | #define KSTREAM_DECLARE(type_t, __read) \ | |
151 | __KS_TYPE(type_t) \ | |
152 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ | |
153 | extern kstream_t *ks_init(type_t f); \ | |
154 | extern void ks_destroy(kstream_t *ks); \ | |
155 | __KS_INLINED(__read) | |
156 | ||
157 | /****************** | |
158 | * FASTA/Q parser * | |
159 | ******************/ | |
160 | ||
161 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) | |
162 | ||
/*
 * __KSEQ_BASIC emits the FASTA/Q reader constructor and destructor.
 *   kseq_init(fd)     allocates a zeroed kseq_t and a stream over fd via
 *                     ks_init(); returns NULL if the record or its stream
 *                     cannot be allocated.
 *   kseq_destroy(ks)  frees the four string buffers, the stream and the
 *                     record; a null ks is ignored.
 */
#define __KSEQ_BASIC(SCOPE, type_t)											\
	SCOPE kseq_t *kseq_init(type_t fd)										\
	{																		\
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));						\
		if (!s) return 0;													\
		s->f = ks_init(fd);													\
		if (!s->f) { free(s); return 0; }									\
		return s;															\
	}																		\
	SCOPE void kseq_destroy(kseq_t *ks)										\
	{																		\
		if (!ks) return;													\
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f);													\
		free(ks);															\
	}
177 | ||
178 | /* Return value: | |
179 | >=0 length of the sequence (normal) | |
180 | -1 end-of-file | |
181 | -2 truncated quality string | |
182 | */ | |
/*
 * __KSEQ_READ emits kseq_read(): parse the next FASTA or FASTQ record into
 * seq->name, seq->comment, seq->seq and (FASTQ only) seq->qual.
 * Returns the sequence length (>= 0), -1 at end of file, or -2 when the
 * quality string is missing or its length differs from the sequence.
 */
#define __KSEQ_READ(SCOPE)													\
	SCOPE int kseq_read(kseq_t *seq)										\
	{																		\
		int ch;																\
		kstream_t *in = seq->f;												\
		if (seq->last_char == 0) { /* no header marker pending: scan forward to one */ \
			do {															\
				ch = ks_getc(in);											\
			} while (ch != -1 && ch != '>' && ch != '@');					\
			if (ch == -1) return -1; /* end of file */						\
			seq->last_char = ch;											\
		} /* otherwise the marker was consumed by the previous call */		\
		seq->comment.l = 0;	/* reset all members */							\
		seq->seq.l = 0;														\
		seq->qual.l = 0;													\
		if (ks_getuntil(in, KS_SEP_SPACE, &seq->name, &ch) < 0) return -1; /* normal exit: EOF */ \
		if (ch != '\n') ks_getuntil(in, KS_SEP_LINE, &seq->comment, 0); /* rest of the header line */ \
		if (seq->seq.s == 0) { /* first use: start with a 256-byte buffer */ \
			seq->seq.m = 256;												\
			seq->seq.s = (char*)malloc(seq->seq.m);							\
		}																	\
		for (;;) { /* one iteration per sequence line */					\
			ch = ks_getc(in);												\
			if (ch == -1 || ch == '>' || ch == '+' || ch == '@') break;		\
			if (ch == '\n') continue; /* skip empty lines */				\
			seq->seq.s[seq->seq.l++] = ch; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(in, KS_SEP_LINE, &seq->seq, 0, 1); /* append the rest of the line */ \
		}																	\
		if (ch == '>' || ch == '@') seq->last_char = ch; /* next record's marker already read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* make room for the terminator below */ \
			seq->seq.m = seq->seq.l + 2;									\
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */	\
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m);			\
		}																	\
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */			\
		if (ch != '+') return seq->seq.l; /* FASTA */						\
		if (seq->qual.m < seq->seq.m) { /* qual buffer at least as large as seq buffer */ \
			seq->qual.m = seq->seq.m;										\
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);			\
		}																	\
		do { /* skip the rest of the '+' line */							\
			ch = ks_getc(in);												\
		} while (ch != -1 && ch != '\n');									\
		if (ch == -1) return -2; /* error: no quality string */				\
		for (;;) { /* append quality lines until as long as the sequence */	\
			if (ks_getuntil2(in, KS_SEP_LINE, &seq->qual, 0, 1) < 0) break;	\
			if (!(seq->qual.l < seq->seq.l)) break;							\
		}																	\
		seq->last_char = 0;	/* we have not come to the next header line */	\
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l;													\
	}
224 | ||
225 | #define __KSEQ_TYPE(type_t) \ | |
226 | typedef struct { \ | |
227 | kstring_t name, comment, seq, qual; \ | |
228 | int last_char; \ | |
229 | kstream_t *f; \ | |
230 | } kseq_t; | |
231 | ||
232 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ | |
233 | KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ | |
234 | __KSEQ_TYPE(type_t) \ | |
235 | __KSEQ_BASIC(SCOPE, type_t) \ | |
236 | __KSEQ_READ(SCOPE) | |
237 | ||
238 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) | |
239 | ||
240 | #define KSEQ_DECLARE(type_t) \ | |
241 | __KS_TYPE(type_t) \ | |
242 | __KSEQ_TYPE(type_t) \ | |
243 | extern kseq_t *kseq_init(type_t fd); \ | |
244 | void kseq_destroy(kseq_t *ks); \ | |
245 | int kseq_read(kseq_t *seq); | |
246 | ||
247 | #endif |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | // This is a simplified version of ksort.h | |
26 | ||
27 | #ifndef AC_KSORT_H | |
28 | #define AC_KSORT_H | |
29 | ||
30 | #include <stdlib.h> | |
31 | #include <string.h> | |
32 | ||
33 | typedef struct { | |
34 | void *left, *right; | |
35 | int depth; | |
36 | } ks_isort_stack_t; | |
37 | ||
/* Exchange the values of lvalues a and b through a temporary of type type_t.
 * Wrapped in do { } while (0) so that `KSORT_SWAP(...);` is a single
 * statement and can sit between `if` and `else`. */
#define KSORT_SWAP(type_t, a, b) do { type_t t = (a); (a) = (b); (b) = t; } while (0)
39 | ||
/*
 * KSORT_INIT emits two functions for element type type_t ordered by
 * __sort_lt(x, y) (non-zero when x sorts before y):
 *
 *   ks_lis_<name>(n, a, b, _p)
 *       Longest strictly increasing subsequence of a[0..n).  b must hold n
 *       entries and receives the indices of the subsequence in order; _p is
 *       an optional n-entry scratch array (allocated and freed internally
 *       when null).  Returns the subsequence length; returns 0 when n is 0
 *       or the internal scratch allocation fails.
 *
 *   ks_ksmall_<name>(n, arr, kk)
 *       Quickselect: partially reorders arr[0..n) in place and returns the
 *       element that would be at index kk in sorted order.  The caller must
 *       pass n > 0 and kk < n; neither is checked.
 */
#define KSORT_INIT(name, type_t, __sort_lt)								\
	size_t ks_lis_##name(size_t n, const type_t *a, size_t *b, size_t *_p) \
	{ /* translated from: http://www.algorithmist.com/index.php/Longest_Increasing_Subsequence.cpp */ \
		size_t i, u, v, *top = b, *p;									\
		if (n == 0) return 0;											\
		p = _p? _p : (size_t*)malloc(n * sizeof(size_t));				\
		if (!p) return 0;	/* scratch allocation failed */				\
		*top++ = 0;														\
		for (i = 1; i < n; i++) {										\
			if (__sort_lt(a[*(top-1)], a[i])) {	/* extends the longest chain */ \
				p[i] = *(top-1);										\
				*top++ = i;												\
				continue;												\
			}															\
			for (u = 0, v = top - b - 1; u < v;) {	/* binary search over chain tails */ \
				size_t c = (u + v) >> 1;								\
				if (__sort_lt(a[b[c]], a[i])) u = c + 1;				\
				else v = c;												\
			}															\
			if (__sort_lt(a[i], a[b[u]])) {								\
				if (u > 0) p[i] = b[u-1];								\
				b[u] = i;												\
			}															\
		}																\
		/* Follow predecessor links back from the last tail; the first */ \
		/* element has no predecessor, so its p[] slot is never read. */ \
		u = top - b; v = *(top-1);										\
		while (u--) {													\
			b[u] = v;													\
			if (u) v = p[v];											\
		}																\
		if (!_p) free(p);												\
		return top - b;													\
	}																	\
	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk)			\
	{																	\
		type_t *low, *high, *k, *ll, *hh, *mid;							\
		low = arr; high = arr + n - 1; k = arr + kk;					\
		for (;;) {														\
			if (high <= low) return *k;									\
			if (high == low + 1) {										\
				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
				return *k;												\
			}															\
			mid = low + (high - low) / 2;								\
			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);	\
			KSORT_SWAP(type_t, *mid, *(low+1));							\
			ll = low + 1; hh = high;									\
			for (;;) {													\
				do ++ll; while (__sort_lt(*ll, *low));					\
				do --hh; while (__sort_lt(*low, *hh));					\
				if (hh < ll) break;										\
				KSORT_SWAP(type_t, *ll, *hh);							\
			}															\
			KSORT_SWAP(type_t, *low, *hh);								\
			if (hh <= k) low = ll;										\
			if (hh >= k) high = hh - 1;									\
		}																\
	}
94 | ||
95 | #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) | |
96 | ||
97 | #define ks_lt_generic(a, b) ((a) < (b)) | |
98 | #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) | |
99 | ||
100 | typedef const char *ksstr_t; | |
101 | ||
102 | #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) | |
103 | #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) | |
104 | ||
105 | #define RS_MIN_SIZE 64 | |
106 | ||
/*
 * KRADIX_SORT_INIT emits an in-place radix sort for rstype_t elements whose
 * integer key is obtained with rskey(elem); sizeof_key is the key width in
 * bytes.
 *   rs_insertsort_<name>(beg, end)       insertion sort of [beg, end)
 *   rs_sort_<name>(beg, end, n_bits, s)  one pass on the n_bits-wide digit at
 *                                        bit offset s, then recursion on each
 *                                        bucket for the lower digits
 *   radix_sort_<name>(beg, end)          entry point: insertion sort up to
 *                                        RS_MIN_SIZE elements, otherwise
 *                                        8-bit digits from the top byte down
 */
#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key)				\
	typedef struct {													\
		rstype_t *b, *e;												\
	} rsbucket_##name##_t;												\
	void rs_insertsort_##name(rstype_t *beg, rstype_t *end)				\
	{																	\
		rstype_t *cur;													\
		for (cur = beg + 1; cur < end; ++cur) {							\
			rstype_t *slot, held;										\
			if (!(rskey(*cur) < rskey(*(cur - 1)))) continue;	/* already in place */ \
			held = *cur;												\
			slot = cur;													\
			while (slot > beg && rskey(held) < rskey(*(slot-1))) {		\
				*slot = *(slot - 1);									\
				--slot;													\
			}															\
			*slot = held;												\
		}																\
	}																	\
	void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
	{																	\
		rstype_t *cur;													\
		int size = 1<<n_bits, m = size - 1;								\
		rsbucket_##name##_t *bk, b[size], *be = b + size;				\
		for (bk = b; bk != be; ++bk) {									\
			bk->e = beg;												\
			bk->b = bk->e;												\
		}																\
		/* histogram: each bucket's e ends up at beg + count */			\
		for (cur = beg; cur != end; ++cur) ++b[rskey(*cur)>>s&m].e;		\
		/* prefix sum: turn the counts into [b, e) ranges */			\
		for (bk = b + 1; bk != be; ++bk) {								\
			bk->e += (bk-1)->e - beg;									\
			bk->b = (bk-1)->e;											\
		}																\
		/* permute in place: follow displacement cycles until every bucket is full */ \
		bk = b;															\
		while (bk != be) {												\
			rsbucket_##name##_t *dst;									\
			if (bk->b == bk->e) { ++bk; continue; }						\
			dst = b + (rskey(*bk->b)>>s&m);								\
			if (dst == bk) { ++bk->b; continue; }						\
			{															\
				rstype_t carried = *bk->b, placed;						\
				do {													\
					placed = carried;									\
					carried = *dst->b;									\
					*dst->b = placed;									\
					++dst->b;											\
					dst = b + (rskey(carried)>>s&m);					\
				} while (dst != bk);									\
				*bk->b = carried;										\
				++bk->b;												\
			}															\
		}																\
		/* restore the bucket starts that the permutation advanced */	\
		b->b = beg;														\
		for (bk = b + 1; bk != be; ++bk) bk->b = (bk-1)->e;				\
		if (s) {														\
			s = s > n_bits? s - n_bits : 0;								\
			for (bk = b; bk != be; ++bk) {								\
				if (bk->e - bk->b > RS_MIN_SIZE) rs_sort_##name(bk->b, bk->e, n_bits, s); \
				else if (bk->e - bk->b > 1) rs_insertsort_##name(bk->b, bk->e); \
			}															\
		}																\
	}																	\
	void radix_sort_##name(rstype_t *beg, rstype_t *end)				\
	{																	\
		if (end - beg > RS_MIN_SIZE) rs_sort_##name(beg, end, 8, sizeof_key * 8 - 8); \
		else rs_insertsort_##name(beg, end);							\
	}
157 | ||
158 | #endif |
0 | #include <pthread.h> | |
1 | #include <stdlib.h> | |
2 | #include <limits.h> | |
3 | ||
4 | /************ | |
5 | * kt_for() * | |
6 | ************/ | |
7 | ||
/* Forward declaration so that a worker can point back at the shared state. */
8 | struct kt_for_t; | |
9 ||
/* Per-thread state of kt_for(): a back pointer to the shared descriptor and
 * the index of the next task this worker will claim.  The counter starts at
 * the worker's own slot number and is advanced by n_threads per claim, both
 * by the owning thread and by threads stealing work. */
10 | typedef struct { | |
11 | struct kt_for_t *t; | |
12 | long i; | |
13 | } ktf_worker_t; | |
14 ||
/* Shared descriptor of one kt_for() call: thread count, number of tasks n,
 * the array of per-thread workers, and the callback with its user data.  The
 * callback receives (data, task index, worker slot). */
15 | typedef struct kt_for_t { | |
16 | int n_threads; | |
17 | long n; | |
18 | ktf_worker_t *w; | |
19 | void (*func)(void*,long,int); | |
20 | void *data; | |
21 | } kt_for_t; | |
22 | ||
23 | static inline long steal_work(kt_for_t *t) | |
24 | { | |
25 | int i, min_i = -1; | |
26 | long k, min = LONG_MAX; | |
27 | for (i = 0; i < t->n_threads; ++i) | |
28 | if (min > t->w[i].i) min = t->w[i].i, min_i = i; | |
29 | k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); | |
30 | return k >= t->n? -1 : k; | |
31 | } | |
32 | ||
33 | static void *ktf_worker(void *data) | |
34 | { | |
35 | ktf_worker_t *w = (ktf_worker_t*)data; | |
36 | long i; | |
37 | for (;;) { | |
38 | i = __sync_fetch_and_add(&w->i, w->t->n_threads); | |
39 | if (i >= w->t->n) break; | |
40 | w->t->func(w->t->data, i, w - w->t->w); | |
41 | } | |
42 | while ((i = steal_work(w->t)) >= 0) | |
43 | w->t->func(w->t->data, i, w - w->t->w); | |
44 | pthread_exit(0); | |
45 | } | |
46 | ||
47 | void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) | |
48 | { | |
49 | int i; | |
50 | kt_for_t t; | |
51 | pthread_t *tid; | |
52 | t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; | |
53 | t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); | |
54 | tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); | |
55 | for (i = 0; i < n_threads; ++i) | |
56 | t.w[i].t = &t, t.w[i].i = i; | |
57 | for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); | |
58 | for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); | |
59 | } | |
60 | ||
61 | /***************** | |
62 | * kt_pipeline() * | |
63 | *****************/ | |
64 | ||
/* Forward declaration so that a worker can point back at the pipeline. */
65 | struct ktp_t; | |
66 ||
/* One pipeline worker: a back pointer to the pipeline, the ordering index of
 * the batch it is currently carrying, the step it will run next, and the data
 * returned by the previous step (input of the next one). */
67 | typedef struct { | |
68 | struct ktp_t *pl; | |
69 | int64_t index; | |
70 | int step; | |
71 | void *data; | |
72 | } ktp_worker_t; | |
73 ||
/* Shared pipeline state: user data and callback, the next batch index to hand
 * out, the worker and step counts, the worker array, and the mutex/condition
 * variable pair guarding the workers' step and index fields. */
74 | typedef struct ktp_t { | |
75 | void *shared; | |
76 | void *(*func)(void*, int, void*); | |
77 | int64_t index; | |
78 | int n_workers, n_steps; | |
79 | ktp_worker_t *workers; | |
80 | pthread_mutex_t mutex; | |
81 | pthread_cond_t cv; | |
82 | } ktp_t; | |
83 | ||
/* Thread entry point of kt_pipeline().
 *
 * Each worker carries one batch through steps 0 .. n_steps-1 and then starts
 * over with a fresh batch index.  Before running a step it waits until no
 * other worker with a smaller batch index is still at the same or an earlier
 * step, so every step sees the batches in index order and is never run by two
 * workers on out-of-order batches.  A worker retires (step set to n_steps)
 * when a step other than the last one returns NULL. */
84 | static void *ktp_worker(void *data) | |
85 | { | |
86 | ktp_worker_t *w = (ktp_worker_t*)data; | |
87 | ktp_t *p = w->pl; | |
88 | while (w->step < p->n_steps) { | |
89 | // test whether we can kick off the job with this worker | |
90 | pthread_mutex_lock(&p->mutex); | |
91 | for (;;) { | |
92 | int i; | |
93 | // test whether another worker is doing the same step | |
94 | for (i = 0; i < p->n_workers; ++i) { | |
95 | if (w == &p->workers[i]) continue; // ignore itself | |
96 | if (p->workers[i].step <= w->step && p->workers[i].index < w->index) | |
97 | break; | |
98 | } | |
99 | if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps | |
100 | pthread_cond_wait(&p->cv, &p->mutex); | |
101 | } | |
102 | pthread_mutex_unlock(&p->mutex); | |
103 ||
/* The callback runs without holding the mutex. */
104 | // working on w->step | |
105 | w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL | |
106 ||
/* Next step: wrap around to 0 after the last step or advance by one while the
 * callback keeps returning data; otherwise mark this worker as finished.  A
 * worker wrapping to step 0 takes the next unused batch index. */
107 | // update step and let other workers know | |
108 | pthread_mutex_lock(&p->mutex); | |
109 | w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps; | |
110 | if (w->step == 0) w->index = p->index++; | |
111 | pthread_cond_broadcast(&p->cv); | |
112 | pthread_mutex_unlock(&p->mutex); | |
113 | } | |
114 | pthread_exit(0); | |
115 | } | |
116 | ||
117 | void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps) | |
118 | { | |
119 | ktp_t aux; | |
120 | pthread_t *tid; | |
121 | int i; | |
122 | ||
123 | if (n_threads < 1) n_threads = 1; | |
124 | aux.n_workers = n_threads; | |
125 | aux.n_steps = n_steps; | |
126 | aux.func = func; | |
127 | aux.shared = shared_data; | |
128 | aux.index = 0; | |
129 | pthread_mutex_init(&aux.mutex, 0); | |
130 | pthread_cond_init(&aux.cv, 0); | |
131 | ||
132 | aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t)); | |
133 | for (i = 0; i < n_threads; ++i) { | |
134 | ktp_worker_t *w = &aux.workers[i]; | |
135 | w->step = 0; w->pl = &aux; w->data = 0; | |
136 | w->index = aux.index++; | |
137 | } | |
138 | ||
139 | tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); | |
140 | for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]); | |
141 | for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); | |
142 | ||
143 | pthread_mutex_destroy(&aux.mutex); | |
144 | pthread_cond_destroy(&aux.cv); | |
145 | } |
0 | /* The MIT License | |
1 | ||
2 | Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk> | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining | |
5 | a copy of this software and associated documentation files (the | |
6 | "Software"), to deal in the Software without restriction, including | |
7 | without limitation the rights to use, copy, modify, merge, publish, | |
8 | distribute, sublicense, and/or sell copies of the Software, and to | |
9 | permit persons to whom the Software is furnished to do so, subject to | |
10 | the following conditions: | |
11 | ||
12 | The above copyright notice and this permission notice shall be | |
13 | included in all copies or substantial portions of the Software. | |
14 | ||
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | SOFTWARE. | |
23 | */ | |
24 | ||
25 | /* | |
26 | An example: | |
27 | ||
28 | #include "kvec.h" | |
29 | int main() { | |
30 | kvec_t(int) array; | |
31 | kv_init(array); | |
32 | kv_push(int, array, 10); // append | |
33 | kv_a(int, array, 20) = 5; // dynamic | |
34 | kv_A(array, 20) = 4; // static | |
35 | kv_destroy(array); | |
36 | return 0; | |
37 | } | |
38 | */ | |
39 | ||
40 | /* | |
41 | 2008-09-22 (0.1.0): | |
42 | ||
43 | * The initial version. | |
44 | ||
45 | */ | |
46 | ||
#ifndef AC_KVEC_H
#define AC_KVEC_H

#include <stdlib.h>
#include <string.h> /* memcpy() used by kv_copy() */

/*
 * Generic growable vector implemented with macros.
 *
 * A vector is an anonymous struct { size_t n, m; type *a; } where n is the
 * number of elements in use, m the allocated capacity and a the storage.
 * Macros that may grow the vector take the element type as their first
 * argument.  Arguments are evaluated more than once, so they must be free of
 * side effects.  realloc() results are not checked, as in the rest of this
 * header.
 */

/* Round a 32-bit value up to the next power of two, in place. */
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))

#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
#define kv_A(v, i) ((v).a[(i)])        /* unchecked element access */
#define kv_pop(v) ((v).a[--(v).n])     /* the vector must not be empty */
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)

/* Grow the capacity to at least s elements (rounded up to a power of two). */
#define kv_resize(type, v, s) do { \
        if ((v).m < (s)) { \
            (v).m = (s); \
            kv_roundup32((v).m); \
            (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
        } \
    } while (0)

/* Copy the contents of v0 into v1, growing v1 if necessary. */
#define kv_copy(type, v1, v0) do { \
        if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
        (v1).n = (v0).n; \
        memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
    } while (0)

/* Append x, doubling the capacity when the vector is full. */
#define kv_push(type, v, x) do { \
        if ((v).n == (v).m) { \
            (v).m = (v).m? (v).m<<1 : 2; \
            (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
        } \
        (v).a[(v).n++] = (x); \
    } while (0)

/* Append an uninitialised element and store its address in *p. */
#define kv_pushp(type, v, p) do { \
        if ((v).n == (v).m) { \
            (v).m = (v).m? (v).m<<1 : 2; \
            (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
        } \
        *(p) = &(v).a[(v).n++]; \
    } while (0)

/* Dynamic element access: grows the vector so that index i is valid and
 * yields the lvalue (v).a[i].  The size becomes i + 1 whenever i lies at or
 * beyond the current size, whether or not the capacity had to grow.  The
 * expansion is deliberately left unparenthesised: in C the result of a comma
 * expression is not an lvalue, so "kv_a(type, v, i) = x" relies on the
 * assignment binding to the trailing (v).a[(i)]. */
#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \
        ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
         (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
        : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
        : 0), (v).a[(i)]

/* Reverse the elements of v from index start to the end, in place. */
#define kv_reverse(type, v, start) do { \
        if ((v).m > 0 && (v).n > (start)) { \
            size_t __i, __end = (v).n - (start); \
            type *__a = (v).a + (start); \
            for (__i = 0; __i < __end>>1; ++__i) { \
                type __t = __a[__end - 1 - __i]; \
                __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
            } \
        } \
    } while (0)

#endif
0 | #include <unistd.h> | |
1 | #include <stdlib.h> | |
2 | #include <stdio.h> | |
3 | #include <string.h> | |
4 | #include <sys/resource.h> | |
5 | #include <sys/time.h> | |
6 | #include "minimap.h" | |
7 | ||
8 | #define MM_VERSION "0.2-r123" | |
9 | ||
/* Raise the soft address-space limit (RLIMIT_AS) to the hard limit.
 *
 * Only active on Linux.  The current limits are modified only when they could
 * be read successfully; failures are otherwise ignored because lifting the
 * limit is a best-effort convenience. */
void liftrlimit()
{
#ifdef __linux__
    struct rlimit r;
    if (getrlimit(RLIMIT_AS, &r) != 0) return; // r is indeterminate: leave the limits alone
    r.rlim_cur = r.rlim_max;
    (void)setrlimit(RLIMIT_AS, &r);
#endif
}
19 | ||
20 | int main(int argc, char *argv[]) | |
21 | { | |
22 | mm_mapopt_t opt; | |
23 | int i, c, k = 15, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0; | |
24 | int tbatch_size = 100000000; | |
25 | uint64_t ibatch_size = 4000000000ULL; | |
26 | float f = 0.001; | |
27 | bseq_file_t *fp = 0; | |
28 | char *fnw = 0; | |
29 | FILE *fpr = 0, *fpw = 0; | |
30 | ||
31 | liftrlimit(); | |
32 | mm_realtime0 = realtime(); | |
33 | mm_mapopt_init(&opt); | |
34 | ||
35 | while ((c = getopt(argc, argv, "w:k:B:b:t:r:c:f:Vv:NOg:I:d:lRPST:m:L:Dx:")) >= 0) { | |
36 | if (c == 'w') w = atoi(optarg); | |
37 | else if (c == 'k') k = atoi(optarg); | |
38 | else if (c == 'b') b = atoi(optarg); | |
39 | else if (c == 'r') opt.radius = atoi(optarg); | |
40 | else if (c == 'c') opt.min_cnt = atoi(optarg); | |
41 | else if (c == 'm') opt.merge_frac = atof(optarg); | |
42 | else if (c == 'f') f = atof(optarg); | |
43 | else if (c == 't') n_threads = atoi(optarg); | |
44 | else if (c == 'v') mm_verbose = atoi(optarg); | |
45 | else if (c == 'g') opt.max_gap = atoi(optarg); | |
46 | else if (c == 'N') keep_name = 0; | |
47 | else if (c == 'd') fnw = optarg; | |
48 | else if (c == 'l') is_idx = 1; | |
49 | else if (c == 'R') opt.flag |= MM_F_WITH_REP; | |
50 | else if (c == 'P') opt.flag &= ~MM_F_WITH_REP; | |
51 | else if (c == 'D') opt.flag |= MM_F_NO_SELF; | |
52 | else if (c == 'O') opt.flag |= MM_F_NO_ISO; | |
53 | else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF; | |
54 | else if (c == 'T') opt.sdust_thres = atoi(optarg); | |
55 | else if (c == 'L') opt.min_match = atoi(optarg); | |
56 | else if (c == 'V') { | |
57 | puts(MM_VERSION); | |
58 | return 0; | |
59 | } else if (c == 'B' || c == 'I') { | |
60 | double x; | |
61 | char *p; | |
62 | x = strtod(optarg, &p); | |
63 | if (*p == 'G' || *p == 'g') x *= 1e9; | |
64 | else if (*p == 'M' || *p == 'm') x *= 1e6; | |
65 | else if (*p == 'K' || *p == 'k') x *= 1e3; | |
66 | if (c == 'B') tbatch_size = (uint64_t)(x + .499); | |
67 | else ibatch_size = (uint64_t)(x + .499); | |
68 | } else if (c == 'x') { | |
69 | if (strcmp(optarg, "ava10k") == 0) { | |
70 | opt.flag |= MM_F_AVA | MM_F_NO_SELF; | |
71 | opt.min_match = 100; | |
72 | opt.merge_frac = 0.0; | |
73 | w = 5; | |
74 | } | |
75 | } | |
76 | } | |
77 | if (w < 0) w = (int)(.6666667 * k + .499); | |
78 | ||
79 | if (argc == optind) { | |
80 | fprintf(stderr, "Usage: minimap [options] <target.fa> [query.fa] [...]\n"); | |
81 | fprintf(stderr, "Options:\n"); | |
82 | fprintf(stderr, " Indexing:\n"); | |
83 | fprintf(stderr, " -k INT k-mer size [%d]\n", k); | |
84 | fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n"); | |
85 | fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n"); | |
86 | fprintf(stderr, " -d FILE dump index to FILE []\n"); | |
87 | fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n"); | |
88 | // fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users would care about this | |
89 | fprintf(stderr, " Mapping:\n"); | |
90 | fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", f); | |
91 | fprintf(stderr, " -r INT bandwidth [%d]\n", opt.radius); | |
92 | fprintf(stderr, " -m FLOAT merge two chains if FLOAT fraction of minimizers are shared [%.2f]\n", opt.merge_frac); | |
93 | fprintf(stderr, " -c INT retain a mapping if it consists of >=INT minimizers [%d]\n", opt.min_cnt); | |
94 | fprintf(stderr, " -L INT min matching length [%d]\n", opt.min_match); | |
95 | fprintf(stderr, " -g INT split a mapping if there is a gap longer than INT [%d]\n", opt.max_gap); | |
96 | fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); | |
97 | // fprintf(stderr, " -D skip self mappings but keep dual mappings\n"); // too confusing to expose to end users | |
98 | fprintf(stderr, " -S skip self and dual mappings\n"); | |
99 | fprintf(stderr, " -O drop isolated hits before chaining (EXPERIMENTAL)\n"); | |
100 | fprintf(stderr, " -P filtering potential repeats after mapping (EXPERIMENTAL)\n"); | |
101 | // fprintf(stderr, " -R skip post-mapping repeat filtering\n"); // deprecated option for backward compatibility | |
102 | fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); | |
103 | fprintf(stderr, " ava10k: -Sw5 -L100 -m0 (PacBio/ONT all-vs-all read mapping)\n"); | |
104 | fprintf(stderr, " Input/Output:\n"); | |
105 | fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); | |
106 | // fprintf(stderr, " -B NUM process ~NUM bp in each batch [100M]\n"); | |
107 | // fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); | |
108 | // fprintf(stderr, " -N use integer as target names\n"); | |
109 | fprintf(stderr, " -V show version number\n"); | |
110 | fprintf(stderr, "\nSee minimap.1 for detailed description of the command-line options.\n"); | |
111 | return 1; | |
112 | } | |
113 | ||
114 | if (is_idx) fpr = fopen(argv[optind], "rb"); | |
115 | else fp = bseq_open(argv[optind]); | |
116 | if (fnw) fpw = fopen(fnw, "wb"); | |
117 | for (;;) { | |
118 | mm_idx_t *mi = 0; | |
119 | if (fpr) mi = mm_idx_load(fpr); | |
120 | else if (!bseq_eof(fp)) | |
121 | mi = mm_idx_gen(fp, w, k, b, tbatch_size, n_threads, ibatch_size, keep_name); | |
122 | if (mi == 0) break; | |
123 | if (mm_verbose >= 3) | |
124 | fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", | |
125 | __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n); | |
126 | mm_idx_set_max_occ(mi, f); | |
127 | if (mm_verbose >= 3) | |
128 | fprintf(stderr, "[M::%s] max occurrences of a minimizer to consider: %d\n", __func__, mi->max_occ); | |
129 | if (fpw) mm_idx_dump(fpw, mi); | |
130 | for (i = optind + 1; i < argc; ++i) | |
131 | mm_map_file(mi, argv[i], &opt, n_threads, tbatch_size); | |
132 | mm_idx_destroy(mi); | |
133 | } | |
134 | if (fpw) fclose(fpw); | |
135 | if (fpr) fclose(fpr); | |
136 | if (fp) bseq_close(fp); | |
137 | ||
138 | fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION); | |
139 | fprintf(stderr, "[M::%s] CMD:", __func__); | |
140 | for (i = 0; i < argc; ++i) | |
141 | fprintf(stderr, " %s", argv[i]); | |
142 | fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - mm_realtime0, cputime()); | |
143 | return 0; | |
144 | } |
0 | #include <stdlib.h> | |
1 | #include <string.h> | |
2 | #include <stdio.h> | |
3 | #include "bseq.h" | |
4 | #include "kvec.h" | |
5 | #include "minimap.h" | |
6 | #include "sdust.h" | |
7 | ||
8 | void mm_mapopt_init(mm_mapopt_t *opt) | |
9 | { | |
10 | opt->radius = 500; | |
11 | opt->max_gap = 10000; | |
12 | opt->min_cnt = 4; | |
13 | opt->min_match = 40; | |
14 | opt->sdust_thres = 0; | |
15 | opt->flag = MM_F_WITH_REP; | |
16 | opt->merge_frac = .5; | |
17 | } | |
18 | ||
19 | /***************************** | |
20 | * Find approximate mappings * | |
21 | *****************************/ | |
22 | ||
/* Scratch space reused across mm_map() calls.  Every array is grown on
 * demand and released by mm_tbuf_destroy(). */
23 | struct mm_tbuf_s { // per-thread buffer | |
24 | mm128_v mini; // query minimizers | |
25 | mm128_v coef; // Hough transform coefficient | |
26 | mm128_v intv; // intervals on sorted coef | |
/* reg2mini lists, region by region, the indices of the minimizers used by
 * each region; rep_aux collects per-minimizer use counts in drop_rep(). */
27 | uint32_v reg2mini; | |
28 | uint32_v rep_aux; | |
/* buffer handed to sdust_core() */
29 | sdust_buf_t *sdb; | |
30 | // the following are for computing LIS | |
/* n entries of a are in use, m is the capacity shared by a, b and p */
31 | uint32_t n, m; | |
32 | uint64_t *a; | |
33 | size_t *b, *p; | |
34 | // final output | |
35 | kvec_t(mm_reg1_t) reg; | |
36 | }; | |
37 | ||
38 | mm_tbuf_t *mm_tbuf_init() | |
39 | { | |
40 | mm_tbuf_t *b; | |
41 | b = (mm_tbuf_t*)calloc(1, sizeof(mm_tbuf_t)); | |
42 | b->sdb = sdust_buf_init(); | |
43 | return b; | |
44 | } | |
45 | ||
46 | void mm_tbuf_destroy(mm_tbuf_t *b) | |
47 | { | |
48 | if (b == 0) return; | |
49 | free(b->mini.a); free(b->coef.a); free(b->intv.a); free(b->reg.a); free(b->reg2mini.a); free(b->rep_aux.a); | |
50 | free(b->a); free(b->b); free(b->p); | |
51 | sdust_buf_destroy(b->sdb); | |
52 | free(b); | |
53 | } | |
54 | ||
55 | #include "ksort.h" | |
/* radix_sort_64(): radix sort of uint64_t values keyed on the whole 8-byte
 * value. */
56 | #define sort_key_64(a) (a) | |
57 | KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8) | |
/* KSORT_INIT instantiations for uint64_t values compared on their low 32 bits
 * only, in ascending (low32lt) and descending (low32gt) order.  proc_intv()
 * uses the ks_lis_low32lt() / ks_lis_low32gt() routines they generate. */
58 | #define lt_low32(a, b) ((uint32_t)(a) < (uint32_t)(b)) | |
59 | KSORT_INIT(low32lt, uint64_t, lt_low32) | |
60 | #define gt_low32(a, b) ((uint32_t)(a) > (uint32_t)(b)) | |
61 | KSORT_INIT(low32gt, uint64_t, gt_low32) | |
62 | ||
63 | /* TODO: drop_rep() is not robust. For all-vs-all mapping but without the -S | |
64 | * flag, all minimizers have at least one hit. The _thres_ computed below will | |
65 | * be highly skewed. Some improvements need to be made. */ | |
66 | ||
67 | static void drop_rep(mm_tbuf_t *b, int min_cnt) | |
68 | { | |
69 | int i, j, n, m; | |
70 | uint32_t thres; | |
71 | b->rep_aux.n = 0; | |
72 | for (i = 0; i < b->mini.n; ++i) | |
73 | if (b->mini.a[i].y>>32) | |
74 | kv_push(uint32_t, b->rep_aux, b->mini.a[i].y>>32); | |
75 | if (b->rep_aux.n < 3) return; | |
76 | thres = (uint32_t)(ks_ksmall_uint32_t(b->rep_aux.n, b->rep_aux.a, b->rep_aux.n>>1) * MM_DEREP_Q50 + .499); | |
77 | for (i = n = m = 0; i < b->reg.n; ++i) { | |
78 | int cnt = 0, all_cnt = b->reg.a[i].cnt; | |
79 | for (j = 0; j < all_cnt; ++j) | |
80 | if (b->mini.a[b->reg2mini.a[m + j]].y>>32 <= thres) | |
81 | ++cnt; | |
82 | if (cnt >= min_cnt) | |
83 | b->reg.a[n++] = b->reg.a[i]; | |
84 | m += all_cnt; | |
85 | } | |
86 | // printf("%ld=>%d\t%d\n", b->reg.n, n, thres); | |
87 | b->reg.n = n; | |
88 | } | |
89 | ||
/* Turn one interval of sorted hits into zero or more mapping regions.
 *
 * b        per-thread buffer; reads b->intv, b->coef and b->mini, appends to
 *          b->reg and b->reg2mini and uses b->a / b->b / b->p as scratch
 * which    index of the interval in b->intv (y = first hit, x = hit count)
 * k        k-mer length, used to extend coordinates to k-mer starts
 * min_cnt  minimum number of hits a region must contain
 * max_gap  a chain is split where both the query gap and the reference gap
 *          exceed this value
 *
 * Each value in b->a is the y field of a hit: minimizer index in the upper
 * 32 bits, reference position in the lower 32 bits (see mm_map()). */
90 | static void proc_intv(mm_tbuf_t *b, int which, int k, int min_cnt, int max_gap) | |
91 | { | |
92 | int i, j, l_lis, rid = -1, rev = 0, start = b->intv.a[which].y, end = start + b->intv.a[which].x; | |
93 ||
94 | // make room for arrays needed by LIS (longest increasing sequence) | |
95 | if (end - start > b->m) { | |
96 | b->m = end - start; | |
97 | kv_roundup32(b->m); | |
98 | b->a = (uint64_t*)realloc(b->a, b->m * 8); | |
99 | b->b = (size_t*)realloc(b->b, b->m * sizeof(size_t)); | |
100 | b->p = (size_t*)realloc(b->p, b->m * sizeof(size_t)); | |
101 | } | |
102 ||
/* Hits whose x field equals UINT64_MAX are skipped.  rid is taken from bits
 * 32..62 of x and rev from bit 63; the values of the last copied hit win. */
103 | // prepare the input array _a_ for LIS | |
104 | b->n = 0; | |
105 | for (i = start; i < end; ++i) | |
106 | if (b->coef.a[i].x != UINT64_MAX) | |
107 | b->a[b->n++] = b->coef.a[i].y, rid = b->coef.a[i].x << 1 >> 33, rev = b->coef.a[i].x >> 63; | |
108 | if (b->n < min_cnt) return; | |
/* full 64-bit sort: by minimizer index first, reference position second */
109 | radix_sort_64(b->a, b->a + b->n); | |
110 ||
/* Reference positions (low 32 bits) must decrease along the query for
 * reverse-strand hits and increase otherwise.  As used below, b->b receives
 * the indices into b->a of the chain members. */
111 | // find the longest increasing sequence | |
112 | l_lis = rev? ks_lis_low32gt(b->n, b->a, b->b, b->p) : ks_lis_low32lt(b->n, b->a, b->b, b->p); // LIS | |
113 | if (l_lis < min_cnt) return; | |
114 | for (i = 1, j = 1; i < l_lis; ++i) // squeeze out minimizers reused in the LIS sequence | |
115 | if (b->a[b->b[i]]>>32 != b->a[b->b[i-1]]>>32) | |
116 | b->a[b->b[j++]] = b->a[b->b[i]]; | |
117 | l_lis = j; | |
118 | if (l_lis < min_cnt) return; | |
119 ||
120 | // convert LISes to regions; possibly break an LIS at long gaps | |
121 | for (i = 1, start = 0; i <= l_lis; ++i) { | |
122 | int32_t qgap = i == l_lis? 0 : ((uint32_t)b->mini.a[b->a[b->b[i]]>>32].y>>1) - ((uint32_t)b->mini.a[b->a[b->b[i-1]]>>32].y>>1); | |
123 | if (i == l_lis || (qgap > max_gap && abs((int32_t)b->a[b->b[i]] - (int32_t)b->a[b->b[i-1]]) > max_gap)) { | |
124 | if (i - start >= min_cnt) { | |
125 | uint32_t lq = 0, lr = 0, eq = 0, er = 0, sq = 0, sr = 0; | |
126 | mm_reg1_t *r; | |
127 | kv_pushp(mm_reg1_t, b->reg, &r); | |
128 | r->rid = rid, r->rev = rev, r->cnt = i - start, r->rep = 0; | |
/* query and reference extents, extended from the recorded minimizer
 * positions by k - 1 at the start and by 1 at the end */
129 | r->qs = ((uint32_t)b->mini.a[b->a[b->b[start]]>>32].y>>1) - (k - 1); | |
130 | r->qe = ((uint32_t)b->mini.a[b->a[b->b[i-1]]>>32].y>>1) + 1; | |
131 | r->rs = rev? (uint32_t)b->a[b->b[i-1]] : (uint32_t)b->a[b->b[start]]; | |
132 | r->re = rev? (uint32_t)b->a[b->b[start]] : (uint32_t)b->a[b->b[i-1]]; | |
133 | r->rs -= k - 1; | |
134 | r->re += 1; | |
135 | for (j = start; j < i; ++j) { // count the number of times each minimizer is used | |
136 | int jj = b->a[b->b[j]]>>32; | |
137 | b->mini.a[jj].y += 1ULL<<32; | |
138 | kv_push(uint32_t, b->reg2mini, jj); // keep minimizer<=>reg mapping for derep | |
139 | } | |
/* lq / lr accumulate the number of bases covered by the k-mers of the chain
 * on the query and on the reference, merging overlapping k-mers into runs
 * [sq, eq) and [sr, er).  Note that the inner variable r shadows the region
 * pointer r within this loop. */
140 | for (j = start; j < i; ++j) { // compute ->len | |
141 | uint32_t q = ((uint32_t)b->mini.a[b->a[b->b[j]]>>32].y>>1) - (k - 1); | |
142 | uint32_t r = (uint32_t)b->a[b->b[j]]; | |
143 | r = !rev? r - (k - 1) : (0x80000000U - r); | |
144 | if (r > er) lr += er - sr, sr = r, er = sr + k; | |
145 | else er = r + k; | |
146 | if (q > eq) lq += eq - sq, sq = q, eq = sq + k; | |
147 | else eq = q + k; | |
148 | } | |
149 | lr += er - sr, lq += eq - sq; | |
150 | r->len = lr < lq? lr : lq; | |
151 | } | |
152 | start = i; | |
153 | } | |
154 | } | |
155 | } | |
156 | ||
157 | // merge or add a Hough interval; only used by get_reg() | |
158 | static inline void push_intv(mm128_v *intv, int start, int end, float merge_frac) | |
159 | { | |
160 | mm128_t *p; | |
161 | if (intv->n > 0) { // test overlap | |
162 | int last_start, last_end, min; | |
163 | p = &intv->a[intv->n-1]; | |
164 | last_start = p->y, last_end = p->x + last_start; | |
165 | min = end - start < last_end - last_start? end - start : last_end - last_start; | |
166 | if (last_end > start && last_end - start > min * merge_frac) { // large overlap; then merge | |
167 | p->x = end - last_start; | |
168 | return; | |
169 | } | |
170 | } | |
171 | kv_pushp(mm128_t, *intv, &p); // a new interval | |
172 | p->x = end - start, p->y = start; | |
173 | } | |
174 | ||
/* Parameters:
 *   b           per-thread buffer; b->coef holds the hits, already sorted by
 *               the caller (mm_map()); results are appended to b->reg
 *   radius      maximum spread of the x field within one cluster of hits
 *   k           k-mer length, forwarded to proc_intv()
 *   min_cnt     minimum number of hits in a cluster / region
 *   max_gap     gap threshold forwarded to proc_intv()
 *   merge_frac  overlap fraction above which neighbouring clusters are merged
 *   flag        MM_F_* bit flags; MM_F_NO_ISO and MM_F_WITH_REP are read here */
175 | // find mapping regions from a list of minimizer hits | |
176 | static void get_reg(mm_tbuf_t *b, int radius, int k, int min_cnt, int max_gap, float merge_frac, int flag) | |
177 | { | |
/* bit 31 of a hit's y field is used as a temporary "dropped" mark */
178 | const uint64_t v_kept = ~(1ULL<<31), v_dropped = 1ULL<<31; | |
179 | mm128_v *c = &b->coef; | |
180 | int i, j, start = 0, iso_dist = radius * 2; | |
181 ||
182 | if (c->n < min_cnt) return; | |
183 ||
/* Every hit starts out as dropped; a hit is kept once an earlier hit exists
 * whose x differs by less than radius and whose reference position differs
 * by less than iso_dist (that partner is kept as well). */
184 | // drop isolated minimizer hits | |
185 | if (flag&MM_F_NO_ISO) { | |
186 | for (i = 0; i < c->n; ++i) c->a[i].y |= v_dropped; | |
187 | for (i = 1; i < c->n; ++i) { | |
188 | uint64_t x = c->a[i].x; | |
189 | int32_t rpos = (uint32_t)c->a[i].y; | |
190 | for (j = i - 1; j >= 0 && x - c->a[j].x < radius; --j) { | |
191 | int32_t y = c->a[j].y; | |
192 | if (abs(y - rpos) < iso_dist) { | |
193 | c->a[i].y &= v_kept, c->a[j].y &= v_kept; | |
194 | break; | |
195 | } | |
196 | } | |
197 | } | |
198 | for (i = j = 0; i < c->n; ++i) // squeeze out hits still marked as v_dropped | |
199 | if ((c->a[i].y&v_dropped) == 0) | |
200 | c->a[j++] = c->a[i]; | |
201 | c->n = j; | |
202 | } | |
203 ||
/* Sliding window over the sorted hits: [start, i) is emitted as a cluster
 * whenever hit i lies more than radius away from hit start, after which
 * start advances until hit i is within radius again. */
204 | // identify (possibly overlapping) intervals within _radius_; an interval is a cluster of hits | |
205 | b->intv.n = 0; | |
206 | for (i = 1; i < c->n; ++i) { | |
207 | if (c->a[i].x - c->a[start].x > radius) { | |
208 | if (i - start >= min_cnt) push_intv(&b->intv, start, i, merge_frac); | |
209 | for (++start; start < i && c->a[i].x - c->a[start].x > radius; ++start); | |
210 | } | |
211 | } | |
/* trailing cluster */
212 | if (i - start >= min_cnt) push_intv(&b->intv, start, i, merge_frac); | |
213 ||
/* push_intv() stores the interval length in x.  radix_sort_128x() is defined
 * outside this chunk; presumably it sorts by the x field - TODO confirm. */
214 | // sort by the size of the interval | |
215 | radix_sort_128x(b->intv.a, b->intv.a + b->intv.n); | |
216 ||
217 | // generate hits, starting from the largest interval | |
218 | b->reg2mini.n = 0; | |
219 | for (i = b->intv.n - 1; i >= 0; --i) proc_intv(b, i, k, min_cnt, max_gap); | |
220 ||
221 | // post repeat removal | |
222 | if (!(flag&MM_F_WITH_REP)) drop_rep(b, min_cnt); | |
223 | } | |
224 | ||
/* Map one query sequence against the index.
 *
 * mi      index to search
 * l_seq   length of seq
 * seq     query sequence
 * n_regs  out: number of regions found
 * b       per-thread buffer providing all scratch space
 * opt     mapping options
 * name    query name, or NULL; only used for the MM_F_NO_SELF and MM_F_AVA
 *         filters, which compare it with the target names in mi->name
 *
 * Returns b->reg.a, the array of *n_regs regions.  The array belongs to b and
 * is reused by the next call with the same buffer, so callers must copy what
 * they want to keep. */
225 | const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name) | |
226 | { | |
227 | int j, n_dreg = 0, u = 0; | |
228 | const uint64_t *dreg = 0; | |
229 ||
230 | b->mini.n = b->coef.n = 0; | |
/* collect the query minimizers into b->mini */
231 | mm_sketch(seq, l_seq, mi->w, mi->k, 0, &b->mini); | |
/* Optional low-complexity masking.  As used below, each dreg entry holds the
 * start of a masked interval in its upper 32 bits and the end in its lower
 * 32 bits. */
232 | if (opt->sdust_thres > 0) | |
233 | dreg = sdust_core((const uint8_t*)seq, l_seq, opt->sdust_thres, 64, &n_dreg, b->sdb); | |
234 | for (j = 0; j < b->mini.n; ++j) { | |
235 | int k, n; | |
236 | const uint64_t *r; | |
/* low 32 bits of y: position << 1 | strand */
237 | int32_t qpos = (uint32_t)b->mini.a[j].y>>1, strand = b->mini.a[j].y&1; | |
/* the cleared upper half is reused by proc_intv() as a per-minimizer use counter */
238 | b->mini.a[j].y = b->mini.a[j].y<<32>>32; // clear the rid field | |
239 | if (dreg && n_dreg) { // test complexity | |
/* [s, e) is the query span of this k-mer */
240 | int s = qpos - (mi->k - 1), e = s + mi->k; | |
241 | while (u < n_dreg && (uint32_t)dreg[u] <= s) ++u; | |
242 | if (u < n_dreg && dreg[u]>>32 < e) { | |
243 | int v, l = 0; | |
244 | for (v = u; v < n_dreg && dreg[v]>>32 < e; ++v) { // iterate over LCRs overlapping this minimizer | |
245 | int ss = s > dreg[v]>>32? s : dreg[v]>>32; | |
246 | int ee = e < (uint32_t)dreg[v]? e : (uint32_t)dreg[v]; | |
247 | l += ee - ss; | |
248 | } | |
/* skip minimizers with more than half of the k-mer masked */
249 | if (l > mi->k>>1) continue; | |
250 | } | |
251 | } | |
252 | r = mm_idx_get(mi, b->mini.a[j].x, &n); | |
/* too repetitive in the target */
253 | if (n > mi->max_occ) continue; | |
/* As used below, each occurrence r[k] holds the target sequence id in its
 * upper 32 bits and position << 1 | strand in its lower 32 bits. */
254 | for (k = 0; k < n; ++k) { | |
255 | int32_t rpos = (uint32_t)r[k] >> 1; | |
256 | mm128_t *p; | |
/* MM_F_NO_SELF: drop a hit of a sequence onto itself at the same position */
257 | if (name && (opt->flag&MM_F_NO_SELF) && mi->name && strcmp(name, mi->name[r[k]>>32]) == 0 && rpos == qpos) | |
258 | continue; | |
/* MM_F_AVA: drop hits to targets whose name sorts before the query name */
259 | if (name && (opt->flag&MM_F_AVA) && mi->name && strcmp(name, mi->name[r[k]>>32]) > 0) | |
260 | continue; | |
/* Hit encoding: x = target id << 32 | diagonal, with bit 63 set for
 * reverse-strand hits; the diagonal is 0x80000000 + rpos - qpos on the
 * forward strand and rpos + qpos on the reverse strand.
 * y = minimizer index << 32 | rpos. */
261 | kv_pushp(mm128_t, b->coef, &p); | |
262 | if ((r[k]&1) == strand) { // forward strand | |
263 | p->x = (uint64_t)r[k] >> 32 << 32 | (0x80000000U + rpos - qpos); | |
264 | p->y = (uint64_t)j << 32 | rpos; | |
265 | } else { // reverse strand | |
266 | p->x = (uint64_t)r[k] >> 32 << 32 | (rpos + qpos) | 1ULL<<63; | |
267 | p->y = (uint64_t)j << 32 | rpos; | |
268 | } | |
269 | } | |
270 | } | |
/* radix_sort_128x() is defined outside this chunk; presumably it orders the
 * hits by x, i.e. by strand, target and diagonal - TODO confirm. */
271 | radix_sort_128x(b->coef.a, b->coef.a + b->coef.n); | |
272 | b->reg.n = 0; | |
273 | get_reg(b, opt->radius, mi->k, opt->min_cnt, opt->max_gap, opt->merge_frac, opt->flag); | |
274 | *n_regs = b->reg.n; | |
275 | return b->reg.a; | |
276 | } | |
277 | ||
278 | /************************** | |
279 | * Multi-threaded mapping * | |
280 | **************************/ | |
281 | ||
282 | void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); | |
283 | void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); | |
284 | ||
/* State shared by all steps of one mm_map_file() run: batch size passed to
 * bseq_read(), running count of sequences read so far (used to assign rid),
 * number of mapping threads, mapping options, query file and index. */
285 | typedef struct { | |
286 | int batch_size, n_processed, n_threads; | |
287 | const mm_mapopt_t *opt; | |
288 | bseq_file_t *fp; | |
289 | const mm_idx_t *mi; | |
290 | } pipeline_t; | |
291 ||
/* One batch of query sequences travelling through the pipeline: the
 * sequences, per-sequence region counts and region arrays (filled in by
 * worker_for()), and one mm_tbuf_t per mapping thread. */
292 | typedef struct { | |
293 | const pipeline_t *p; | |
294 | int n_seq; | |
295 | bseq1_t *seq; | |
296 | int *n_reg; | |
297 | mm_reg1_t **reg; | |
298 | mm_tbuf_t **buf; | |
299 | } step_t; | |
300 | ||
301 | static void worker_for(void *_data, long i, int tid) // kt_for() callback | |
302 | { | |
303 | step_t *step = (step_t*)_data; | |
304 | const mm_reg1_t *regs; | |
305 | int n_regs; | |
306 | ||
307 | regs = mm_map(step->p->mi, step->seq[i].l_seq, step->seq[i].seq, &n_regs, step->buf[tid], step->p->opt, step->seq[i].name); | |
308 | step->n_reg[i] = n_regs; | |
309 | if (n_regs > 0) { | |
310 | step->reg[i] = (mm_reg1_t*)malloc(n_regs * sizeof(mm_reg1_t)); | |
311 | memcpy(step->reg[i], regs, n_regs * sizeof(mm_reg1_t)); | |
312 | } | |
313 | } | |
314 | ||
315 | static void *worker_pipeline(void *shared, int step, void *in) | |
316 | { | |
317 | int i, j; | |
318 | pipeline_t *p = (pipeline_t*)shared; | |
319 | if (step == 0) { // step 0: read sequences | |
320 | step_t *s; | |
321 | s = (step_t*)calloc(1, sizeof(step_t)); | |
322 | s->seq = bseq_read(p->fp, p->batch_size, &s->n_seq); | |
323 | if (s->seq) { | |
324 | s->p = p; | |
325 | for (i = 0; i < s->n_seq; ++i) | |
326 | s->seq[i].rid = p->n_processed++; | |
327 | s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*)); | |
328 | for (i = 0; i < p->n_threads; ++i) | |
329 | s->buf[i] = mm_tbuf_init(); | |
330 | s->n_reg = (int*)calloc(s->n_seq, sizeof(int)); | |
331 | s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t**)); | |
332 | return s; | |
333 | } else free(s); | |
334 | } else if (step == 1) { // step 1: map | |
335 | kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq); | |
336 | return in; | |
337 | } else if (step == 2) { // step 2: output | |
338 | step_t *s = (step_t*)in; | |
339 | const mm_idx_t *mi = p->mi; | |
340 | for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); | |
341 | free(s->buf); | |
342 | for (i = 0; i < s->n_seq; ++i) { | |
343 | bseq1_t *t = &s->seq[i]; | |
344 | for (j = 0; j < s->n_reg[i]; ++j) { | |
345 | mm_reg1_t *r = &s->reg[i][j]; | |
346 | if (r->len < p->opt->min_match) continue; | |
347 | printf("%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); | |
348 | if (mi->name) fputs(mi->name[r->rid], stdout); | |
349 | else printf("%d", r->rid + 1); | |
350 | printf("\t%d\t%d\t%d\t%d\t%d\t255\tcm:i:%d\n", mi->len[r->rid], r->rs, r->re, r->len, | |
351 | r->re - r->rs > r->qe - r->qs? r->re - r->rs : r->qe - r->qs, r->cnt); | |
352 | } | |
353 | free(s->reg[i]); | |
354 | free(s->seq[i].seq); free(s->seq[i].name); | |
355 | } | |
356 | free(s->reg); free(s->n_reg); free(s->seq); | |
357 | free(s); | |
358 | } | |
359 | return 0; | |
360 | } | |
361 | ||
362 | int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size) | |
363 | { | |
364 | pipeline_t pl; | |
365 | memset(&pl, 0, sizeof(pipeline_t)); | |
366 | pl.fp = bseq_open(fn); | |
367 | if (pl.fp == 0) return -1; | |
368 | pl.opt = opt, pl.mi = idx; | |
369 | pl.n_threads = n_threads, pl.batch_size = tbatch_size; | |
370 | kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); | |
371 | bseq_close(pl.fp); | |
372 | return 0; | |
373 | } |
0 | .TH minimap 1 "06 December 2015" "minimap-0.2" "Bioinformatics tools" | |
1 | ||
2 | .SH NAME | |
3 | .PP | |
4 | minimap - fast mapping between long DNA sequences | |
5 | ||
6 | .SH SYNOPSIS | |
7 | .PP | |
8 | minimap | |
9 | .RB [ -lSOV ] | |
10 | .RB [ -k | |
11 | .IR kmer ] | |
12 | .RB [ -w | |
13 | .IR winSize ] | |
14 | .RB [ -I | |
15 | .IR batchSize ] | |
16 | .RB [ -d | |
17 | .IR dumpFile ] | |
18 | .RB [ -f | |
19 | .IR occThres ] | |
20 | .RB [ -r | |
21 | .IR bandWidth ] | |
22 | .RB [ -m | |
23 | .IR minShared ] | |
24 | .RB [ -c | |
25 | .IR minCount ] | |
26 | .RB [ -L | |
27 | .IR minMatch ] | |
28 | .RB [ -g | |
29 | .IR maxGap ] | |
30 | .RB [ -T | |
31 | .IR dustThres ] | |
32 | .RB [ -t | |
33 | .IR nThreads ] | |
34 | .RB [ -x | |
35 | .IR preset ] | |
36 | .I target.fa | |
37 | .I query.fa | |
38 | > | |
39 | .I output.paf | |
40 | ||
41 | .SH DESCRIPTION | |
42 | .PP | |
43 | Minimap is a tool to efficiently find multiple approximate mapping positions | |
44 | between two sets of long sequences, such as between reads and reference | |
45 | genomes, between genomes and between long noisy reads. Minimap has an indexing | |
46 | and a mapping phase. In the indexing phase, it collects all minimizers of a | |
47 | large batch of target sequences in a hash table; in the mapping phase, it | |
48 | identifies good clusters of colinear minimizer hits. Minimap does not generate | |
49 | detailed alignments between the target and the query sequences. It only outputs | |
50 | the approximate start and the end coordinates of these clusters. | |
51 | ||
52 | .SH OPTIONS | |
53 | ||
54 | .SS Indexing options | |
55 | ||
56 | .TP 10 | |
57 | .BI -k \ INT | |
58 | Minimizer k-mer length [15] | |
59 | ||
60 | .TP | |
61 | .BI -w \ INT | |
62 | Minimizer window size [2/3 of k-mer length]. A minimizer is the smallest k-mer | |
63 | in a window of w consecutive k-mers. | |
64 | ||
65 | .TP | |
66 | .BI -I \ NUM | |
67 | Load at most | |
68 | .I NUM | |
69 | target bases into RAM for indexing [4G]. If there are more than | |
70 | .I NUM | |
71 | bases in | |
72 | .IR target.fa , | |
73 | minimap needs to read | |
74 | .I query.fa | |
75 | multiple times to map it against each batch of target sequences. | |
76 | .I NUM | |
77 | may end with a k/K/m/M/g/G suffix. | |
78 | ||
79 | .TP | |
80 | .BI -d \ FILE | |
81 | Dump minimizer index to | |
82 | .I FILE | |
83 | [no dump] | |
84 | ||
85 | .TP | |
86 | .B -l | |
87 | Indicate that | |
88 | .I target.fa | |
89 | is in fact a minimizer index generated by option | |
90 | .BR -d , | |
91 | not a FASTA or FASTQ file. | |
92 | ||
93 | .SS Mapping options | |
94 | ||
95 | .TP 10 | |
96 | .BI -f \ FLOAT | |
97 | Ignore top | |
98 | .I FLOAT | |
99 | fraction of most occurring minimizers [0.001] | |
100 | ||
101 | .TP | |
102 | .BI -r \ INT | |
103 | Approximate bandwidth for initial minimizer hits clustering [500]. A | |
104 | .I minimizer hit | |
105 | is a minimizer present in both the target and query sequences. A | |
106 | .I minimizer hit cluster | |
107 | is a group of potentially colinear minimizer hits between a target and a query | |
108 | sequence. | |
109 | ||
110 | .TP | |
111 | .BI -m \ FLOAT | |
112 | Merge initial minimizer hit clusters if | |
113 | .I FLOAT | |
114 | or higher fraction of minimizers are shared between the clusters [0.5] | |
115 | ||
116 | .TP | |
117 | .BI -c \ INT | |
118 | Retain a minimizer hit cluster if it contains | |
119 | .I INT | |
120 | or more minimizer hits [4] | |
121 | ||
122 | .TP | |
123 | .BI -L \ INT | |
124 | Discard a minimizer hit cluster if after colinearization, the number of matching bases is below | |
125 | .I INT | |
126 | [40]. This option mainly reduces the size of output. It has little effect on | |
127 | the speed and peak memory. | |
128 | ||
129 | .TP | |
130 | .BI -g \ INT | |
131 | Split a minimizer hit cluster at a gap | |
132 | .IR INT -bp | |
133 | or longer that does not contain any minimizer hits [10000] | |
134 | ||
135 | .TP | |
136 | .BI -T \ INT | |
137 | Mask regions on query sequences with SDUST score threshold | |
138 | .IR INT ; | |
139 | 0 to disable [0]. SDUST is an algorithm | |
140 | to identify low-complexity subsequences. It is not enabled by default. If SDUST | |
141 | is preferred, a value between 20 and 25 is recommended. A higher threshold masks | |
142 | fewer sequences. | |
143 | ||
144 | .TP | |
145 | .B -S | |
146 | Perform all-vs-all mapping. In this mode, if the query sequence name is | |
147 | lexicographically larger than the target sequence name, the hits between them | |
148 | will be suppressed; if the query sequence name is the same as the target name, | |
149 | diagonal minimizer hits will also be suppressed. | |
150 | ||
151 | .TP | |
152 | .B -O | |
153 | Drop a minimizer hit if it is far away from other hits (EXPERIMENTAL). This | |
154 | option is useful for mapping long chromosomes from two diverged species. | |
155 | ||
156 | .TP | |
157 | .BI -x \ STR | |
158 | Change multiple settings based on | |
159 | .I STR | |
160 | [not set]. It is recommended to apply this option before other options, such | |
161 | that the following options may override the multiple settings modified by this | |
162 | option. | |
163 | ||
164 | .RS | |
165 | .TP 8 | |
166 | .B ava10k | |
167 | for PacBio or Oxford Nanopore all-vs-all read mapping (-Sw5 -L100 -m0). | |
168 | .RE | |
169 | ||
170 | .SS Input/output options | |
171 | ||
172 | .TP 10 | |
173 | .BI -t \ INT | |
174 | Number of threads [3]. Minimap uses at most three threads when collecting | |
175 | minimizers on target sequences, and uses up to | |
176 | .IR INT +1 | |
177 | threads when mapping (the extra thread is for I/O, which is frequently idle and | |
178 | takes little CPU time). | |
179 | ||
180 | .TP | |
181 | .B -V | |
182 | Print version number to stdout | |
183 | ||
184 | .SH OUTPUT FORMAT | |
185 | ||
186 | .PP | |
187 | Minimap outputs mapping positions in the Pairwise mApping Format (PAF). PAF is | |
188 | a TAB-delimited text format with each line consisting of at least 12 fields as | |
189 | described in the following table: | |
190 | ||
191 | .TS | |
192 | center box; | |
193 | cb | cb | cb | |
194 | r | c | l . | |
195 | Col Type Description | |
196 | _ | |
197 | 1 string Query sequence name | |
198 | 2 int Query sequence length | |
199 | 3 int Query start coordinate (0-based) | |
200 | 4 int Query end coordinate (0-based) | |
201 | 5 char `+' if query and target on the same strand; `-' if opposite | |
202 | 6 string Target sequence name | |
203 | 7 int Target sequence length | |
204 | 8 int Target start coordinate on the original strand | |
205 | 9 int Target end coordinate on the original strand | |
206 | 10 int Number of matching bases in the mapping | |
207 | 11 int Number of bases, including gaps, in the mapping | |
208 | 12 int Mapping quality (0-255 with 255 for missing) | |
209 | .TE | |
210 | ||
211 | .PP | |
212 | When the alignment is available, column 11 gives the total number of sequence | |
213 | matches, mismatches and gaps in the alignment; column 10 divided by column 11 | |
214 | gives the alignment identity. As minimap does not generate detailed alignment, | |
215 | these two columns are approximate. PAF may optionally have additional fields in | |
216 | the SAM-like typed key-value format. Minimap writes the number of minimizer | |
217 | hits in a cluster to the cm tag. | |
218 | ||
219 | .SH SEE ALSO | |
220 | .PP | |
221 | miniasm(1) |
0 | #ifndef MINIMAP_H | |
1 | #define MINIMAP_H | |
2 | ||
3 | #include <stdint.h> | |
4 | #include <stdio.h> | |
5 | #include <sys/types.h> | |
6 | #include "bseq.h" | |
7 | ||
#define MM_IDX_DEF_B  14   // NOTE(review): presumably the default for the "b" index parameter - confirm
#define MM_DEREP_Q50  5.0

// Bit flags stored in mm_mapopt_t::flag; each occupies its own bit.
#define MM_F_WITH_REP (1<<0)
#define MM_F_NO_SELF  (1<<1)
#define MM_F_NO_ISO   (1<<2)
#define MM_F_AVA      (1<<3)
15 | ||
// A pair of 64-bit words; mm_sketch() stores the minimizer hash in x and the
// packed (rid, position, strand) in y.
typedef struct {
	uint64_t x;
	uint64_t y;
} mm128_t;

// Growable arrays: n elements in use, m elements allocated, a is the storage.
typedef struct {
	size_t n, m;
	mm128_t *a;
} mm128_v;

typedef struct {
	size_t n, m;
	uint64_t *a;
} uint64_v;

typedef struct {
	size_t n, m;
	uint32_t *a;
} uint32_v;
23 | ||
24 | typedef struct { | |
25 | mm128_v a; // (minimizer, position) array | |
26 | int32_t n; // size of the _p_ array | |
27 | uint64_t *p; // position array for minimizers appearing >1 times | |
28 | void *h; // hash table indexing _p_ and minimizers appearing once | |
29 | } mm_idx_bucket_t; | |
30 | ||
31 | typedef struct { | |
32 | int b, w, k; | |
33 | uint32_t n; // number of reference sequences | |
34 | mm_idx_bucket_t *B; | |
35 | uint32_t max_occ; | |
36 | float freq_thres; | |
37 | int32_t *len; // length of each reference sequence | |
38 | char **name; // TODO: if this uses too much RAM, switch one concatenated string | |
39 | } mm_idx_t; | |
40 | ||
// One mapped region (a cluster of minimizer hits) between a query and a target.
typedef struct {
	uint32_t cnt:31;  // number of minimizer hits; written to the "cm" tag of the output
	uint32_t rev:1;   // strand flag; selects '+' (0) or '-' (1) in the output
	uint32_t rid:31;  // index of the target sequence in the index's len/name arrays
	uint32_t rep:1;
	uint32_t len;     // compared against mm_mapopt_t::min_match before a region is printed
	int32_t qs, qe;   // query start and end
	int32_t rs, re;   // target start and end
} mm_reg1_t;
47 | ||
// Tunable parameters of the mapping stage.
typedef struct {
	int radius;        // bandwidth used when clustering hits
	int max_gap;       // a chain is broken when a max_gap window contains no minimizers
	int min_cnt;       // minimum number of minimizers needed to start a chain
	int min_match;
	int sdust_thres;   // SDUST score threshold; 0 disables SDUST
	int flag;          // bitwise OR of MM_F_* macros
	float merge_frac;  // two chains are merged if this fraction of minimizers is shared between them
} mm_mapopt_t;
57 | ||
58 | extern int mm_verbose; | |
59 | extern double mm_realtime0; | |
60 | ||
61 | struct mm_tbuf_s; | |
62 | typedef struct mm_tbuf_s mm_tbuf_t; | |
63 | ||
64 | #ifdef __cplusplus | |
65 | extern "C" { | |
66 | #endif | |
67 | ||
68 | // compute minimizers | |
69 | void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p); | |
70 | ||
71 | // minimizer indexing | |
72 | mm_idx_t *mm_idx_init(int w, int k, int b); | |
73 | void mm_idx_destroy(mm_idx_t *mi); | |
74 | mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name); | |
75 | void mm_idx_set_max_occ(mm_idx_t *mi, float f); | |
76 | const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); | |
77 | ||
78 | mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads); | |
79 | ||
80 | // minimizer index I/O | |
81 | void mm_idx_dump(FILE *fp, const mm_idx_t *mi); | |
82 | mm_idx_t *mm_idx_load(FILE *fp); | |
83 | ||
84 | // mapping | |
85 | void mm_mapopt_init(mm_mapopt_t *opt); | |
86 | mm_tbuf_t *mm_tbuf_init(void); | |
87 | void mm_tbuf_destroy(mm_tbuf_t *b); | |
88 | const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name); | |
89 | ||
90 | int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size); | |
91 | ||
92 | // private functions (may be moved to a "mmpriv.h" in future) | |
93 | double cputime(void); | |
94 | double realtime(void); | |
95 | void radix_sort_128x(mm128_t *beg, mm128_t *end); | |
96 | void radix_sort_64(uint64_t *beg, uint64_t *end); | |
97 | uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); | |
98 | ||
99 | #ifdef __cplusplus | |
100 | } | |
101 | #endif | |
102 | ||
103 | #endif |
0 | #include <sys/resource.h> | |
1 | #include <sys/time.h> | |
2 | #include "minimap.h" | |
3 | ||
4 | int mm_verbose = 3; | |
5 | double mm_realtime0; | |
6 | ||
/**
 * CPU time consumed by the calling process so far.
 *
 * @return user time plus system time in seconds, as reported by
 *         getrusage(RUSAGE_SELF); 0.0 if getrusage() fails
 */
double cputime(void)
{
	struct rusage r;

	// on failure the struct is not filled in, so do not read it
	if (getrusage(RUSAGE_SELF, &r) != 0)
		return 0.0;
	return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);
}
13 | ||
/**
 * Wall-clock time.
 *
 * @return seconds since the Epoch, with microsecond resolution, as reported
 *         by gettimeofday()
 */
double realtime(void)
{
	struct timeval tp;

	// the timezone argument is obsolete; POSIX leaves a non-NULL value unspecified
	gettimeofday(&tp, NULL);
	return tp.tv_sec + tp.tv_usec * 1e-6;
}
21 | ||
22 | #include "ksort.h" | |
23 | #define sort_key_128x(a) ((a).x) | |
24 | KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8) | |
25 | KSORT_INIT_GENERIC(uint32_t) |
0 | #include <string.h> | |
1 | #include <stdint.h> | |
2 | #include <stdio.h> | |
3 | #include "kdq.h" | |
4 | #include "kvec.h" | |
5 | #include "sdust.h" | |
6 | ||
7 | #define SD_WLEN 3 | |
8 | #define SD_WTOT (1<<(SD_WLEN<<1)) | |
9 | #define SD_WMSK (SD_WTOT - 1) | |
10 | ||
// A "perfect interval" candidate kept while scanning the current window.
typedef struct {
	int start;   // start coordinate of the interval
	int finish;  // finish coordinate of the interval
	int r;       // score recorded when the interval was found
	int l;       // length (in words) recorded when the interval was found
} perf_intv_t;
15 | ||
16 | typedef kvec_t(perf_intv_t) perf_intv_v; | |
17 | typedef kvec_t(uint64_t) uint64_v; | |
18 | ||
19 | KDQ_INIT(int) | |
20 | ||
// Byte -> 2-bit nucleotide code: A/a=0, C/c=1, G/g=2, T/t=3; bytes 0..3 map to
// themselves; every other byte maps to 4. Defined here only for the standalone
// build or when explicitly requested; otherwise another translation unit
// provides the table.
#if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN)
unsigned char seq_nt4_table[256] = {
	/* 0x00 */ 0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x50 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x70 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xa0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xb0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xc0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xd0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xe0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xf0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
#else
extern unsigned char seq_nt4_table[256];
#endif
43 | ||
44 | struct sdust_buf_s { | |
45 | kdq_t(int) *w; | |
46 | perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish | |
47 | uint64_v res; // the result | |
48 | }; | |
49 | ||
50 | sdust_buf_t *sdust_buf_init(void) | |
51 | { | |
52 | sdust_buf_t *buf; | |
53 | buf = (sdust_buf_t*)calloc(1, sizeof(sdust_buf_t)); | |
54 | buf->w = kdq_init(int); | |
55 | return buf; | |
56 | } | |
57 | ||
58 | void sdust_buf_destroy(sdust_buf_t *buf) | |
59 | { | |
60 | if (buf == 0) return; | |
61 | kdq_destroy(int, buf->w); | |
62 | free(buf->P.a); free(buf->res.a); | |
63 | free(buf); | |
64 | } | |
65 | ||
66 | static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv) | |
67 | { | |
68 | int s; | |
69 | if (kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3? | |
70 | s = *kdq_shift(int, w); | |
71 | *rw -= --cw[s]; | |
72 | if (*L > kdq_size(w)) | |
73 | --*L, *rv -= --cv[s]; | |
74 | } | |
75 | kdq_push(int, w, t); | |
76 | ++*L; | |
77 | *rw += cw[t]++; | |
78 | *rv += cv[t]++; | |
79 | if (cv[t] * 10 > T<<1) { | |
80 | do { | |
81 | s = kdq_at(w, kdq_size(w) - *L); | |
82 | *rv -= --cv[s]; | |
83 | --*L; | |
84 | } while (s != t); | |
85 | } | |
86 | } | |
87 | ||
88 | static inline void save_masked_regions(uint64_v *res, perf_intv_v *P, int start) | |
89 | { | |
90 | int i, saved = 0; | |
91 | perf_intv_t *p; | |
92 | if (P->n == 0 || P->a[P->n - 1].start >= start) return; | |
93 | p = &P->a[P->n - 1]; | |
94 | if (res->n) { | |
95 | int s = res->a[res->n - 1]>>32, f = (uint32_t)res->a[res->n - 1]; | |
96 | if (p->start <= f) // if overlapping with or adjacent to the previous interval | |
97 | saved = 1, res->a[res->n - 1] = (uint64_t)s<<32 | (f > p->finish? f : p->finish); | |
98 | } | |
99 | if (!saved) kv_push(uint64_t, *res, (uint64_t)p->start<<32|p->finish); | |
100 | for (i = P->n - 1; i >= 0 && P->a[i].start < start; --i); // remove perfect intervals that have falled out of the window | |
101 | P->n = i + 1; | |
102 | } | |
103 | ||
104 | static void find_perfect(perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv) | |
105 | { | |
106 | int c[SD_WTOT], r = rv, i, max_r = 0, max_l = 0; | |
107 | memcpy(c, cv, SD_WTOT * sizeof(int)); | |
108 | for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) { | |
109 | int j, t = kdq_at(w, i), new_r, new_l; | |
110 | r += c[t]++; | |
111 | new_r = r, new_l = kdq_size(w) - i - 1; | |
112 | if (new_r * 10 > T * new_l) { | |
113 | for (j = 0; j < P->n && P->a[j].start >= i + start; ++j) { // find insertion position | |
114 | perf_intv_t *p = &P->a[j]; | |
115 | if (max_r == 0 || p->r * max_l > max_r * p->l) | |
116 | max_r = p->r, max_l = p->l; | |
117 | } | |
118 | if (max_r == 0 || new_r * max_l >= max_r * new_l) { // then insert | |
119 | max_r = new_r, max_l = new_l; | |
120 | if (P->n == P->m) kv_resize(perf_intv_t, *P, P->n + 1); | |
121 | memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room | |
122 | ++P->n; | |
123 | P->a[j].start = i + start, P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start; | |
124 | P->a[j].r = new_r, P->a[j].l = new_l; | |
125 | } | |
126 | } | |
127 | } | |
128 | } | |
129 | ||
130 | const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf) | |
131 | { | |
132 | int rv = 0, rw = 0, L = 0, cv[SD_WTOT], cw[SD_WTOT]; | |
133 | int i, start, l; // _start_: start of the current window; _l_: length of a contiguous A/C/G/T (sub)sequence | |
134 | unsigned t; // current word | |
135 | ||
136 | buf->P.n = buf->res.n = 0; | |
137 | buf->w->front = buf->w->count = 0; | |
138 | memset(cv, 0, SD_WTOT * sizeof(int)); | |
139 | memset(cw, 0, SD_WTOT * sizeof(int)); | |
140 | if (l_seq < 0) l_seq = strlen((const char*)seq); | |
141 | for (i = l = t = 0; i <= l_seq; ++i) { | |
142 | int b = i < l_seq? seq_nt4_table[seq[i]] : 4; | |
143 | if (b < 4) { // an A/C/G/T base | |
144 | ++l, t = (t<<2 | b) & SD_WMSK; | |
145 | if (l >= SD_WLEN) { // we have seen a word | |
146 | start = (l - W > 0? l - W : 0) + (i + 1 - l); // set the start of the current window | |
147 | save_masked_regions(&buf->res, &buf->P, start); // save intervals falling out of the current window? | |
148 | shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv); | |
149 | if (rw * 10 > L * T) | |
150 | find_perfect(&buf->P, buf->w, T, start, L, rv, cv); | |
151 | } | |
152 | } else { // N or the end of sequence; N effectively breaks input into pieces of independent sequences | |
153 | start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l); | |
154 | while (buf->P.n) save_masked_regions(&buf->res, &buf->P, start++); // clear up unsaved perfect intervals | |
155 | l = t = 0; | |
156 | } | |
157 | } | |
158 | *n = buf->res.n; | |
159 | return buf->res.a; | |
160 | } | |
161 | ||
162 | uint64_t *sdust(const uint8_t *seq, int l_seq, int T, int W, int *n) | |
163 | { | |
164 | uint64_t *ret; | |
165 | sdust_buf_t *buf; | |
166 | buf = sdust_buf_init(); | |
167 | ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf); | |
168 | buf->res.a = 0; | |
169 | sdust_buf_destroy(buf); | |
170 | return ret; | |
171 | } | |
172 | ||
173 | #ifdef _SDUST_MAIN | |
174 | #include <zlib.h> | |
175 | #include <stdio.h> | |
176 | #include <unistd.h> | |
177 | #include "kseq.h" | |
178 | KSEQ_INIT(gzFile, gzread) | |
179 | ||
180 | int main(int argc, char *argv[]) | |
181 | { | |
182 | gzFile fp; | |
183 | kseq_t *ks; | |
184 | int W = 64, T = 20, c; | |
185 | ||
186 | while ((c = getopt(argc, argv, "w:t:")) >= 0) { | |
187 | if (c == 'w') W = atoi(optarg); | |
188 | else if (c == 't') T = atoi(optarg); | |
189 | } | |
190 | if (optind == argc) { | |
191 | fprintf(stderr, "Usage: sdust [-w %d] [-t %d] <in.fa>\n", W, T); | |
192 | return 1; | |
193 | } | |
194 | fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); | |
195 | ks = kseq_init(fp); | |
196 | while (kseq_read(ks) >= 0) { | |
197 | uint64_t *r; | |
198 | int i, n; | |
199 | r = sdust((uint8_t*)ks->seq.s, -1, T, W, &n); | |
200 | for (i = 0; i < n; ++i) | |
201 | printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]); | |
202 | free(r); | |
203 | } | |
204 | kseq_destroy(ks); | |
205 | gzclose(fp); | |
206 | return 0; | |
207 | } | |
208 | #endif |
#ifndef SDUST_H
#define SDUST_H

#include <stdint.h> // uint8_t and uint64_t are used in the prototypes below

struct sdust_buf_s;
typedef struct sdust_buf_s sdust_buf_t;

#ifdef __cplusplus
extern "C" {
#endif

// The simple interface: returns an array of *n intervals, each packed as
// start<<32 | finish. If l_seq is negative, strlen(seq) is used.
uint64_t *sdust(const uint8_t *seq, int l_seq, int T, int W, int *n);

// The following interface dramatically reduces heap allocations when sdust is
// called frequently. The array returned by sdust_core() is owned by buf.
sdust_buf_t *sdust_buf_init(void);
void sdust_buf_destroy(sdust_buf_t *buf);
const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf);

#ifdef __cplusplus
}
#endif

#endif
0 | #include <stdio.h> | |
1 | #include <stdlib.h> | |
2 | #include <assert.h> | |
3 | #include <string.h> | |
4 | #include "kvec.h" | |
5 | #include "minimap.h" | |
6 | ||
// Byte -> 2-bit nucleotide code: A/a=0, C/c=1, G/g=2, T/t=3; bytes 0..3 map to
// themselves; every other byte maps to 4 (ambiguous).
unsigned char seq_nt4_table[256] = {
	/* 0x00 */ 0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x50 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x70 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xa0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xb0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xc0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xd0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xe0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xf0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
25 | ||
/*
 * Mix a 64-bit key with a fixed sequence of shift/add/xor rounds; the result
 * is reduced with `mask` after every additive round, so it never exceeds mask.
 */
static inline uint64_t hash64(uint64_t key, uint64_t mask)
{
	uint64_t h = key;

	h = ((h << 21) + ~h) & mask;           // h = (h << 21) - h - 1
	h ^= h >> 24;
	h = (h + (h << 3) + (h << 8)) & mask;  // h * 265
	h ^= h >> 14;
	h = (h + (h << 2) + (h << 4)) & mask;  // h * 21
	h ^= h >> 28;
	h = (h + (h << 31)) & mask;
	return h;
}
37 | ||
38 | /** | |
39 | * Find symmetric (w,k)-minimizers on a DNA sequence | |
40 | * | |
41 | * @param str DNA sequence | |
42 | * @param len length of $str | |
43 | * @param w find a minimizer for every $w consecutive k-mers | |
44 | * @param k k-mer size | |
45 | * @param rid reference ID; will be copied to the output $p array | |
46 | * @param p minimizers; p->a[i].x is the 2k-bit hash value; | |
47 | * p->a[i].y = rid<<32 | lastPos<<1 | strand | |
48 | * where lastPos is the position of the last base of the i-th minimizer, | |
49 | * and strand indicates whether the minimizer comes from the top or the bottom strand. | |
50 | * Callers may want to set "p->n = 0"; otherwise results are appended to p | |
51 | */ | |
52 | void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p) | |
53 | { | |
54 | uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0}; | |
55 | int i, j, l, buf_pos, min_pos; | |
56 | mm128_t *buf, min = { UINT64_MAX, UINT64_MAX }; | |
57 | ||
58 | assert(len > 0 && w > 0 && k > 0); | |
59 | buf = (mm128_t*)alloca(w * 16); | |
60 | memset(buf, 0xff, w * 16); | |
61 | ||
62 | for (i = l = buf_pos = min_pos = 0; i < len; ++i) { | |
63 | int c = seq_nt4_table[(uint8_t)str[i]]; | |
64 | mm128_t info = { UINT64_MAX, UINT64_MAX }; | |
65 | if (c < 4) { // not an ambiguous base | |
66 | int z; | |
67 | kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer | |
68 | kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer | |
69 | if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand | |
70 | z = kmer[0] < kmer[1]? 0 : 1; // strand | |
71 | if (++l >= k) | |
72 | info.x = hash64(kmer[z], mask), info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; | |
73 | } else l = 0; | |
74 | buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below | |
75 | if (l == w + k - 1) { // special case for the first window - because identical k-mers are not stored yet | |
76 | for (j = buf_pos + 1; j < w; ++j) | |
77 | if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]); | |
78 | for (j = 0; j < buf_pos; ++j) | |
79 | if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]); | |
80 | } | |
81 | if (info.x <= min.x) { // a new minimum; then write the old min | |
82 | if (l >= w + k) kv_push(mm128_t, *p, min); | |
83 | min = info, min_pos = buf_pos; | |
84 | } else if (buf_pos == min_pos) { // old min has moved outside the window | |
85 | if (l >= w + k - 1) kv_push(mm128_t, *p, min); | |
86 | for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers | |
87 | if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer | |
88 | for (j = 0; j <= buf_pos; ++j) | |
89 | if (min.x >= buf[j].x) min = buf[j], min_pos = j; | |
90 | if (l >= w + k - 1) { // write identical k-mers | |
91 | for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted | |
92 | if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]); | |
93 | for (j = 0; j <= buf_pos; ++j) | |
94 | if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]); | |
95 | } | |
96 | } | |
97 | if (++buf_pos == w) buf_pos = 0; | |
98 | } | |
99 | if (min.x != UINT64_MAX) | |
100 | kv_push(mm128_t, *p, min); | |
101 | } |