New upstream version 0.1+git20190320.b499514
Andreas Tille
3 years ago
16 | 16 | is able to retain heterozygous events and thus can be used to assemble diploid |
17 | 17 | regions for the purpose of variant calling. It is one of the limited choices |
18 | 18 | for local re-assembly and arguably the easiest to interface. |
19 | ||
20 | If you use fermi-lite in your work, please cite the FermiKit paper: | |
21 | ||
22 | > Li H (2015) FermiKit: assembly-based variant calling for Illumina | |
23 | > resequencing data, *Bioinformatics*, **31**:3694-6. | |
19 | 24 | |
20 | 25 | ## Usage |
21 | 26 | |
41 | 46 | return 0; |
42 | 47 | } |
43 | 48 | ``` |
44 | The direct assembly output is in fact a graph. You may have a look at the | |
45 | [header file][header] for details. | |
49 | The `fml_assemble()` output is in fact a graph. You may have a look at the | |
50 | `fml_utg_print_gfa()` function in [misc.c][misc] about how to derive a | |
51 | [GFA][gfa] representation from an array of `fml_utg_t` objects. | |
46 | 52 | |
47 | 53 | ## Overview of the Assembly Algorithm |
48 | 54 | |
85 | 91 | [fk]: http://github.com/lh3/fermikit |
86 | 92 | [example]: https://github.com/lh3/fermi-lite/blob/master/example.c |
87 | 93 | [header]: https://github.com/lh3/fermi-lite/blob/master/fml.h |
94 | [misc]: https://github.com/lh3/fermi-lite/blob/master/misc.c | |
95 | [gfa]: https://github.com/pmelsted/GFA-spec |
14 | 14 | |
15 | 15 | #define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0) |
16 | 16 | #define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0) |
17 | ||
18 | static int fm_verbose = 1; | |
19 | 17 | |
20 | 18 | /****************** |
21 | 19 | * Closed bubbles * |
177 | 175 | mag_g_merge(g, 0, 0); |
178 | 176 | } |
179 | 177 | |
180 | int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int aggressive) | |
178 | int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int max_bdiff, int aggressive) | |
181 | 179 | { |
182 | 180 | magv_t *p = &g->v.a[idd>>1], *q[2]; |
183 | 181 | ku128_v *r; |
194 | 192 | dir[j] = x&1; |
195 | 193 | q[j] = &g->v.a[x>>1]; |
196 | 194 | if (q[j]->nei[0].n != 1 || q[j]->nei[1].n != 1) return ret; // no bubble |
197 | l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); | |
195 | l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); // bubble length, excluding overlaps | |
198 | 196 | } |
199 | 197 | if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble |
198 | if (l[0] - l[1] > max_bdiff || l[1] - l[0] > max_bdiff) return 1; // huge bubble differences | |
200 | 199 | for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[] |
201 | 200 | if (l[j] > 0) { |
202 | 201 | seq[j] = malloc(l[j]<<1); |
253 | 252 | return ret; |
254 | 253 | } |
255 | 254 | |
256 | void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive) | |
255 | void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive) | |
257 | 256 | { |
258 | 257 | int64_t i, n_examined = 0, n_popped = 0; |
259 | 258 | int ret; |
260 | 259 | |
261 | 260 | for (i = 0; i < g->v.n; ++i) { |
262 | ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, aggressive); | |
261 | ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, max_bdiff, aggressive); | |
263 | 262 | if (ret >= 1) ++n_examined; |
264 | 263 | if (ret >= 2) ++n_popped; |
265 | ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, aggressive); | |
264 | ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, max_bdiff, aggressive); | |
266 | 265 | if (ret >= 1) ++n_examined; |
267 | 266 | if (ret >= 2) ++n_popped; |
268 | 267 | } |
5 | 5 | int main(int argc, char *argv[]) |
6 | 6 | { |
7 | 7 | fml_opt_t opt; |
8 | int c, n_seqs, n_utg; | |
8 | int c, n_seqs, n_utg, gfa_out = 0; | |
9 | 9 | bseq1_t *seqs; |
10 | 10 | fml_utg_t *utg; |
11 | 11 | |
12 | 12 | fml_opt_init(&opt); |
13 | while ((c = getopt(argc, argv, "Ae:l:r:t:c:")) >= 0) { | |
13 | while ((c = getopt(argc, argv, "gOAe:l:r:t:c:d:v:")) >= 0) { | |
14 | 14 | if (c == 'e') opt.ec_k = atoi(optarg); |
15 | 15 | else if (c == 'l') opt.min_asm_ovlp = atoi(optarg); |
16 | 16 | else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg); |
17 | 17 | else if (c == 'A') opt.mag_opt.flag |= MAG_F_AGGRESSIVE; |
18 | else if (c == 'O') opt.mag_opt.flag &= ~MAG_F_POPOPEN; | |
19 | else if (c == 'd') opt.mag_opt.max_bdiff = atoi(optarg); | |
18 | 20 | else if (c == 't') opt.n_threads = atoi(optarg); |
21 | else if (c == 'g') gfa_out = 1; | |
22 | else if (c == 'v') fm_verbose = atoi(optarg); | |
19 | 23 | else if (c == 'c') { |
20 | 24 | char *p; |
21 | 25 | opt.min_cnt = strtol(optarg, &p, 10); |
30 | 34 | fprintf(stderr, " -l INT min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp); |
31 | 35 | fprintf(stderr, " -r FLOAT drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1); |
32 | 36 | fprintf(stderr, " -t INT number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads); |
33 | fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes)\n"); | |
37 | fprintf(stderr, " -d INT retain a bubble if one side is longer than the other side by >INT-bp [%d]\n", opt.mag_opt.max_bdiff); | |
38 | fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes; override -O)\n"); | |
39 | fprintf(stderr, " -O don't apply aggressive tip trimming\n"); | |
40 | fprintf(stderr, " -g output the assembly graph in the GFA format\n"); | |
34 | 41 | return 1; |
35 | 42 | } |
36 | 43 | seqs = bseq_read(argv[optind], &n_seqs); |
37 | 44 | utg = fml_assemble(&opt, n_seqs, seqs, &n_utg); |
38 | fml_utg_print(n_utg, utg); | |
45 | if (!gfa_out) fml_utg_print(n_utg, utg); | |
46 | else fml_utg_print_gfa(n_utg, utg); | |
39 | 47 | fml_utg_destroy(n_utg, utg); |
40 | 48 | return 0; |
41 | 49 | } |
0 | 0 | #ifndef FML_H |
1 | 1 | #define FML_H |
2 | 2 | |
3 | #define FML_VERSION "r41" | |
3 | #define FML_VERSION "r55" | |
4 | 4 | |
5 | 5 | #include <stdint.h> |
6 | 6 | |
7 | 7 | typedef struct { |
8 | 8 | int32_t l_seq; |
9 | char *seq, *qual; | |
9 | char *seq, *qual; // NULL-terminated strings; length expected to match $l_seq | |
10 | 10 | } bseq1_t; |
11 | 11 | |
12 | #define MAG_F_AGGRESSIVE 0x20 | |
13 | #define MAG_F_NO_SIMPL 0x80 | |
12 | #define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default) | |
13 | #define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default) | |
14 | #define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default) | |
14 | 15 | |
15 | 16 | typedef struct { |
16 | int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bvtx, min_merge_len, trim_len, trim_depth; | |
17 | int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth; | |
17 | 18 | float min_dratio1, max_bcov, max_bfrac; |
18 | 19 | } magopt_t; |
19 | 20 | |
30 | 31 | struct mag_t; |
31 | 32 | |
32 | 33 | typedef struct { |
33 | uint32_t tid; | |
34 | uint32_t len:31, from:1; | |
34 | uint32_t len:31, from:1; // $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end | |
35 | uint32_t id:31, to:1; // $id: unitig number | |
35 | 36 | } fml_ovlp_t; |
36 | 37 | |
37 | 38 | typedef struct { |
42 | 43 | int n_ovlp[2]; // number of 5'-end [0] and 3'-end [1] overlaps |
43 | 44 | fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1] |
44 | 45 | } fml_utg_t; |
46 | ||
47 | extern int fm_verbose; | |
45 | 48 | |
46 | 49 | #ifdef __cplusplus |
47 | 50 | extern "C" { |
120 | 123 | * @param n number of sequences |
121 | 124 | * @param seq array of sequences; FREED on return |
122 | 125 | * |
123 | * @return FMD-index | |
126 | * @return FMD-index on success; NULL if all input sequences are zero in length | |
124 | 127 | */ |
125 | 128 | struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq); |
126 | 129 | |
161 | 164 | void fml_utg_print(int n_utgs, const fml_utg_t *utg); |
162 | 165 | |
163 | 166 | /** |
167 | * Output unitig graph in the GFA format | |
168 | * | |
169 | * @param n_utg number of unitigs | |
170 | * @param utg array of unitigs | |
171 | */ | |
172 | void fml_utg_print_gfa(int n, const fml_utg_t *utg); | |
173 | ||
174 | /** | |
164 | 175 | * Deallocate an FM-index |
165 | 176 | * |
166 | 177 | * @param e pointer to the FM-index |
27 | 27 | #define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0) |
28 | 28 | #define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0) |
29 | 29 | |
30 | static int fm_verbose = 1; | |
30 | int fm_verbose = 1; | |
31 | 31 | |
32 | 32 | /********************* |
33 | 33 | * Vector operations * |
552 | 552 | o->max_bfrac = 0.15; |
553 | 553 | o->max_bvtx = 64; |
554 | 554 | o->max_bdist = 512; |
555 | o->max_bdiff = 50; | |
555 | 556 | } |
556 | 557 | |
557 | 558 | void mag_g_clean(mag_t *g, const magopt_t *opt) |
567 | 568 | for (j = 2; j <= opt->min_ensr; ++j) |
568 | 569 | mag_g_rm_vext(g, opt->min_elen, j); |
569 | 570 | mag_g_merge(g, 0, opt->min_merge_len); |
570 | if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen); | |
571 | if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen); | |
571 | 572 | if (!(opt->flag & MAG_F_NO_SIMPL)) mag_g_simplify_bubble(g, opt->max_bvtx, opt->max_bdist); |
572 | mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->flag & MAG_F_AGGRESSIVE); | |
573 | mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->max_bdiff, opt->flag & MAG_F_AGGRESSIVE); | |
573 | 574 | mag_g_rm_vint(g, opt->min_elen, opt->min_insr, g->min_ovlp); |
574 | 575 | mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr); |
575 | 576 | mag_g_merge(g, 1, opt->min_merge_len); |
576 | 577 | mag_g_rm_vext(g, opt->min_elen, opt->min_ensr); |
577 | 578 | mag_g_merge(g, 0, opt->min_merge_len); |
578 | if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen); | |
579 | if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen); | |
579 | 580 | mag_g_rm_vext(g, opt->min_elen, opt->min_ensr); |
580 | 581 | mag_g_merge(g, 0, opt->min_merge_len); |
581 | 582 | } |
48 | 48 | void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr); |
49 | 49 | void mag_g_merge(mag_t *g, int rmdup, int min_merge_len); |
50 | 50 | void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist); |
51 | void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive); | |
51 | void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive); | |
52 | 52 | void mag_g_pop_open(mag_t *g, int min_elen); |
53 | 53 | void mag_g_trim_open(mag_t *g, const magopt_t *opt); |
54 | 54 |
36 | 36 | opt->min_asm_ovlp = 33; |
37 | 37 | opt->min_merge_len = 0; |
38 | 38 | mag_init_opt(&opt->mag_opt); |
39 | opt->mag_opt.flag = MAG_F_NO_SIMPL; | |
39 | opt->mag_opt.flag = MAG_F_NO_SIMPL | MAG_F_POPOPEN; | |
40 | 40 | } |
41 | 41 | |
42 | 42 | void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs) |
70 | 70 | const uint8_t *block; |
71 | 71 | rld_t *e = 0; |
72 | 72 | int k; |
73 | ||
74 | for (k = 0; k < n; ++k) | |
75 | if (seq[k].l_seq > 0) | |
76 | break; | |
77 | if (k == n) return 0; | |
73 | 78 | |
74 | 79 | mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO); |
75 | 80 | for (k = 0; k < n; ++k) { |
192 | 197 | t = &q->ovlp[a++]; |
193 | 198 | k = kh_get(64, h, s->x); |
194 | 199 | assert(k != kh_end(h)); |
195 | t->tid = kh_val(h, k); | |
200 | t->id = kh_val(h, k) >> 1; | |
201 | t->to = kh_val(h, k) & 1; | |
196 | 202 | t->len = s->y; |
197 | 203 | t->from = from; |
198 | 204 | } |
216 | 222 | kputc('\t', &out); kputw(u->nsr, &out); |
217 | 223 | kputc('\t', &out); |
218 | 224 | for (j = 0; j < u->n_ovlp[0]; ++j) { |
219 | kputw(u->ovlp[j].tid, &out); kputc(',', &out); | |
225 | kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out); | |
220 | 226 | kputw(u->ovlp[j].len, &out); kputc(';', &out); |
221 | 227 | } |
222 | 228 | if (u->n_ovlp[0] == 0) kputc('.', &out); |
223 | 229 | kputc('\t', &out); |
224 | 230 | for (; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) { |
225 | kputw(u->ovlp[j].tid, &out); kputc(',', &out); | |
231 | kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out); | |
226 | 232 | kputw(u->ovlp[j].len, &out); kputc(';', &out); |
227 | 233 | } |
228 | 234 | if (u->n_ovlp[1] == 0) kputc('.', &out); |
237 | 243 | free(out.s); |
238 | 244 | } |
239 | 245 | |
246 | void fml_utg_print_gfa(int n, const fml_utg_t *utg) | |
247 | { | |
248 | int i, j; | |
249 | FILE *fp = stdout; | |
250 | fputs("H\tVN:Z:1.0\n", fp); | |
251 | for (i = 0; i < n; ++i) { | |
252 | const fml_utg_t *u = &utg[i]; | |
253 | fprintf(fp, "S\t%d\t", i); | |
254 | fputs(u->seq, fp); | |
255 | fprintf(fp, "\tLN:i:%d\tRC:i:%d\tPD:Z:", u->len, u->nsr); | |
256 | fputs(u->cov, fp); | |
257 | fputc('\n', fp); | |
258 | for (j = 0; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) { | |
259 | fml_ovlp_t *o = &u->ovlp[j]; | |
260 | if (i < o->id) | |
261 | fprintf(fp, "L\t%d\t%c\t%d\t%c\t%dM\n", i, "+-"[!o->from], o->id, "+-"[o->to], o->len); | |
262 | } | |
263 | } | |
264 | } | |
265 | ||
240 | 266 | void fml_utg_destroy(int n, fml_utg_t *utg) |
241 | 267 | { |
242 | 268 | int i; |
258 | 284 | fml_opt_t opt = *opt0; |
259 | 285 | float kcov; |
260 | 286 | |
287 | *n_utg = 0; | |
261 | 288 | fml_opt_adjust(&opt, n_seqs, seqs); |
262 | 289 | if (opt.ec_k >= 0) fml_correct(&opt, n_seqs, seqs); |
263 | 290 | kcov = fml_fltuniq(&opt, n_seqs, seqs); |
264 | 291 | e = fml_seq2fmi(&opt, n_seqs, seqs); |
292 | if (e == 0) return 0; // this may happen when all sequences are filtered out | |
265 | 293 | g = fml_fmi2mag(&opt, e); |
266 | 294 | opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > kcov * MAG_MIN_NSR_COEF? opt.mag_opt.min_ensr : (int)(kcov * MAG_MIN_NSR_COEF + .499); |
267 | 295 | opt.mag_opt.min_ensr = opt.mag_opt.min_ensr < opt0->max_cnt? opt.mag_opt.min_ensr : opt0->max_cnt; |