Codebase list fermi-lite / 64754d1
New upstream version 0.1+git20190320.b499514 Andreas Tille 3 years ago
7 changed file(s) with 85 addition(s) and 30 deletion(s). Raw diff Collapse all Expand all
1616 is able to retain heterozygous events and thus can be used to assemble diploid
1717 regions for the purpose of variant calling. It is one of the limited choices
1818 for local re-assembly and arguably the easiest to interface.
19
20 If you use fermi-lite in your work, please cite the FermiKit paper:
21
22 > Li H (2015) FermiKit: assembly-based variant calling for Illumina
23 > resequencing data, *Bioinformatics*, **31**:3694-6.
1924
2025 ## Usage
2126
4146 return 0;
4247 }
4348 ```
44 The direct assembly output is in fact a graph. You may have a look at the
45 [header file][header] for details.
49 The `fml_assemble()` output is in fact a graph. You may have a look at the
50 `fml_utg_print_gfa()` function in [misc.c][misc] about how to derive a
51 [GFA][gfa] representation from an array of `fml_utg_t` objects.
4652
4753 ## Overview of the Assembly Algorithm
4854
8591 [fk]: http://github.com/lh3/fermikit
8692 [example]: https://github.com/lh3/fermi-lite/blob/master/example.c
8793 [header]: https://github.com/lh3/fermi-lite/blob/master/fml.h
94 [misc]: https://github.com/lh3/fermi-lite/blob/master/misc.c
95 [gfa]: https://github.com/pmelsted/GFA-spec
1414
1515 #define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
1616 #define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0)
17
18 static int fm_verbose = 1;
1917
2018 /******************
2119 * Closed bubbles *
177175 mag_g_merge(g, 0, 0);
178176 }
179177
180 int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int aggressive)
178 int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int max_bdiff, int aggressive)
181179 {
182180 magv_t *p = &g->v.a[idd>>1], *q[2];
183181 ku128_v *r;
194192 dir[j] = x&1;
195193 q[j] = &g->v.a[x>>1];
196194 if (q[j]->nei[0].n != 1 || q[j]->nei[1].n != 1) return ret; // no bubble
197 l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y);
195 l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); // bubble length, excluding overlaps
198196 }
199197 if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble
198 if (l[0] - l[1] > max_bdiff || l[1] - l[0] > max_bdiff) return 1; // huge bubble differences
200199 for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[]
201200 if (l[j] > 0) {
202201 seq[j] = malloc(l[j]<<1);
253252 return ret;
254253 }
255254
256 void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive)
255 void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive)
257256 {
258257 int64_t i, n_examined = 0, n_popped = 0;
259258 int ret;
260259
261260 for (i = 0; i < g->v.n; ++i) {
262 ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, aggressive);
261 ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, max_bdiff, aggressive);
263262 if (ret >= 1) ++n_examined;
264263 if (ret >= 2) ++n_popped;
265 ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, aggressive);
264 ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, max_bdiff, aggressive);
266265 if (ret >= 1) ++n_examined;
267266 if (ret >= 2) ++n_popped;
268267 }
55 int main(int argc, char *argv[])
66 {
77 fml_opt_t opt;
8 int c, n_seqs, n_utg;
8 int c, n_seqs, n_utg, gfa_out = 0;
99 bseq1_t *seqs;
1010 fml_utg_t *utg;
1111
1212 fml_opt_init(&opt);
13 while ((c = getopt(argc, argv, "Ae:l:r:t:c:")) >= 0) {
13 while ((c = getopt(argc, argv, "gOAe:l:r:t:c:d:v:")) >= 0) {
1414 if (c == 'e') opt.ec_k = atoi(optarg);
1515 else if (c == 'l') opt.min_asm_ovlp = atoi(optarg);
1616 else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg);
1717 else if (c == 'A') opt.mag_opt.flag |= MAG_F_AGGRESSIVE;
18 else if (c == 'O') opt.mag_opt.flag &= ~MAG_F_POPOPEN;
19 else if (c == 'd') opt.mag_opt.max_bdiff = atoi(optarg);
1820 else if (c == 't') opt.n_threads = atoi(optarg);
21 else if (c == 'g') gfa_out = 1;
22 else if (c == 'v') fm_verbose = atoi(optarg);
1923 else if (c == 'c') {
2024 char *p;
2125 opt.min_cnt = strtol(optarg, &p, 10);
3034 fprintf(stderr, " -l INT min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp);
3135 fprintf(stderr, " -r FLOAT drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1);
3236 fprintf(stderr, " -t INT number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads);
33 fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes)\n");
37 fprintf(stderr, " -d INT retain a bubble if one side is longer than the other side by >INT-bp [%d]\n", opt.mag_opt.max_bdiff);
38 fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes; override -O)\n");
39 fprintf(stderr, " -O don't apply aggressive tip trimming\n");
40 fprintf(stderr, " -g output the assembly graph in the GFA format\n");
3441 return 1;
3542 }
3643 seqs = bseq_read(argv[optind], &n_seqs);
3744 utg = fml_assemble(&opt, n_seqs, seqs, &n_utg);
38 fml_utg_print(n_utg, utg);
45 if (!gfa_out) fml_utg_print(n_utg, utg);
46 else fml_utg_print_gfa(n_utg, utg);
3947 fml_utg_destroy(n_utg, utg);
4048 return 0;
4149 }
00 #ifndef FML_H
11 #define FML_H
22
3 #define FML_VERSION "r41"
3 #define FML_VERSION "r55"
44
55 #include <stdint.h>
66
77 typedef struct {
88 int32_t l_seq;
9 char *seq, *qual;
9 char *seq, *qual; // NULL-terminated strings; length expected to match $l_seq
1010 } bseq1_t;
1111
12 #define MAG_F_AGGRESSIVE 0x20
13 #define MAG_F_NO_SIMPL 0x80
12 #define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default)
13 #define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default)
14 #define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default)
1415
1516 typedef struct {
16 int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bvtx, min_merge_len, trim_len, trim_depth;
17 int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth;
1718 float min_dratio1, max_bcov, max_bfrac;
1819 } magopt_t;
1920
3031 struct mag_t;
3132
3233 typedef struct {
33 uint32_t tid;
34 uint32_t len:31, from:1;
34 uint32_t len:31, from:1; // $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end
35 uint32_t id:31, to:1; // $id: unitig number
3536 } fml_ovlp_t;
3637
3738 typedef struct {
4243 int n_ovlp[2]; // number of 5'-end [0] and 3'-end [1] overlaps
4344 fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1]
4445 } fml_utg_t;
46
47 extern int fm_verbose;
4548
4649 #ifdef __cplusplus
4750 extern "C" {
120123 * @param n number of sequences
121124 * @param seq array of sequences; FREED on return
122125 *
123 * @return FMD-index
126 * @return FMD-index on success; NULL if all input sequences are zero in length
124127 */
125128 struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq);
126129
161164 void fml_utg_print(int n_utgs, const fml_utg_t *utg);
162165
163166 /**
167 * Output unitig graph in the GFA format
168 *
169 * @param n_utg number of unitigs
170 * @param utg array of unitigs
171 */
172 void fml_utg_print_gfa(int n, const fml_utg_t *utg);
173
174 /**
164175 * Deallocate an FM-index
165176 *
166177 * @param e pointer to the FM-index
2727 #define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
2828 #define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0)
2929
30 static int fm_verbose = 1;
30 int fm_verbose = 1;
3131
3232 /*********************
3333 * Vector operations *
552552 o->max_bfrac = 0.15;
553553 o->max_bvtx = 64;
554554 o->max_bdist = 512;
555 o->max_bdiff = 50;
555556 }
556557
557558 void mag_g_clean(mag_t *g, const magopt_t *opt)
567568 for (j = 2; j <= opt->min_ensr; ++j)
568569 mag_g_rm_vext(g, opt->min_elen, j);
569570 mag_g_merge(g, 0, opt->min_merge_len);
570 if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen);
571 if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen);
571572 if (!(opt->flag & MAG_F_NO_SIMPL)) mag_g_simplify_bubble(g, opt->max_bvtx, opt->max_bdist);
572 mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->flag & MAG_F_AGGRESSIVE);
573 mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->max_bdiff, opt->flag & MAG_F_AGGRESSIVE);
573574 mag_g_rm_vint(g, opt->min_elen, opt->min_insr, g->min_ovlp);
574575 mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr);
575576 mag_g_merge(g, 1, opt->min_merge_len);
576577 mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
577578 mag_g_merge(g, 0, opt->min_merge_len);
578 if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen);
579 if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen);
579580 mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
580581 mag_g_merge(g, 0, opt->min_merge_len);
581582 }
4848 void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr);
4949 void mag_g_merge(mag_t *g, int rmdup, int min_merge_len);
5050 void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist);
51 void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive);
51 void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive);
5252 void mag_g_pop_open(mag_t *g, int min_elen);
5353 void mag_g_trim_open(mag_t *g, const magopt_t *opt);
5454
3636 opt->min_asm_ovlp = 33;
3737 opt->min_merge_len = 0;
3838 mag_init_opt(&opt->mag_opt);
39 opt->mag_opt.flag = MAG_F_NO_SIMPL;
39 opt->mag_opt.flag = MAG_F_NO_SIMPL | MAG_F_POPOPEN;
4040 }
4141
4242 void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs)
7070 const uint8_t *block;
7171 rld_t *e = 0;
7272 int k;
73
74 for (k = 0; k < n; ++k)
75 if (seq[k].l_seq > 0)
76 break;
77 if (k == n) return 0;
7378
7479 mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO);
7580 for (k = 0; k < n; ++k) {
192197 t = &q->ovlp[a++];
193198 k = kh_get(64, h, s->x);
194199 assert(k != kh_end(h));
195 t->tid = kh_val(h, k);
200 t->id = kh_val(h, k) >> 1;
201 t->to = kh_val(h, k) & 1;
196202 t->len = s->y;
197203 t->from = from;
198204 }
216222 kputc('\t', &out); kputw(u->nsr, &out);
217223 kputc('\t', &out);
218224 for (j = 0; j < u->n_ovlp[0]; ++j) {
219 kputw(u->ovlp[j].tid, &out); kputc(',', &out);
225 kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out);
220226 kputw(u->ovlp[j].len, &out); kputc(';', &out);
221227 }
222228 if (u->n_ovlp[0] == 0) kputc('.', &out);
223229 kputc('\t', &out);
224230 for (; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
225 kputw(u->ovlp[j].tid, &out); kputc(',', &out);
231 kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out);
226232 kputw(u->ovlp[j].len, &out); kputc(';', &out);
227233 }
228234 if (u->n_ovlp[1] == 0) kputc('.', &out);
237243 free(out.s);
238244 }
239245
246 void fml_utg_print_gfa(int n, const fml_utg_t *utg)
247 {
248 int i, j;
249 FILE *fp = stdout;
250 fputs("H\tVN:Z:1.0\n", fp);
251 for (i = 0; i < n; ++i) {
252 const fml_utg_t *u = &utg[i];
253 fprintf(fp, "S\t%d\t", i);
254 fputs(u->seq, fp);
255 fprintf(fp, "\tLN:i:%d\tRC:i:%d\tPD:Z:", u->len, u->nsr);
256 fputs(u->cov, fp);
257 fputc('\n', fp);
258 for (j = 0; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
259 fml_ovlp_t *o = &u->ovlp[j];
260 if (i < o->id)
261 fprintf(fp, "L\t%d\t%c\t%d\t%c\t%dM\n", i, "+-"[!o->from], o->id, "+-"[o->to], o->len);
262 }
263 }
264 }
265
240266 void fml_utg_destroy(int n, fml_utg_t *utg)
241267 {
242268 int i;
258284 fml_opt_t opt = *opt0;
259285 float kcov;
260286
287 *n_utg = 0;
261288 fml_opt_adjust(&opt, n_seqs, seqs);
262289 if (opt.ec_k >= 0) fml_correct(&opt, n_seqs, seqs);
263290 kcov = fml_fltuniq(&opt, n_seqs, seqs);
264291 e = fml_seq2fmi(&opt, n_seqs, seqs);
292 if (e == 0) return 0; // this may happen when all sequences are filtered out
265293 g = fml_fmi2mag(&opt, e);
266294 opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > kcov * MAG_MIN_NSR_COEF? opt.mag_opt.min_ensr : (int)(kcov * MAG_MIN_NSR_COEF + .499);
267295 opt.mag_opt.min_ensr = opt.mag_opt.min_ensr < opt0->max_cnt? opt.mag_opt.min_ensr : opt0->max_cnt;