#include <limits.h>
#include <stdio.h>
#include "mag.h"
#include "kvec.h"
#include "ksw.h"
#include "internal.h"
#include "khash.h"
KHASH_DECLARE(64, uint64_t, uint64_t)
typedef khash_t(64) hash64_t;
#define MAX_N_DIFF 2.01 // for evaluating alignment after SW
#define MAX_R_DIFF 0.1
#define L_DIFF_COEF 0.2 // n_diff=|l_0 - l_1|*L_DIFF_COEF
#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
#define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0)
/******************
* Closed bubbles *
******************/
typedef struct {
uint64_t id;
int cnt[2];
int n[2][2], d[2][2];
uint64_t v[2][2];
} trinfo_t;
const trinfo_t g_trinull = {-1, {0, 0}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{-1, -1}, {-1, -1}}};
typedef struct {
int n, m;
trinfo_t **buf;
} tipool_t;
struct mogb_aux {
tipool_t pool;
ku64_v stack;
hash64_t *h;
};
mogb_aux_t *mag_b_initaux(void)
{
mogb_aux_t *aux = calloc(1, sizeof(mogb_aux_t));
aux->h = kh_init(64);
return aux;
}
void mag_b_destroyaux(mogb_aux_t *b)
{
int i;
for (i = 0; i < b->pool.m; ++i)
free(b->pool.buf[i]);
free(b->pool.buf); free(b->stack.a);
kh_destroy(64, b->h);
free(b);
}
#define tiptr(p) ((trinfo_t*)(p)->ptr)
static inline trinfo_t *tip_alloc(tipool_t *pool, uint32_t id)
{ // allocate an object from the memory pool
trinfo_t *p;
if (pool->n == pool->m) {
int i, new_m = pool->m? pool->m<<1 : 256;
pool->buf = realloc(pool->buf, new_m * sizeof(void*));
for (i = pool->m; i < new_m; ++i)
pool->buf[i] = malloc(sizeof(trinfo_t));
pool->m = new_m;
}
p = pool->buf[pool->n++];
*p = g_trinull;
p->id = id;
return p;
}
static void backtrace(mag_t *g, uint64_t end, uint64_t start, hash64_t *h)
{
while (end>>32 != start) {
int ret;
kh_put(64, h, end>>33, &ret);
end = tiptr(&g->v.a[end>>33])->v[(end>>32^1)&1][end&1];
}
}
void mag_vh_simplify_bubble(mag_t *g, uint64_t idd, int max_vtx, int max_dist, mogb_aux_t *a)
{
int i, n_pending = 0;
magv_t *p, *q;
p = &g->v.a[idd>>1];
if (p->len < 0 || p->nei[idd&1].n < 2) return; // stop if p is deleted or it has 0 or 1 neighbor
// reset aux data
a->stack.n = a->pool.n = 0;
if (kh_n_buckets(a->h) >= 64) {
kh_destroy(64, a->h);
a->h = kh_init(64);
} else kh_clear(64, a->h);
// add the initial vertex
p->ptr = tip_alloc(&a->pool, idd>>1);
tiptr(p)->d[(idd&1)^1][0] = -p->len;
tiptr(p)->n[(idd&1)^1][0] = -p->nsr;
kv_push(uint64_t, a->stack, idd^1);
// essentially a topological sorting
while (a->stack.n) {
uint64_t x, y;
ku128_v *r;
if (a->stack.n == 1 && a->stack.a[0] != (idd^1) && n_pending == 0) break; // found the other end of the bubble
x = kv_pop(a->stack);
p = &g->v.a[x>>1];
//printf("%lld:%lld\n", p->k[0], p->k[1]);
r = &p->nei[(x&1)^1]; // we will look the the neighbors from the other end of the unitig
if (a->pool.n > max_vtx || tiptr(p)->d[x&1][0] > max_dist || tiptr(p)->d[x&1][1] > max_dist || r->n == 0) break; // we failed
// set the distance to p's neighbors
for (i = 0; i < r->n; ++i) {
int nsr, dist, which;
if ((int64_t)r->a[i].x < 0) continue;
y = mag_tid2idd(g->h, r->a[i].x);
if (y == (idd^1)) { // there is a loop involving the initial vertex
a->stack.n = 0;
break; // not a bubble; stop; this will jump out of the while() loop
}
q = &g->v.a[y>>1];
if (q->ptr == 0) { // has not been attempted
q->ptr = tip_alloc(&a->pool, y>>1), ++n_pending;
mag_v128_clean(&q->nei[y&1]); // make sure there are no deleted edges
}
nsr = tiptr(p)->n[x&1][0] + p->nsr; which = 0;
dist = tiptr(p)->d[x&1][0] + p->len - r->a[i].y;
//printf("01 [%d]\t[%d,%d]\t[%d,%d]\n", i, tiptr(q)->n[y&1][0], tiptr(q)->n[y&1][1], tiptr(q)->d[y&1][0], tiptr(q)->d[y&1][1]);
// test and possibly update the tentative distance
if (nsr > tiptr(q)->n[y&1][0]) { // then move the best to the 2nd best and update the best
tiptr(q)->n[y&1][1] = tiptr(q)->n[y&1][0]; tiptr(q)->n[y&1][0] = nsr;
tiptr(q)->v[y&1][1] = tiptr(q)->v[y&1][0]; tiptr(q)->v[y&1][0] = (x^1)<<32|i<<1|which;
tiptr(q)->d[y&1][1] = tiptr(q)->d[y&1][0]; tiptr(q)->d[y&1][0] = dist;
nsr = tiptr(p)->n[x&1][1] + p->nsr; which = 1; // now nsr is the 2nd best
dist = tiptr(p)->d[x&1][1] + p->len - r->a[i].y;
}
if (nsr > tiptr(q)->n[y&1][1]) // update the 2nd best
tiptr(q)->n[y&1][1] = nsr, tiptr(q)->v[y&1][1] = (x^1)<<32|i<<1|which, tiptr(q)->d[y&1][1] = dist;
if (++tiptr(q)->cnt[y&1] == q->nei[y&1].n) { // all q's predecessors have been processed; then push
kv_push(uint64_t, a->stack, y);
--n_pending;
}
}
}
if (n_pending == 0 && a->stack.n == 1) { // found a bubble
uint64_t x = a->stack.a[0];
p = &g->v.a[x>>1];
//printf("(%d,%d)\t(%d,%d)\n", tiptr(p)->n[x&1][0], tiptr(p)->n[x&1][1], tiptr(p)->d[x&1][0], tiptr(p)->d[x&1][1]);
backtrace(g, tiptr(p)->v[x&1][0], idd, a->h);
backtrace(g, tiptr(p)->v[x&1][1], idd, a->h);
}
for (i = 0; i < a->pool.n; ++i) // reset p->ptr
g->v.a[a->pool.buf[i]->id].ptr = 0;
if (kh_size(a->h)) { // bubble detected; then remove verticies not in the top two paths
for (i = 1; i < a->pool.n; ++i) { // i=0 corresponds to the initial vertex which we want to exclude
uint64_t id = a->pool.buf[i]->id;
if (id != a->stack.a[0]>>1 && kh_get(64, a->h, id) == kh_end(a->h)) // not in the top two paths
mag_v_del(g, &g->v.a[id]);
}
}
}
void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist)
{
int64_t i;
mogb_aux_t *a;
a = mag_b_initaux();
for (i = 0; i < g->v.n; ++i) {
mag_vh_simplify_bubble(g, i<<1|0, max_vtx, max_dist, a);
mag_vh_simplify_bubble(g, i<<1|1, max_vtx, max_dist, a);
}
mag_b_destroyaux(a);
mag_g_merge(g, 0, 0);
}
int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int max_bdiff, int aggressive)
{
magv_t *p = &g->v.a[idd>>1], *q[2];
ku128_v *r;
int i, j, k, dir[2], l[2], ret = -1;
char *seq[2], *cov[2];
float n_diff, r_diff, avg[2], max_n_diff = aggressive? MAX_N_DIFF * 2. : MAX_N_DIFF;
if (p->len < 0 || p->nei[idd&1].n != 2) return ret; // deleted or no bubble
r = &p->nei[idd&1];
for (j = 0; j < 2; ++j) {
uint64_t x;
if ((int64_t)r->a[j].x < 0) return ret;
x = mag_tid2idd(g->h, r->a[j].x);
dir[j] = x&1;
q[j] = &g->v.a[x>>1];
if (q[j]->nei[0].n != 1 || q[j]->nei[1].n != 1) return ret; // no bubble
l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); // bubble length, excluding overlaps
}
if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble
if (l[0] - l[1] > max_bdiff || l[1] - l[0] > max_bdiff) return 1; // huge bubble differences
for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[]
if (l[j] > 0) {
seq[j] = malloc(l[j]<<1);
cov[j] = seq[j] + l[j];
for (i = 0; i < l[j]; ++i) {
seq[j][i] = q[j]->seq[i + q[j]->nei[0].a->y];
cov[j][i] = q[j]->cov[i + q[j]->nei[0].a->y];
}
if (dir[j]) {
seq_revcomp6(l[j], (uint8_t*)seq[j]);
seq_reverse(l[j], (uint8_t*)cov[j]);
}
for (i = 0, avg[j] = 0.; i < l[j]; ++i) {
--seq[j][i]; // change DNA6 encoding to DNA4 for SW below
avg[j] += cov[j][i] - 33;
}
avg[j] /= l[j];
} else { // l[j] <= 0; this may happen around a tandem repeat
int beg, end;
seq[j] = cov[j] = 0;
beg = q[j]->nei[0].a->y; end = q[j]->len - q[j]->nei[1].a->y;
if (beg > end) beg ^= end, end ^= beg, beg ^= end; // swap
if (beg < end) {
for (i = beg, avg[j] = 0.; i < end; ++i)
avg[j] += q[j]->cov[i] - 33;
avg[j] /= end - beg;
} else avg[j] = q[j]->cov[beg] - 33; // FIXME: when q[j] is contained, weird thing may happen
}
}
ret = 1;
if (l[0] > 0 && l[1] > 0) { // then do SW to compute n_diff and r_diff
int8_t mat[16];
kswr_t aln;
for (i = k = 0; i < 4; ++i)
for (j = 0; j < 4; ++j)
mat[k++] = i == j? 5 : -4;
aln = ksw_align(l[0], (uint8_t*)seq[0], l[1], (uint8_t*)seq[1], 4, mat, 5, 2, 0, 0);
n_diff = ((l[0] < l[1]? l[0] : l[1]) * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatchig score
r_diff = n_diff / ((l[0] + l[1]) / 2.);
//fprintf(stderr, "===> %f %f <===\n", n_diff, r_diff); for (j = 0; j < 2; ++j) { for (i = 0; i < l[j]; ++i) fputc("ACGTN"[(int)seq[j][i]], stderr); fputc('\n', stderr); }
} else {
n_diff = abs(l[0] - l[1]) * L_DIFF_COEF;
r_diff = 1.;
//fprintf(stderr, "---> (%d,%d) <---\n", l[0], l[1]);
}
if (n_diff < max_n_diff || r_diff < MAX_R_DIFF) {
j = avg[0] < avg[1]? 0 : 1;
if (aggressive || (avg[j] < max_cov && avg[j] / (avg[j^1] + avg[j]) < max_frac)) {
mag_v_del(g, q[j]);
ret = 2;
}
}
free(seq[0]); free(seq[1]);
return ret;
}
void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive)
{
int64_t i, n_examined = 0, n_popped = 0;
int ret;
for (i = 0; i < g->v.n; ++i) {
ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, max_bdiff, aggressive);
if (ret >= 1) ++n_examined;
if (ret >= 2) ++n_popped;
ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, max_bdiff, aggressive);
if (ret >= 1) ++n_examined;
if (ret >= 2) ++n_popped;
}
if (fm_verbose >= 3)
fprintf(stderr, "[M::%s] examined %ld bubbles and popped %ld\n", __func__, (long)n_examined, (long)n_popped);
mag_g_merge(g, 0, min_merge_len);
}
/****************
* Open bubbles *
****************/
void mag_v_pop_open(mag_t *g, magv_t *p, int min_elen)
{
int i, j, k, l, dir, max_l, l_qry;
magv_t *q, *t;
ku128_v *r, *s;
uint8_t *seq;
int8_t mat[16];
if (p->len < 0 || p->len >= min_elen) return;
//if (p->nei[0].n && p->nei[1].n) return; // FIXME: between this and the next line, which is better?
if (p->nei[0].n + p->nei[1].n != 1) return;
dir = p->nei[0].n? 0 : 1;
// initialize the scoring system
for (i = k = 0; i < 4; ++i)
for (j = 0; j < 4; ++j)
mat[k++] = i == j? 5 : -4;
s = &p->nei[dir];
for (l = 0; l < s->n; ++l) { // if we use "if (p->nei[0].n + p->nei[1].n != 1)", s->n == 1
uint64_t v;
kswq_t *qry;
if ((int64_t)s->a[l].x < 0) continue;
v = mag_tid2idd(g->h, s->a[l].x);
q = &g->v.a[v>>1];
if (q == p || q->nei[v&1].n == 1) continue;
// get the query ready
max_l = (p->len - s->a[l].y) * 2;
seq = malloc(max_l + 1);
if (dir == 0) { // forward strand
for (j = s->a[l].y, k = 0; j < p->len; ++j)
seq[k++] = p->seq[j] - 1;
} else { // reverse
for (j = p->len - s->a[l].y - 1, k = 0; j >= 0; --j)
seq[k++] = 4 - p->seq[j];
}
l_qry = k;
qry = ksw_qinit(2, l_qry, seq, 4, mat);
//fprintf(stderr, "===> %lld:%lld:%d[%d], %d, %ld <===\n", p->k[0], p->k[1], s->n, l, p->nsr, q->nei[v&1].n);
//for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fputc('\n', stderr);
r = &q->nei[v&1];
for (i = 0; i < r->n; ++i) {
uint64_t w;
kswr_t aln;
if (r->a[i].x == p->k[dir] || (int64_t)r->a[i].x < 0) continue;
w = mag_tid2idd(g->h, r->a[i].x);
// get the target sequence
t = &g->v.a[w>>1];
if (w&1) { // reverse strand
for (j = t->len - r->a[i].y - 1, k = 0; j >= 0 && k < max_l; --j)
seq[k++] = 4 - t->seq[j];
} else {
for (j = r->a[i].y, k = 0; j < t->len && k < max_l; ++j)
seq[k++] = t->seq[j] - 1;
}
aln = ksw_align(0, 0, k, seq, 4, mat, 5, 2, 0, &qry);
//for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fprintf(stderr, "\t%d\t%f\n", aln.score, (l_qry * 5. - aln.score) / (5. + 4.));
if (aln.score >= l_qry * 5 / 2) {
double r_diff, n_diff;
n_diff = (l_qry * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatchig score
r_diff = n_diff / l_qry;
if (n_diff < MAX_N_DIFF || r_diff < MAX_R_DIFF) break;
}
}
if (i != r->n) {
// mark delete in p and delete in q
edge_mark_del(s->a[l]);
for (i = 0; i < r->n; ++i)
if (r->a[i].x == p->k[dir])
edge_mark_del(r->a[i]);
}
free(seq); free(qry);
}
for (i = 0; i < s->n; ++i)
if (!edge_is_del(s->a[i])) break;
if (i == s->n) mag_v_del(g, p); // p is not connected to any other vertices
}
void mag_g_pop_open(mag_t *g, int min_elen)
{
int64_t i;
for (i = 0; i < g->v.n; ++i)
mag_v_pop_open(g, &g->v.a[i], min_elen);
if (fm_verbose >= 3)
fprintf(stderr, "[M:%s] popped open bubbles\n", __func__);
mag_g_merge(g, 0, 0);
}