Commit 2d9efbe6bb89dd4905a301d3579bc47ac427c186 - ariba

+2

-2

ariba/ref_genes_getter.py less more

182	182	print('You can use them with ARIBA like this:')
183	183	print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
184	184	print('If you use this downloaded data, please cite:')
185		print('"The Comprehensive Antibiotic Resistance Database", McArthur et al 2013, PMID: 23650175')
	185	print('"CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database", Alcock et al 2020, PMID: 31665441')
186	186	print('and in your methods say that version', self.version, 'of the database was used')
187	187
188	188

658	658	print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
659	659
660	660	else:
661		print(f"Nothing to do. Exiting.")
	661	print(f"Nothing to do. Exiting.")
662	662	def run(self, outprefix):
663	663	exec('self._get_from_' + self.ref_db + '(outprefix)')

+3

-0

third_party/fermi-lite-0.1/.gitignore less more

	0	*.o
	1	*.a
	2	.*.swp

+23

-0

third_party/fermi-lite-0.1/LICENSE.txt less more

	0	The MIT License
	1
	2	Copyright (c) 2016 Broad Institute
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.

+88

-0

third_party/fermi-lite-0.1/README.md less more

	0	## Getting Started
	1	```sh
	2	git clone https://github.com/lh3/fermi-lite
	3	cd fermi-lite && make
	4	./fml-asm test/MT-simu.fq.gz > MT.fq
	5	# to compile your program:
	6	gcc -Wall -O2 prog.c -o prog -L/path/to/fermi-lite -lfml -lz -lm -lpthread
	7	```
	8
	9	## Introduction
	10
	11	Fermi-lite is a standalone C library as well as a command-line tool for
	12	assembling Illumina short reads in regions from 100bp to 10 million bp in size.
	13	It is largely a light-weight in-memory version of [fermikit][fk] without
	14	generating any intermediate files. It inherits the performance, the relatively
	15	small memory footprint and the features of fermikit. In particular, fermi-lite
	16	is able to retain heterozygous events and thus can be used to assemble diploid
	17	regions for the purpose of variant calling. It is one of the limited choices
	18	for local re-assembly and arguably the easiest to interface.
	19
	20	## Usage
	21
	22	For now, see [example.c][example] for the basic use of the library. Here is a
	23	sketch of the example:
	24	```cpp
	25	#include <stdio.h> // for printf()
	26	#include "fml.h" // only one header file required
	27
	28	int main(int argc, char *argv[])
	29	{
	30	int i, n_seqs, n_utgs;
	31	bseq1_t *seqs; // array of input sequences
	32	fml_utg_t *utgs; // array of output unitigs
	33	fml_opt_t opt;
	34	if (argc == 1) return 1; // do nothing if there is no input file
	35	seqs = bseq_read(argv[1], &n_seqs); // or fill the array with callers' functions
	36	fml_opt_init(&opt); // initialize parameters
	37	utgs = fml_assemble(&opt, n_seqs, seqs, &n_utgs); // assemble!
	38	for (i = 0; i < n_utgs; ++i) // output in fasta
	39	printf(">%d\n%s\n", i+1, utgs[i].seq);
	40	fml_utg_destroy(n_utgs, utgs); // deallocate unitigs
	41	return 0;
	42	}
	43	```
	44	The direct assembly output is in fact a graph. You may have a look at the
	45	[header file][header] for details.
	46
	47	## Overview of the Assembly Algorithm
	48
	49	Fermi-lite is an overlap-based assembler. Given a set of input reads, it counts
	50	k-mers, estimates the k-mer coverage, sets a threshold on k-mer
	51	occurrences to determine solid k-mers and then uses them to correct sequencing
	52	errors ([Li, 2015][bfc-paper]). After error correction, fermi-lite trims a read
	53	at an l-mer unique to the read. It then constructs an FM-index for trimmed
	54	reads ([Li, 2014][rb2-paper]) and builds a transitively reduced overlap graph from the
	55	FM-index ([Simpson and Durbin, 2010][sga-paper]; [Li, 2012][fm1-paper]),
	56	requiring at least l-bp overlaps. In this graph, fermi-lite trims tips and
	57	pops bubbles caused by uncorrected errors. If a sequence in the graph has
	58	multiple overlaps, fermi-lite discards overlaps significantly shorter than the
	59	longest overlap -- this is a technique applied to overlap graph only. The graph
	60	after these procedures is the final output. Sequences in this graph are unitigs.
	61
	62	## Limitations
	63
	64	1. Fermi-lite can efficiently assemble bacterial genomes. However, it has not
	65	been carefully tuned for this type of assembly. While on a few GAGE-B data
	66	sets fermi-lite appears to work well, it may not compete with recent
	67	mainstream assemblers in general.
	68
	69	2. Fermi-lite does not work with genomes more than tens of megabases as a
	70	whole. It would take too much memory to stage all data in memory. For large
	71	genomes, please use [fermikit][fk] instead.
	72
	73	3. This is the first iteration of fermi-lite. It is still immature. In
	74	particular, I hope fermi-lite can be smart enough to automatically figure
	75	out various parameters based on input, which is very challenging given the
	76	high variability of input data.
	77
	78	[sga-paper]: http://www.ncbi.nlm.nih.gov/pubmed/20529929
	79	[bfc-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25953801
	80	[rb2-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25107872
	81	[fm1-paper]: http://www.ncbi.nlm.nih.gov/pubmed/22569178
	82	[bfc]: http://github.com/lh3/bfc
	83	[rb2]: http://github.com/lh3/ropebwt2
	84	[fm2]: http://github.com/lh3/fermi2
	85	[fk]: http://github.com/lh3/fermikit
	86	[example]: https://github.com/lh3/fermi-lite/blob/master/example.c
	87	[header]: https://github.com/lh3/fermi-lite/blob/master/fml.h

+674

-0

third_party/fermi-lite-0.1/bfc.c less more

	0	#include <stdlib.h>
	1	#include <string.h>
	2	#include <assert.h>
	3	#include <limits.h>
	4	#include <stdio.h>
	5	#include "htab.h"
	6	#include "kmer.h"
	7	#include "internal.h"
	8	#include "fml.h"
	9
	10	/*******************
	11	* BFC options *
	12	*******************/
	13
	// Options shared by the k-mer counting and error-correction code in this file.
	14	typedef struct {
	// n_threads: thread count handed to kt_for(); q: base-quality threshold
	// (compared against qual - 33); k: k-mer length;
	// l_pre: forwarded to bfc_ch_init() -- its meaning is not visible here.
	15	int n_threads, q, k, l_pre;
	16	int min_cov; // a k-mer is considered solid if the count is no less than this
	17
	// max_end_ext: bfc_ec1dir() stops a path once it runs more than this far past `end`.
	18	int max_end_ext;
	// win_multi_ec: window inside which further (clustered) corrections are rejected.
	19	int win_multi_ec;
	// min_trim_frac: fraction of the read that must survive trimming in worker_ec().
	20	float min_trim_frac;
	21
	22	// these ec options cannot be changed on the command line
	// w_*: weights combined by the weighted_penalty() macro.
	23	int w_ec, w_ec_high, w_absent, w_absent_high;
	// max_path_diff: penalty slack over the best finished path before the search stops;
	// max_heap: heap size above which only the cheapest extension is pushed.
	24	int max_path_diff, max_heap;
	25	} bfc_opt_t;
	26
	// Zero *opt and fill in the default option values.
	// k and l_pre are left at -1 here; fml_correct_core() assigns them after calling this.
	27	void bfc_opt_init(bfc_opt_t *opt)
	28	{
	29	memset(opt, 0, sizeof(bfc_opt_t));
	30	opt->n_threads = 1;
	31	opt->q = 20;
	32	opt->k = -1;
	33	opt->l_pre = -1;
	34
	35	opt->min_cov = 4; // in BFC, this defaults to 3 because it has Bloom pre-filter
	36	opt->win_multi_ec = 10;
	37	opt->max_end_ext = 5;
	38	opt->min_trim_frac = .8;
	39
	// penalty weights and search limits used by bfc_ec1dir()
	40	opt->w_ec = 1;
	41	opt->w_ec_high = 7;
	42	opt->w_absent = 3;
	43	opt->w_absent_high = 1;
	44	opt->max_path_diff = 15;
	45	opt->max_heap = 100;
	46	}
	47
	48	/**********************
	49	* K-mer counting *
	50	**********************/
	51
	52	#define CNT_BUF_SIZE 256
	53
	// One pending insertion: the two hash words produced by bfc_kmer_hash()
	// plus the is_high flag that is later passed to bfc_ch_insert().
	54	typedef struct { // cache to reduce locking
	55	uint64_t y[2];
	56	int is_high;
	57	} insbuf_t;
	58
	// Shared state for the counting pass driven by fml_count()/worker_count().
	59	typedef struct {
	// k: k-mer length; q: base-quality threshold
	60	int k, q;
	61	int n_seqs;
	62	const bseq1_t *seqs;
	// hash table receiving the k-mers (created by bfc_ch_init())
	63	bfc_ch_t *ch;
	// n_buf[tid]: number of entries currently held in buf[tid]
	64	int *n_buf;
	// buf[tid]: per-thread array of CNT_BUF_SIZE pending insertions
	65	insbuf_t **buf;
	66	} cnt_step_t;
	67
	68	bfc_kmer_t bfc_kmer_null = {{0,0,0,0}};
	69
	// Retry every insertion buffered for thread `tid`, passing `forced` through to
	// bfc_ch_insert(). Entries whose insert still returns a negative value are
	// compacted to the front of the buffer. Returns the number of entries left
	// (0 immediately when cs->ch is null).
	70	static int bfc_kmer_bufclear(cnt_step_t *cs, int forced, int tid)
	71	{
	72	int i, k, r;
	73	if (cs->ch == 0) return 0;
	// i scans the buffer, k is the write position for entries that must be kept
	74	for (i = k = 0; i < cs->n_buf[tid]; ++i) {
	75	r = bfc_ch_insert(cs->ch, cs->buf[tid][i].y, cs->buf[tid][i].is_high, forced);
	76	if (r < 0) cs->buf[tid][k++] = cs->buf[tid][i];
	77	}
	78	cs->n_buf[tid] = k;
	79	return k;
	80	}
	81
	// Hash k-mer x and try a non-forced insert into the table. If that insert
	// returns a negative value, the k-mer is parked in the per-thread buffer
	// after first trying to drain that buffer.
	82	static void bfc_kmer_insert(cnt_step_t cs, const bfc_kmer_t x, int is_high, int tid)
	83	{
	84	int k = cs->k;
	85	uint64_t y[2], hash;
	// NOTE(review): `hash` is assigned but not read again in this function.
	86	hash = bfc_kmer_hash(k, x->x, y);
	87	if (bfc_ch_insert(cs->ch, y, is_high, 0) < 0) {
	88	insbuf_t *p;
	// drain without forcing; if the buffer is still completely full, drain with forced=1
	89	if (bfc_kmer_bufclear(cs, 0, tid) == CNT_BUF_SIZE)
	90	bfc_kmer_bufclear(cs, 1, tid);
	91	p = &cs->buf[tid][cs->n_buf[tid]++];
	92	p->y[0] = y[0], p->y[1] = y[1], p->is_high = is_high;
	93	}
	94	}
	95
	// kt_for() callback: feed every k-mer of sequence number `k` to bfc_kmer_insert().
	// _data is the shared cnt_step_t; tid selects the per-thread buffer.
	96	static void worker_count(void *_data, long k, int tid)
	97	{
	98	cnt_step_t cs = (cnt_step_t)_data;
	99	const bseq1_t *s = &cs->seqs[k];
	100	int i, l;
	101	bfc_kmer_t x = bfc_kmer_null;
	// qmer keeps one bit per base for the last cs->k bases (mask has cs->k low bits set)
	102	uint64_t qmer = 0, mask = (1ULL<<cs->k) - 1;
	// l counts consecutive unambiguous bases seen so far
	103	for (i = l = 0; i < s->l_seq; ++i) {
	104	int c = seq_nt6_table[(uint8_t)s->seq[i]] - 1;
	105	if (c < 4) {
	106	bfc_kmer_append(cs->k, x.x, c);
	// shift in 1 when there is no quality string or the base quality reaches cs->q
	107	qmer = (qmer<<1 \| (s->qual == 0 \|\| s->qual[i] - 33 >= cs->q)) & mask;
	// is_high is set only when all cs->k quality bits are 1
	108	if (++l >= cs->k) bfc_kmer_insert(cs, &x, (qmer == mask), tid);
	// any other code (c >= 4) restarts the k-mer
	109	} else l = 0, qmer = 0, x = bfc_kmer_null;
	110	}
	111	}
	112
	// Count k-mers of the n input sequences into a new table created by
	// bfc_ch_init(k, l_pre), running worker_count() over all sequences with
	// n_threads threads. Returns the table (cs.ch).
	113	struct bfc_ch_s fml_count(int n, const bseq1_t seq, int k, int q, int l_pre, int n_threads)
	114	{
	115	int i;
	116	cnt_step_t cs;
	117	cs.n_seqs = n, cs.seqs = seq, cs.k = k, cs.q = q;
	118	cs.ch = bfc_ch_init(cs.k, l_pre);
	// one pending-insert buffer of CNT_BUF_SIZE entries per thread
	// NOTE(review): the calloc/malloc results are not checked.
	119	cs.n_buf = calloc(n_threads, sizeof(int));
	120	cs.buf = calloc(n_threads, sizeof(void*));
	121	for (i = 0; i < n_threads; ++i)
	122	cs.buf[i] = malloc(CNT_BUF_SIZE * sizeof(insbuf_t));
	123	kt_for(n_threads, worker_count, &cs, cs.n_seqs);
	// NOTE(review): the buffers are freed without a final bfc_kmer_bufclear();
	// entries still pending at this point appear to be dropped -- confirm this is intended.
	124	for (i = 0; i < n_threads; ++i) free(cs.buf[i]);
	125	free(cs.buf); free(cs.n_buf);
	126	return cs.ch;
	127	}
	128
	129	/***************
	130	* Correct *
	131	***************/
	132
	133	#define BFC_MAX_KMER 63
	134	#define BFC_MAX_BF_SHIFT 37
	135
	136	#define BFC_MAX_PATHS 4
	137	#define BFC_EC_HIST 5
	138	#define BFC_EC_HIST_HIGH 2
	139
	140	#define BFC_EC_MIN_COV_COEF .1
	141
	142	/**************************
	143	* Sequence struct for ec *
	144	**************************/
	145
	146	#include "kvec.h"
	147
	// Per-base record used during error correction.
	148	typedef struct { // NOTE: unaligned memory
	// b/q: current base code and quality flag; ob/oq: the values set at conversion
	// time by bfc_seq_conv() (bfc_ec1() compares b against ob to detect changes)
	149	uint8_t b:3, q:1, ob:3, oq:1;
	150	uint8_t dummy;
	// lcov/hcov and solid_end/high_end are filled by bfc_ec_kcov();
	// ec/absent are filled by buf_backtrack()
	151	uint16_t lcov:6, hcov:6, solid_end:1, high_end:1, ec:1, absent:1;
	// index of the base, assigned in bfc_seq_conv()
	152	int i;
	153	} ecbase_t;
	154
	155	typedef kvec_t(ecbase_t) ecseq_t;
	156
	// Convert sequence string s (and optional quality string q) into ecbase_t
	// records stored in *seq. Returns the sequence length (strlen(s)).
	157	static int bfc_seq_conv(const char s, const char q, int qthres, ecseq_t *seq)
	158	{
	159	int i, l;
	160	l = strlen(s);
	161	kv_resize(ecbase_t, *seq, l);
	162	seq->n = l;
	163	for (i = 0; i < l; ++i) {
	164	ecbase_t *c = &seq->a[i];
	165	c->b = c->ob = seq_nt6_table[(int)s[i]] - 1;
	// quality flag: 1 when no qualities are given or qual - 33 reaches qthres
	166	c->q = c->oq = !q? 1 : q[i] - 33 >= qthres? 1 : 0;
	// base codes above 3 never count as high quality
	167	if (c->b > 3) c->q = c->oq = 0;
	168	c->i = i;
	169	}
	170	return l;
	171	}
	172
	// Return a copy of *b with both b and ob complemented: codes 0..3 map to
	// 3 - code, any other code becomes 4. All other fields are copied unchanged.
	173	static inline ecbase_t ecbase_comp(const ecbase_t *b)
	174	{
	175	ecbase_t r = *b;
	176	r.b = b->b < 4? 3 - b->b : 4;
	177	r.ob = b->ob < 4? 3 - b->ob : 4;
	178	return r;
	179	}
	180
	// Reverse-complement *seq in place by swapping complemented records from both ends.
	181	static void bfc_seq_revcomp(ecseq_t *seq)
	182	{
	183	int i;
	184	for (i = 0; i < seq->n>>1; ++i) {
	185	ecbase_t tmp;
	186	tmp = ecbase_comp(&seq->a[i]);
	187	seq->a[i] = ecbase_comp(&seq->a[seq->n - 1 - i]);
	188	seq->a[seq->n - 1 - i] = tmp;
	189	}
	// odd length: the middle record stays in place but is still complemented
	190	if (seq->n&1) seq->a[i] = ecbase_comp(&seq->a[i]);
	191	}
	192
	193	/***************************
	194	* Independent ec routines *
	195	***************************/
	196
	// Try every single-base substitution of k-mer x and look each variant up with
	// bfc_ch_kmer_occ(). Returns (position<<2 | new_base) for the variant with the
	// highest count when that count * 3 exceeds `mode` and the runner-up count is
	// below 3; otherwise returns -1. Only the low 8 bits of each count are compared.
	197	int bfc_ec_greedy_k(int k, int mode, const bfc_kmer_t x, const bfc_ch_t ch)
	198	{
	199	int i, j, max = 0, max_ec = -1, max2 = 0;
	200	for (i = 0; i < k; ++i) {
	// current base at position i: one bit taken from each of x->x[1] and x->x[0]
	201	int c = (x->x[1]>>i&1)<<1 \| (x->x[0]>>i&1);
	202	for (j = 0; j < 4; ++j) {
	203	bfc_kmer_t y = *x;
	204	int ret;
	205	if (j == c) continue;
	206	bfc_kmer_change(k, y.x, i, j);
	207	ret = bfc_ch_kmer_occ(ch, &y);
	208	if (ret < 0) continue;
	// keep the best (max) and second best (max2) lookups seen so far
	209	if ((max&0xff) < (ret&0xff)) max2 = max, max = ret, max_ec = i<<2 \| j;
	210	else if ((max2&0xff) < (ret&0xff)) max2 = ret;
	211	}
	212	}
	213	return (max&0xff) * 3 > mode && (max2&0xff) < 3? max_ec : -1;
	214	}
	215
	// Starting at index `start`, build in *x the first k-mer made of k consecutive
	// bases with code < 4. Returns the index of the k-mer's last base, or s->n
	// when the loop reaches the end of the sequence without completing a k-mer.
	216	int bfc_ec_first_kmer(int k, const ecseq_t s, int start, bfc_kmer_t x)
	217	{
	218	int i, l;
	219	*x = bfc_kmer_null;
	220	for (i = start, l = 0; i < s->n; ++i) {
	221	ecbase_t *c = &s->a[i];
	222	if (c->b < 4) {
	223	bfc_kmer_append(k, x->x, c->b);
	224	if (++l == k) break;
	// a base code >= 4 restarts the k-mer
	225	} else l = 0, *x = bfc_kmer_null;
	226	}
	227	return i;
	228	}
	229
	// Recompute the per-base coverage fields of s (lcov, hcov, solid_end, high_end)
	// from the occurrence of every k-mer in the sequence, looked up in ch.
	230	void bfc_ec_kcov(int k, int min_occ, ecseq_t s, const bfc_ch_t ch)
	231	{
	232	int i, l, r, j;
	233	bfc_kmer_t x = bfc_kmer_null;
	234	for (i = l = 0; i < s->n; ++i) {
	235	ecbase_t *c = &s->a[i];
	// reset the fields for this base before they are (possibly) set below
	236	c->high_end = c->solid_end = c->lcov = c->hcov = 0;
	237	if (c->b < 4) {
	238	bfc_kmer_append(k, x.x, c->b);
	239	if (++l >= k) {
	240	if ((r = bfc_ch_kmer_occ(ch, &x)) >= 0) {
	// bits 8..13 of r are tested against min_occ+1, the low 8 bits against min_occ
	241	if ((r>>8&0x3f) >= min_occ+1) c->high_end = 1;
	242	if ((r&0xff) >= min_occ) {
	243	c->solid_end = 1;
	// credit all k bases covered by the k-mer ending at i
	244	for (j = i - k + 1; j <= i; ++j)
	245	++s->a[j].lcov, s->a[j].hcov += c->high_end;
	246	}
	247	}
	248	}
	249	} else l = 0, x = bfc_kmer_null;
	250	}
	251	}
	252
	// Find the longest run of consecutive positions with solid_end set, scanning
	// from index k-1. Returns 0 when there is none; otherwise a packed value with
	// (max_i - max - k + 1) in the upper 32 bits and max_i in the lower bits, where
	// max is the run length and max_i the index just past the run.
	253	uint64_t bfc_ec_best_island(int k, const ecseq_t *s)
	254	{ // IMPORTANT: call bfc_ec_kcov() before calling this function!
	255	int i, l, max, max_i;
	256	for (i = k - 1, max = l = 0, max_i = -1; i < s->n; ++i) {
	257	if (!s->a[i].solid_end) {
	258	if (l > max) max = l, max_i = i;
	259	l = 0;
	260	} else ++l;
	261	}
	// account for a run that extends to the end of the sequence
	262	if (l > max) max = l, max_i = i;
	263	return max > 0? (uint64_t)(max_i - max - k + 1) << 32 \| max_i : 0;
	264	}
	265
	266	/********************
	267	* Correct one read *
	268	********************/
	269
	270	#include "ksort.h"
	271
	272	#define ECCODE_MISC 1
	273	#define ECCODE_MANY_N 2
	274	#define ECCODE_NO_SOLID 3
	275	#define ECCODE_UNCORR_N 4
	276	#define ECCODE_MANY_FAIL 5
	277
	// Result summary returned by bfc_ec1() for one read.
	278	typedef struct {
	// ec_code: 0 on success or one of the ECCODE_* values; brute: set when the
	// greedy single-k-mer fix was used; n_ec / n_ec_high: changed bases (all / with q set)
	279	uint32_t ec_code:3, brute:1, n_ec:14, n_ec_high:14;
	// n_absent: sum of the two bfc_ec1dir() results; max_heap: larger of the two heap peaks
	280	uint32_t n_absent:24, max_heap:8;
	281	} ecstat_t;
	282
	// Penalty flags for one candidate extension; combined by weighted_penalty().
	// b is the base used for the extension.
	283	typedef struct {
	284	uint8_t ec:1, ec_high:1, absent:1, absent_high:1, b:4;
	285	} bfc_penalty_t;
	286
	// Heap entry of the search in bfc_ec1dir(); ordered by tot_pen via heap_lt.
	287	typedef struct {
	288	int tot_pen;
	289	int i; // base position
	290	int k; // position in the stack
	// positions of the most recent corrections (index 0 is the newest); -1 when unset
	291	int32_t ecpos_high[BFC_EC_HIST_HIGH];
	292	int32_t ecpos[BFC_EC_HIST];
	// k-mer ending at the current position of this path
	293	bfc_kmer_t x;
	294	} echeap1_t;
	295
	// Stack record written by buf_update(); buf_backtrack() follows `parent`
	// links (ending at a negative index) to rebuild a path.
	296	typedef struct {
	297	int parent, i, tot_pen;
	298	uint8_t b;
	299	bfc_penalty_t pen;
	// low 8 bits of the k-mer count for this extension, or 0 if the count was not positive
	300	uint16_t cnt;
	301	} ecstack1_t;
	302
	// Per-thread working buffers for correcting one read at a time.
	303	typedef struct {
	304	const bfc_opt_t *opt;
	305	const bfc_ch_t *ch;
	306	kvec_t(echeap1_t) heap;
	307	kvec_t(ecstack1_t) stack;
	// seq: the converted read; ec[0]/ec[1]: results of the two bfc_ec1dir() passes
	308	ecseq_t seq, tmp, ec[2];
	// value returned by bfc_ch_hist(); passed to bfc_ec_greedy_k()
	309	int mode;
	310	ecstat_t ori_st;
	311	} bfc_ec1buf_t;
	312
	313	#define heap_lt(a, b) ((a).tot_pen > (b).tot_pen)
	314	KSORT_INIT(ec, echeap1_t, heap_lt)
	315
	// Allocate a zero-initialised bfc_ec1buf_t and attach the options and k-mer table.
	// NOTE(review): the calloc() result is used without a null check.
	316	static bfc_ec1buf_t ec1buf_init(const bfc_opt_t opt, const bfc_ch_t *ch)
	317	{
	318	bfc_ec1buf_t *e;
	319	e = calloc(1, sizeof(bfc_ec1buf_t));
	320	e->opt = opt, e->ch = ch;
	321	return e;
	322	}
	323
	// Free every array owned by the buffer and then the buffer itself.
	// opt and ch are not freed here.
	324	static void ec1buf_destroy(bfc_ec1buf_t *e)
	325	{
	326	free(e->heap.a); free(e->stack.a); free(e->seq.a); free(e->tmp.a); free(e->ec[0].a); free(e->ec[1].a);
	327	free(e);
	328	}
	329
	330	#define weighted_penalty(o, p) ((o)->w_ec * (p).ec + (o)->w_ec_high * (p).ec_high + (o)->w_absent * (p).absent + (o)->w_absent_high * (p).absent_high)
	331
	// Record one extension of path `prev` by base pen.b: push a stack record
	// (for later backtracking) and a heap entry positioned one base further on.
	332	static void buf_update(bfc_ec1buf_t e, const echeap1_t prev, bfc_penalty_t pen, int cnt)
	333	{
	334	ecstack1_t *q;
	335	echeap1_t *r;
	336	const bfc_opt_t *o = e->opt;
	337	int b = pen.b;
	338	// update stack
	339	kv_pushp(ecstack1_t, e->stack, &q);
	340	q->parent = prev->k;
	341	q->i = prev->i;
	342	q->b = b;
	343	q->pen = pen;
	344	q->cnt = cnt > 0? cnt&0xff : 0;
	345	q->tot_pen = prev->tot_pen + weighted_penalty(o, pen);
	346	// update heap
	347	kv_pushp(echeap1_t, e->heap, &r);
	348	r->i = prev->i + 1;
	// r->k is the index of the stack record just pushed
	349	r->k = e->stack.n - 1;
	350	r->x = prev->x;
	// correction histories: shift in prev->i at slot 0 when this step is a correction,
	// otherwise copy the history unchanged (4 = size of one int32_t entry)
	351	if (pen.ec_high) {
	352	memcpy(r->ecpos_high + 1, prev->ecpos_high, (BFC_EC_HIST_HIGH - 1) * 4);
	353	r->ecpos_high[0] = prev->i;
	354	} else memcpy(r->ecpos_high, prev->ecpos_high, BFC_EC_HIST_HIGH * 4);
	355	if (pen.ec) {
	356	memcpy(r->ecpos + 1, prev->ecpos, (BFC_EC_HIST - 1) * 4);
	357	r->ecpos[0] = prev->i;
	358	} else memcpy(r->ecpos, prev->ecpos, BFC_EC_HIST * 4);
	359	r->tot_pen = q->tot_pen;
	360	bfc_kmer_append(e->opt->k, r->x.x, b);
	// restore heap order after appending the new entry
	361	ks_heapup_ec(e->heap.n, e->heap.a);
	362	}
	363
	// Walk the stack from record `end` back through parent links (until a
	// negative index) and copy each record's base and ec/absent flags into *path.
	// Records whose position is outside the read (i >= seq->n) are skipped.
	// Returns the number of `absent` flags seen on in-range records.
	364	static int buf_backtrack(ecstack1_t s, int end, const ecseq_t seq, ecseq_t *path)
	365	{
	366	int i, n_absent = 0;
	367	kv_resize(ecbase_t, *path, seq->n);
	368	path->n = seq->n;
	369	while (end >= 0) {
	370	if ((i = s[end].i) < seq->n) {
	371	path->a[i].b = s[end].b;
	372	path->a[i].ec = s[end].pen.ec;
	373	path->a[i].absent = s[end].pen.absent;
	374	n_absent += s[end].pen.absent;
	375	}
	376	end = s[end].parent;
	377	}
	378	return n_absent;
	379	}
	380
	// Correct `seq` in one direction with a best-first search over base extensions,
	// starting from the first k-mer found at or after `start` and running to `end`
	// (plus up to max_end_ext extra positions). The chosen path is written to `ec`;
	// positions before start + k and at or after `end` are masked with base code 4.
	// Returns the number of "absent" k-mers on the chosen path, or a negative value
	// when no path finished: -2 if the heap ran empty, -3 after too many failed
	// extensions, -1 otherwise.
	381	static int bfc_ec1dir(bfc_ec1buf_t e, const ecseq_t seq, ecseq_t ec, int start, int end, int max_heap)
	382	{
	383	echeap1_t z;
	384	int i, l, rv = -1, path[BFC_MAX_PATHS], n_paths = 0, min_path = -1, min_path_pen = INT_MAX, n_failures = 0;
	385	assert(end <= seq->n && end - start >= e->opt->k);
	386	e->heap.n = e->stack.n = 0;
	387	*max_heap = 0;
	388	memset(&z, 0, sizeof(echeap1_t));
	389	kv_resize(ecbase_t, *ec, seq->n);
	390	ec->n = seq->n;
	// seed: collect the first k-1 bases of the starting k-mer into z.x; z.i stops on its last base
	391	for (z.i = start, l = 0; z.i < end; ++z.i) {
	392	int c = seq->a[z.i].b;
	393	if (c < 4) {
	394	if (++l == e->opt->k) break;
	395	bfc_kmer_append(e->opt->k, z.x.x, c);
	396	} else l = 0, z.x = bfc_kmer_null;
	397	}
	398	assert(z.i < end); // before calling this function, there must be at least one solid k-mer
	// the seed has no stack record (k = -1) and an empty correction history
	399	z.k = -1;
	400	for (i = 0; i < BFC_EC_HIST; ++i) z.ecpos[i] = -1;
	401	for (i = 0; i < BFC_EC_HIST_HIGH; ++i) z.ecpos_high[i] = -1;
	402	kv_push(echeap1_t, e->heap, z);
	403	for (i = 0; i < seq->n; ++i) ec->a[i].b = seq->a[i].b, ec->a[i].ob = seq->a[i].ob;
	404	// exhaustive error correction
	405	while (1) {
	406	int stop = 0;
	// track the peak heap size, capped at 255
	407	max_heap = max_heap > 255? 255 : max_heap > e->heap.n? max_heap : e->heap.n;
	408	if (e->heap.n == 0) { // may happen when there is an uncorrectable "N"
	409	rv = -2;
	410	break;
	411	}
	// pop the top heap entry
	412	z = e->heap.a[0];
	413	e->heap.a[0] = kv_pop(e->heap);
	414	ks_heapdown_ec(0, e->heap.n, e->heap.a);
	// give up once the popped entry is too much worse than the best finished path
	415	if (min_path >= 0 && z.tot_pen > min_path_pen + e->opt->max_path_diff) break;
	416	if (z.i - end > e->opt->max_end_ext) stop = 1;
	417	if (!stop) {
	// c is the read base at z.i, or null once the path runs past the end of the read
	418	ecbase_t *c = z.i < seq->n? &seq->a[z.i] : 0;
	419	int b, os = -1, fixed = 0, other_ext = 0, n_added = 0, added_cnt[4];
	420	bfc_penalty_t added[4];
	421	// test if the read extension alone is enough
	422	if (z.i > end) fixed = 1;
	423	if (c && c->b < 4) { // A, C, G or T
	424	bfc_kmer_t x = z.x;
	425	bfc_kmer_append(e->opt->k, x.x, c->b);
	// os: occurrence of the k-mer obtained by keeping the read's own base
	426	os = bfc_ch_kmer_occ(e->ch, &x);
	427	if (c->q && (os&0xff) >= e->opt->min_cov + 1 && c->lcov >= e->opt->min_cov + 1) fixed = 1;
	428	else if (c->hcov > e->opt->k * .75) fixed = 1;
	429	}
	430	// extension
	431	for (b = 0; b < 4; ++b) {
	432	bfc_penalty_t pen;
	// when the read base is trusted ("fixed"), alternative bases are not tried
	433	if (fixed && c && b != c->b) continue;
	// branch 1: b differs from the read base, or the path is past the end of the read
	434	if (c == 0 \|\| b != c->b) {
	435	int s;
	436	bfc_kmer_t x = z.x;
	437	pen.ec = 0, pen.ec_high = 0, pen.absent = 0, pen.absent_high = 0, pen.b = b;
	438	if (c) { // not over the end
	439	if (c->q && z.ecpos_high[BFC_EC_HIST_HIGH-1] >= 0 && z.i - z.ecpos_high[BFC_EC_HIST_HIGH-1] < e->opt->win_multi_ec) continue; // no close highQ corrections
	440	if (z.ecpos[BFC_EC_HIST-1] >= 0 && z.i - z.ecpos[BFC_EC_HIST-1] < e->opt->win_multi_ec) continue; // no clustered corrections
	441	}
	442	bfc_kmer_append(e->opt->k, x.x, b);
	443	s = bfc_ch_kmer_occ(e->ch, &x);
	444	if (s < 0 \|\| (s&0xff) < e->opt->min_cov) continue; // not solid
	445	//if (os >= 0 && (s&0xff) - (os&0xff) < 2) continue; // not sufficiently better than the read path
	446	pen.ec = c && c->b < 4? 1 : 0;
	447	pen.ec_high = pen.ec? c->oq : 0;
	448	pen.absent = 0;
	449	pen.absent_high = ((s>>8&0xff) < e->opt->min_cov);
	450	pen.b = b;
	451	added_cnt[n_added] = s;
	452	added[n_added++] = pen;
	453	++other_ext;
	// branch 2: keep the read's own base; penalise only if its k-mer is absent/weak
	454	} else {
	455	pen.ec = pen.ec_high = 0;
	456	pen.absent = (os < 0 \|\| (os&0xff) < e->opt->min_cov);
	457	pen.absent_high = (os < 0 \|\| (os>>8&0xff) < e->opt->min_cov);
	458	pen.b = b;
	459	added_cnt[n_added] = os;
	460	added[n_added++] = pen;
	461	}
	462	} // ~for(b)
	// a failure is a position where the read base was not trusted and no alternative was found
	463	if (fixed == 0 && other_ext == 0) ++n_failures;
	464	if (n_failures > seq->n * 2) {
	465	rv = -3;
	466	break;
	467	}
	468	if (c \|\| n_added == 1) {
	469	if (n_added > 1 && e->heap.n > e->opt->max_heap) { // to prevent heap explosion
	// heap is over the limit: push only the candidate with the smallest weighted penalty
	470	int min_b = -1, min = INT_MAX;
	471	for (b = 0; b < n_added; ++b) {
	472	int t = weighted_penalty(e->opt, added[b]);
	473	if (min > t) min = t, min_b = b;
	474	}
	475	buf_update(e, &z, added[min_b], added_cnt[min_b]);
	476	} else {
	477	for (b = 0; b < n_added; ++b)
	478	buf_update(e, &z, added[b], added_cnt[b]);
	479	}
	480	} else {
	// past the end of the read with zero or several possible extensions: finish this path;
	// with no extension at all, add a penalty for each position short of max_end_ext
	481	if (n_added == 0)
	482	e->stack.a[z.k].tot_pen += e->opt->w_absent * (e->opt->max_end_ext - (z.i - end));
	483	stop = 1;
	484	}
	485	} // ~if(!stop)
	486	if (stop) {
	// record a finished path and remember the cheapest one; stop after BFC_MAX_PATHS paths
	487	if (e->stack.a[z.k].tot_pen < min_path_pen)
	488	min_path_pen = e->stack.a[z.k].tot_pen, min_path = n_paths;
	489	path[n_paths++] = z.k;
	490	if (n_paths == BFC_MAX_PATHS) break;
	491	}
	492	} // ~while(1)
	493	// backtrack
	494	if (n_paths == 0) return rv;
	495	assert(min_path >= 0 && min_path < n_paths && e->stack.a[path[min_path]].tot_pen == min_path_pen);
	496	rv = buf_backtrack(e->stack.a, path[min_path], seq, ec);
	497	for (i = 0; i < ec->n; ++i) // mask out uncorrected regions
	498	if (i < start + e->opt->k \|\| i >= end) ec->a[i].b = 4;
	499	return rv;
	500	}
	501
	// Error-correct one read in place. `seq` (and `qual`, if non-null) are
	// overwritten only when correction succeeds in both directions; on an early
	// return the strings are untouched and s.ec_code holds an ECCODE_* value.
	// In the output, changed bases are written in lower case and unchanged bases
	// in upper case.
	502	ecstat_t bfc_ec1(bfc_ec1buf_t e, char seq, char *qual)
	503	{
	504	int i, start = 0, end = 0, n_n = 0, rv[2], max_heap[2];
	505	uint64_t r;
	506	ecstat_t s;
	507
	508	s.ec_code = ECCODE_MISC, s.brute = 0, s.n_ec = s.n_ec_high = 0, s.n_absent = s.max_heap = 0;
	509	bfc_seq_conv(seq, qual, e->opt->q, &e->seq);
	// reject reads where more than 5% of the bases have a code above 3
	510	for (i = 0; i < e->seq.n; ++i)
	511	if (e->seq.a[i].ob > 3) ++n_n;
	512	if (n_n > e->seq.n * .05) {
	513	s.ec_code = ECCODE_MANY_N;
	514	return s;
	515	}
	516	bfc_ec_kcov(e->opt->k, e->opt->min_cov, &e->seq, e->ch);
	517	r = bfc_ec_best_island(e->opt->k, &e->seq);
	518	if (r == 0) { // no solid k-mer
	// fall back to a greedy single-base fix of one k-mer, trying successive
	// k-mers that start half a k-mer before the end of the previous one
	519	bfc_kmer_t x;
	520	int ec = -1;
	521	while ((end = bfc_ec_first_kmer(e->opt->k, &e->seq, start, &x)) < e->seq.n) {
	522	ec = bfc_ec_greedy_k(e->opt->k, e->mode, &x, e->ch);
	523	if (ec >= 0) break;
	524	if (end + (e->opt->k>>1) >= e->seq.n) break;
	525	start = end - (e->opt->k>>1);
	526	}
	527	if (ec >= 0) {
	// ec encodes (offset from the k-mer's last base)<<2 | new base
	528	e->seq.a[end - (ec>>2)].b = ec&3;
	529	++end; start = end - e->opt->k;
	530	s.brute = 1;
	531	} else {
	532	s.ec_code = ECCODE_NO_SOLID;
	533	return s;
	534	}
	// otherwise unpack the island found by bfc_ec_best_island()
	535	} else start = r>>32, end = (uint32_t)r;
	// forward pass: from `start` to the end of the read
	536	if ((rv[0] = bfc_ec1dir(e, &e->seq, &e->ec[0], start, e->seq.n, &max_heap[0])) < 0) {
	537	s.ec_code = rv[0] == -2? ECCODE_UNCORR_N : rv[0] == -3? ECCODE_MANY_FAIL : ECCODE_MISC;
	538	return s;
	539	}
	// reverse pass: same search on the reverse complement, starting from `end`
	540	bfc_seq_revcomp(&e->seq);
	541	if ((rv[1] = bfc_ec1dir(e, &e->seq, &e->ec[1], e->seq.n - end, e->seq.n, &max_heap[1])) < 0) {
	542	s.ec_code = rv[1] == -2? ECCODE_UNCORR_N : rv[1] == -3? ECCODE_MANY_FAIL : ECCODE_MISC;
	543	return s;
	544	}
	545	s.max_heap = max_heap[0] > max_heap[1]? max_heap[0] : max_heap[1];
	546	s.ec_code = 0, s.n_absent = rv[0] + rv[1];
	// bring the reverse-pass result and the read back to the original orientation
	547	bfc_seq_revcomp(&e->ec[1]);
	548	bfc_seq_revcomp(&e->seq);
	// merge the two passes: agreeing calls win, a masked call (code > 3) defers to
	// the other pass, and conflicting calls fall back to the original base
	549	for (i = 0; i < e->seq.n; ++i) {
	550	ecbase_t *c = &e->seq.a[i];
	551	if (e->ec[0].a[i].b == e->ec[1].a[i].b)
	552	c->b = e->ec[0].a[i].b > 3? e->seq.a[i].b : e->ec[0].a[i].b;
	553	else if (e->ec[1].a[i].b > 3) c->b = e->ec[0].a[i].b;
	554	else if (e->ec[0].a[i].b > 3) c->b = e->ec[1].a[i].b;
	555	else c->b = e->seq.a[i].ob;
	556	}
	// write the result back into the caller's strings and count the changes
	557	for (i = 0; i < e->seq.n; ++i) {
	558	int is_diff = !(e->seq.a[i].b == e->seq.a[i].ob);
	559	if (is_diff) {
	560	++s.n_ec;
	561	if (e->seq.a[i].q) ++s.n_ec_high;
	562	}
	563	seq[i] = (is_diff? "acgtn" : "ACGTN")[e->seq.a[i].b];
	// quality of a changed base is 34 + original base code; otherwise '+' or '?' by q flag
	564	if (qual) qual[i] = is_diff? 34 + e->seq.a[i].ob : "+?"[e->seq.a[i].q];
	565	}
	566	return s;
	567	}
	568
	569	/********************
	570	* Error correction *
	571	********************/
	572
	// Shared state for the correction / filtering pass driven by worker_ec().
	573	typedef struct {
	574	const bfc_opt_t *opt;
	575	const bfc_ch_t *ch;
	// one working buffer per thread, indexed by tid
	576	bfc_ec1buf_t **e;
	577	int64_t n_processed;
	// flt_uniq: non-zero selects the trimming/filtering path instead of bfc_ec1()
	578	int n_seqs, flt_uniq;
	579	bseq1_t *seqs;
	580	} ec_step_t;
	581
	// Find the longest run of consecutive k-mers of s whose bfc_ch_kmer_occ() is
	// positive. The result is packed: the upper 32 bits hold the run length (in
	// k-mers), the lower 32 bits hold i + 1 of the position that last reset the run.
	// worker_ec() unpacks this value.
	582	static uint64_t max_streak(int k, const bfc_ch_t ch, const bseq1_t s)
	583	{
	584	int i, l;
	585	uint64_t max = 0, t = 0;
	586	bfc_kmer_t x = bfc_kmer_null;
	587	for (i = l = 0; i < s->l_seq; ++i) {
	588	int c = seq_nt6_table[(uint8_t)s->seq[i]] - 1;
	589	if (c < 4) { // not an ambiguous base
	590	bfc_kmer_append(k, x.x, c);
	591	if (++l >= k) { // ok, we have a k-mer now
	// extend the run (upper half) or restart it at the next position (lower half)
	592	if (bfc_ch_kmer_occ(ch, &x) > 0) t += 1ULL<<32;
	593	else t = i + 1;
	594	} else t = i + 1;
	595	} else l = 0, x = bfc_kmer_null, t = i + 1;
	// comparing the packed values orders primarily by run length
	596	max = max > t? max : t;
	597	}
	598	return max;
	599	}
	600
	// kt_for() callback for read number `k`. With flt_uniq set, the read is
	// trimmed to its longest streak found by max_streak(), or emptied when that
	// streak covers too little of the read; otherwise the read is error-corrected
	// with bfc_ec1() using the buffer of thread `tid`.
	601	static void worker_ec(void *_data, long k, int tid)
	602	{
	603	ec_step_t es = (ec_step_t)_data;
	604	bseq1_t *s = &es->seqs[k];
	605	if (es->flt_uniq) {
	606	uint64_t max;
	607	max = max_streak(es->opt->k, es->ch, s);
	// keep the read only if the streak, counted in bases, exceeds min_trim_frac of the read
	608	if (max>>32 && (double)((max>>32) + es->opt->k - 1) / s->l_seq > es->opt->min_trim_frac) {
	609	int start = (uint32_t)max, end = start + (max>>32);
	// move start back to the first base of the first k-mer in the streak
	610	start -= es->opt->k - 1;
	611	assert(start >= 0 && end <= s->l_seq);
	// trim in place and re-terminate the strings
	612	memmove(s->seq, s->seq + start, end - start);
	613	s->l_seq = end - start;
	614	s->seq[s->l_seq] = 0;
	615	if (s->qual) {
	616	memmove(s->qual, s->qual + start, s->l_seq);
	617	s->qual[s->l_seq] = 0;
	618	}
	619	} else {
	// drop the read: free its strings and leave a zero-length entry
	620	free(s->seq); free(s->qual);
	621	s->l_seq = 0, s->seq = s->qual = 0;
	622	}
	623	} else bfc_ec1(es->e[tid], s->seq, s->qual);
	624	}
	625
	// Count k-mers over the n reads, derive the coverage threshold from the k-mer
	// histogram, then run worker_ec() on every read (error correction, or
	// trimming/filtering when flt_uniq is set). Reads are modified in place.
	// Returns kcov, the mean k-mer count over histogram bins >= opt->min_cnt.
	626	float fml_correct_core(const fml_opt_t opt, int flt_uniq, int n, bseq1_t seq)
	627	{
	628	bfc_ch_t *ch;
	629	int i, mode;
	630	uint64_t hist[256], hist_high[64], tot_len = 0, sum_k = 0, tot_k = 0;
	631	ec_step_t es;
	632	bfc_opt_t bfc_opt;
	633	float kcov;
	634
	635	// initialize BFC options
	636	bfc_opt_init(&bfc_opt);
	637	bfc_opt.n_threads = opt->n_threads; // copy from FML options
	// filtering uses the minimum assembly overlap as k; correction uses ec_k
	638	bfc_opt.k = flt_uniq? opt->min_asm_ovlp : opt->ec_k;
	639	for (i = 0; i < n; ++i) tot_len += seq[i].l_seq; // compute total length
	// NOTE(review): tot_len is uint64_t, so tot_len - 8 wraps around when tot_len < 8
	// and l_pre then becomes 20 -- confirm inputs that small cannot occur.
	640	bfc_opt.l_pre = tot_len - 8 < 20? tot_len - 8 : 20;
	641
	642	memset(&es, 0, sizeof(ec_step_t));
	643	es.opt = &bfc_opt, es.n_seqs = n, es.seqs = seq, es.flt_uniq = flt_uniq;
	644
	645	es.ch = ch = fml_count(n, seq, bfc_opt.k, bfc_opt.q, bfc_opt.l_pre, bfc_opt.n_threads);
	646	mode = bfc_ch_hist(ch, hist, hist_high);
	647	for (i = opt->min_cnt; i < 256; ++i)
	648	sum_k += hist[i], tot_k += i * hist[i];
	// NOTE(review): sum_k is 0 when no bin at or above min_cnt is populated; the
	// floating-point division then does not yield a finite value.
	649	kcov = (float)tot_k / sum_k;
	// min_cov = kcov * BFC_EC_MIN_COV_COEF (+ .499, truncated), then clamped:
	// first to at most max_cnt, then to at least min_cnt
	650	bfc_opt.min_cov = (int)(BFC_EC_MIN_COV_COEF * kcov + .499);
	651	bfc_opt.min_cov = bfc_opt.min_cov < opt->max_cnt? bfc_opt.min_cov : opt->max_cnt;
	652	bfc_opt.min_cov = bfc_opt.min_cov > opt->min_cnt? bfc_opt.min_cov : opt->min_cnt;
	653
	// one correction buffer per thread, each carrying the histogram mode
	654	es.e = calloc(es.opt->n_threads, sizeof(void*));
	655	for (i = 0; i < es.opt->n_threads; ++i)
	656	es.e[i] = ec1buf_init(es.opt, ch), es.e[i]->mode = mode;
	657	kt_for(es.opt->n_threads, worker_ec, &es, es.n_seqs);
	658	for (i = 0; i < es.opt->n_threads; ++i)
	659	ec1buf_destroy(es.e[i]);
	660	free(es.e);
	661	bfc_ch_destroy(ch);
	662	return kcov;
	663	}
	664
	// Error-correct the n reads in place: fml_correct_core() with flt_uniq = 0.
	// Returns the k-mer coverage computed by fml_correct_core().
	665	float fml_correct(const fml_opt_t opt, int n, bseq1_t seq)
	666	{
	667	return fml_correct_core(opt, 0, n, seq);
	668	}
	669
	// Trim or drop the n reads in place: fml_correct_core() with flt_uniq = 1.
	// Returns the k-mer coverage computed by fml_correct_core().
	670	float fml_fltuniq(const fml_opt_t opt, int n, bseq1_t seq)
	671	{
	672	return fml_correct_core(opt, 1, n, seq);
	673	}

+61

-0

third_party/fermi-lite-0.1/bseq.c less more

	0	#include <zlib.h>
	1	#include <stdio.h>
	2	#include <stdlib.h>
	3	#include <string.h>
	4	#include "fml.h"
	5	#include "kseq.h"
	6	KSEQ_INIT(gzFile, gzread)
	7
	// Read all records from file `fn` through zlib and kseq into a newly allocated
	// array. A null `fn` or "-" reads from stdin. The record count is stored in
	// *n_. Returns the array, or 0 if the file could not be opened (*n_ is then 0).
	// Each record's seq/qual strings are strdup() copies.
	8	bseq1_t bseq_read(const char fn, int *n_)
	9	{
	10	gzFile fp;
	11	bseq1_t *seqs;
	12	kseq_t *ks;
	13	int m, n;
	// NOTE(review): `size` is accumulated below but not read afterwards in this function.
	14	uint64_t size = 0;
	15
	16	*n_ = 0;
	17	fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
	18	if (fp == 0) return 0;
	19	ks = kseq_init(fp);
	20
	// n: records stored so far; m: allocated capacity
	21	m = n = 0; seqs = 0;
	22	while (kseq_read(ks) >= 0) {
	23	bseq1_t *s;
	24	if (n >= m) {
	// grow capacity by doubling, starting at 256
	// NOTE(review): the realloc() result is not checked.
	25	m = m? m<<1 : 256;
	26	seqs = realloc(seqs, m * sizeof(bseq1_t));
	27	}
	28	s = &seqs[n];
	29	s->seq = strdup(ks->seq.s);
	// qual is 0 when the record carries no quality string
	30	s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
	31	s->l_seq = ks->seq.l;
	32	size += seqs[n++].l_seq;
	33	}
	34	*n_ = n;
	35
	36	kseq_destroy(ks);
	37	gzclose(fp);
	38	return seqs;
	39	}
	40
	// Reverse the first l bytes of s in place.
	41	void seq_reverse(int l, unsigned char *s)
	42	{
	43	int i;
	// swap symmetric pairs; the middle byte of an odd-length buffer stays put
	44	for (i = 0; i < l>>1; ++i) {
	45	int tmp = s[l-1-i];
	46	s[l-1-i] = s[i]; s[i] = tmp;
	47	}
	48	}
	49
	// Reverse-complement the first l bytes of s in place. Byte values 1..4 are
	// complemented as 5 - value; every other value is left unchanged.
	50	void seq_revcomp6(int l, unsigned char *s)
	51	{
	52	int i;
	53	for (i = 0; i < l>>1; ++i) {
	54	int tmp = s[l-1-i];
	55	tmp = (tmp >= 1 && tmp <= 4)? 5 - tmp : tmp;
	56	s[l-1-i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i];
	57	s[i] = tmp;
	58	}
	// odd length: complement the middle byte, which does not move
	59	if (l&1) s[i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i];
	60	}

+367

-0

third_party/fermi-lite-0.1/bubble.c less more

	0	#include <limits.h>
	1	#include <stdio.h>
	2	#include "mag.h"
	3	#include "kvec.h"
	4	#include "ksw.h"
	5	#include "internal.h"
	6	#include "khash.h"
	7	KHASH_DECLARE(64, uint64_t, uint64_t)
	8
	9	typedef khash_t(64) hash64_t;
	10
	11	#define MAX_N_DIFF 2.01 // for evaluating alignment after SW
	12	#define MAX_R_DIFF 0.1
	13	#define L_DIFF_COEF 0.2 // n_diff=\|l_0 - l_1\|*L_DIFF_COEF
	14
	15	#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
	16	#define edge_is_del(_x) ((_x).x == (uint64_t)-2 \|\| (_x).y == 0)
	17
	18	static int fm_verbose = 1;
	19
	20	/******************
	21	* Closed bubbles *
	22	******************/
	23
	// Per-vertex bookkeeping attached to a vertex through its ptr field (see tiptr()).
	24	typedef struct {
	25	uint64_t id;
	26	int cnt[2];
	// n and d are indexed [direction][rank]; mag_vh_simplify_bubble() keeps the
	// best value at rank 0 and the second best at rank 1. d is accumulated from
	// vertex lengths and n from nsr values -- presumably distance and read count.
	27	int n[2][2], d[2][2];
	// v holds the packed predecessor that backtrace() follows
	28	uint64_t v[2][2];
	29	} trinfo_t;
	30
	31	const trinfo_t g_trinull = {-1, {0, 0}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{-1, -1}, {-1, -1}}};
	32
	// Pool of trinfo_t objects handed out by tip_alloc().
	33	typedef struct {
	// n: objects handed out since the last reset; m: objects allocated in buf
	34	int n, m;
	35	trinfo_t **buf;
	36	} tipool_t;
	37
	// Scratch data reused across bubble-simplification calls: the trinfo_t pool,
	// a work stack of vertex ends, and a 64-bit hash set.
	38	struct mogb_aux {
	39	tipool_t pool;
	40	ku64_v stack;
	41	hash64_t *h;
	42	};
	43
	// Allocate a zero-initialised auxiliary structure with an empty hash table.
	// Release it with mag_b_destroyaux().
	// NOTE(review): the calloc() result is used without a null check.
	44	mogb_aux_t *mag_b_initaux(void)
	45	{
	46	mogb_aux_t *aux = calloc(1, sizeof(mogb_aux_t));
	47	aux->h = kh_init(64);
	48	return aux;
	49	}
	50
	// Free everything owned by b: all pool.m pooled objects, the pool array,
	// the stack storage, the hash table, and b itself.
	51	void mag_b_destroyaux(mogb_aux_t *b)
	52	{
	53	int i;
	54	for (i = 0; i < b->pool.m; ++i)
	55	free(b->pool.buf[i]);
	56	free(b->pool.buf); free(b->stack.a);
	57	kh_destroy(64, b->h);
	58	free(b);
	59	}
	60
	61	#define tiptr(p) ((trinfo_t*)(p)->ptr)
	62
	// Hand out the next trinfo_t from the pool, reset to g_trinull with the given id.
	// When the pool is exhausted its capacity doubles (256 initially) and every new
	// slot is allocated immediately.
	63	static inline trinfo_t tip_alloc(tipool_t pool, uint32_t id)
	64	{ // allocate an object from the memory pool
	65	trinfo_t *p;
	66	if (pool->n == pool->m) {
	67	int i, new_m = pool->m? pool->m<<1 : 256;
	// NOTE(review): the realloc()/malloc() results are not checked.
	68	pool->buf = realloc(pool->buf, new_m * sizeof(void*));
	69	for (i = pool->m; i < new_m; ++i)
	70	pool->buf[i] = malloc(sizeof(trinfo_t));
	71	pool->m = new_m;
	72	}
	73	p = pool->buf[pool->n++];
	74	*p = g_trinull;
	75	p->id = id;
	76	return p;
	77	}
	78
	// Follow the predecessor links stored in each vertex's trinfo_t, starting from
	// `end`, until the upper 32 bits of `end` equal `start`. The vertex index
	// (end>>33) of every step visited is inserted into hash set h.
	79	static void backtrace(mag_t g, uint64_t end, uint64_t start, hash64_t h)
	80	{
	81	while (end>>32 != start) {
	82	int ret;
	83	kh_put(64, h, end>>33, &ret);
	// bit 32 (flipped) selects the direction and bit 0 the rank of the stored predecessor
	84	end = tiptr(&g->v.a[end>>33])->v[(end>>32^1)&1][end&1];
	85	}
	86	}
	87
	88	void mag_vh_simplify_bubble(mag_t g, uint64_t idd, int max_vtx, int max_dist, mogb_aux_t a)
	89	{
	90	int i, n_pending = 0;
	91	magv_t p, q;
	92
	93	p = &g->v.a[idd>>1];
	94	if (p->len < 0 \|\| p->nei[idd&1].n < 2) return; // stop if p is deleted or it has 0 or 1 neighbor
	95	// reset aux data
	96	a->stack.n = a->pool.n = 0;
	97	if (kh_n_buckets(a->h) >= 64) {
	98	kh_destroy(64, a->h);
	99	a->h = kh_init(64);
	100	} else kh_clear(64, a->h);
	101	// add the initial vertex
	102	p->ptr = tip_alloc(&a->pool, idd>>1);
	103	tiptr(p)->d[(idd&1)^1][0] = -p->len;
	104	tiptr(p)->n[(idd&1)^1][0] = -p->nsr;
	105	kv_push(uint64_t, a->stack, idd^1);
	106	// essentially a topological sorting
	107	while (a->stack.n) {
	108	uint64_t x, y;
	109	ku128_v *r;
	110	if (a->stack.n == 1 && a->stack.a[0] != (idd^1) && n_pending == 0) break; // found the other end of the bubble
	111	x = kv_pop(a->stack);
	112	p = &g->v.a[x>>1];
	113	//printf("%lld:%lld\n", p->k[0], p->k[1]);
	114	r = &p->nei[(x&1)^1]; // we will look the the neighbors from the other end of the unitig
	115	if (a->pool.n > max_vtx \|\| tiptr(p)->d[x&1][0] > max_dist \|\| tiptr(p)->d[x&1][1] > max_dist \|\| r->n == 0) break; // we failed
	116	// set the distance to p's neighbors
	117	for (i = 0; i < r->n; ++i) {
	118	int nsr, dist, which;
	119	if ((int64_t)r->a[i].x < 0) continue;
	120	y = mag_tid2idd(g->h, r->a[i].x);
	121	if (y == (idd^1)) { // there is a loop involving the initial vertex
	122	a->stack.n = 0;
	123	break; // not a bubble; stop; this will jump out of the while() loop
	124	}
	125	q = &g->v.a[y>>1];
	126	if (q->ptr == 0) { // has not been attempted
	127	q->ptr = tip_alloc(&a->pool, y>>1), ++n_pending;
	128	mag_v128_clean(&q->nei[y&1]); // make sure there are no deleted edges
	129	}
	130	nsr = tiptr(p)->n[x&1][0] + p->nsr; which = 0;
	131	dist = tiptr(p)->d[x&1][0] + p->len - r->a[i].y;
	132	//printf("01 [%d]\t[%d,%d]\t[%d,%d]\n", i, tiptr(q)->n[y&1][0], tiptr(q)->n[y&1][1], tiptr(q)->d[y&1][0], tiptr(q)->d[y&1][1]);
	133	// test and possibly update the tentative distance
	134	if (nsr > tiptr(q)->n[y&1][0]) { // then move the best to the 2nd best and update the best
	135	tiptr(q)->n[y&1][1] = tiptr(q)->n[y&1][0]; tiptr(q)->n[y&1][0] = nsr;
	136	tiptr(q)->v[y&1][1] = tiptr(q)->v[y&1][0]; tiptr(q)->v[y&1][0] = (x^1)<<32\|i<<1\|which;
	137	tiptr(q)->d[y&1][1] = tiptr(q)->d[y&1][0]; tiptr(q)->d[y&1][0] = dist;
	138	nsr = tiptr(p)->n[x&1][1] + p->nsr; which = 1; // now nsr is the 2nd best
	139	dist = tiptr(p)->d[x&1][1] + p->len - r->a[i].y;
	140	}
	141	if (nsr > tiptr(q)->n[y&1][1]) // update the 2nd best
	142	tiptr(q)->n[y&1][1] = nsr, tiptr(q)->v[y&1][1] = (x^1)<<32\|i<<1\|which, tiptr(q)->d[y&1][1] = dist;
	143	if (++tiptr(q)->cnt[y&1] == q->nei[y&1].n) { // all q's predecessors have been processed; then push
	144	kv_push(uint64_t, a->stack, y);
	145	--n_pending;
	146	}
	147	}
	148	}
	149	if (n_pending == 0 && a->stack.n == 1) { // found a bubble
	150	uint64_t x = a->stack.a[0];
	151	p = &g->v.a[x>>1];
	152	//printf("(%d,%d)\t(%d,%d)\n", tiptr(p)->n[x&1][0], tiptr(p)->n[x&1][1], tiptr(p)->d[x&1][0], tiptr(p)->d[x&1][1]);
	153	backtrace(g, tiptr(p)->v[x&1][0], idd, a->h);
	154	backtrace(g, tiptr(p)->v[x&1][1], idd, a->h);
	155	}
	156	for (i = 0; i < a->pool.n; ++i) // reset p->ptr
	157	g->v.a[a->pool.buf[i]->id].ptr = 0;
	158	if (kh_size(a->h)) { // bubble detected; then remove verticies not in the top two paths
	159	for (i = 1; i < a->pool.n; ++i) { // i=0 corresponds to the initial vertex which we want to exclude
	160	uint64_t id = a->pool.buf[i]->id;
	161	if (id != a->stack.a[0]>>1 && kh_get(64, a->h, id) == kh_end(a->h)) // not in the top two paths
	162	mag_v_del(g, &g->v.a[id]);
	163	}
	164	}
	165	}
	166
	167	void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist)
	168	{
	169	int64_t i;
	170	mogb_aux_t *a;
	171	a = mag_b_initaux();
	172	for (i = 0; i < g->v.n; ++i) {
	173	mag_vh_simplify_bubble(g, i<<1\|0, max_vtx, max_dist, a);
	174	mag_vh_simplify_bubble(g, i<<1\|1, max_vtx, max_dist, a);
	175	}
	176	mag_b_destroyaux(a);
	177	mag_g_merge(g, 0, 0);
	178	}
	179
	180	int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int aggressive)
	181	{
	182	magv_t p = &g->v.a[idd>>1], q[2];
	183	ku128_v *r;
	184	int i, j, k, dir[2], l[2], ret = -1;
	185	char seq[2], cov[2];
	186	float n_diff, r_diff, avg[2], max_n_diff = aggressive? MAX_N_DIFF * 2. : MAX_N_DIFF;
	187
	188	if (p->len < 0 \|\| p->nei[idd&1].n != 2) return ret; // deleted or no bubble
	189	r = &p->nei[idd&1];
	190	for (j = 0; j < 2; ++j) {
	191	uint64_t x;
	192	if ((int64_t)r->a[j].x < 0) return ret;
	193	x = mag_tid2idd(g->h, r->a[j].x);
	194	dir[j] = x&1;
	195	q[j] = &g->v.a[x>>1];
	196	if (q[j]->nei[0].n != 1 \|\| q[j]->nei[1].n != 1) return ret; // no bubble
	197	l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y);
	198	}
	199	if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble
	200	for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[]
	201	if (l[j] > 0) {
	202	seq[j] = malloc(l[j]<<1);
	203	cov[j] = seq[j] + l[j];
	204	for (i = 0; i < l[j]; ++i) {
	205	seq[j][i] = q[j]->seq[i + q[j]->nei[0].a->y];
	206	cov[j][i] = q[j]->cov[i + q[j]->nei[0].a->y];
	207	}
	208	if (dir[j]) {
	209	seq_revcomp6(l[j], (uint8_t*)seq[j]);
	210	seq_reverse(l[j], (uint8_t*)cov[j]);
	211	}
	212	for (i = 0, avg[j] = 0.; i < l[j]; ++i) {
	213	--seq[j][i]; // change DNA6 encoding to DNA4 for SW below
	214	avg[j] += cov[j][i] - 33;
	215	}
	216	avg[j] /= l[j];
	217	} else { // l[j] <= 0; this may happen around a tandem repeat
	218	int beg, end;
	219	seq[j] = cov[j] = 0;
	220	beg = q[j]->nei[0].a->y; end = q[j]->len - q[j]->nei[1].a->y;
	221	if (beg > end) beg ^= end, end ^= beg, beg ^= end; // swap
	222	if (beg < end) {
	223	for (i = beg, avg[j] = 0.; i < end; ++i)
	224	avg[j] += q[j]->cov[i] - 33;
	225	avg[j] /= end - beg;
	226	} else avg[j] = q[j]->cov[beg] - 33; // FIXME: when q[j] is contained, weird thing may happen
	227	}
	228	}
	229	ret = 1;
	230	if (l[0] > 0 && l[1] > 0) { // then do SW to compute n_diff and r_diff
	231	int8_t mat[16];
	232	kswr_t aln;
	233	for (i = k = 0; i < 4; ++i)
	234	for (j = 0; j < 4; ++j)
	235	mat[k++] = i == j? 5 : -4;
	236	aln = ksw_align(l[0], (uint8_t)seq[0], l[1], (uint8_t)seq[1], 4, mat, 5, 2, 0, 0);
	237	n_diff = ((l[0] < l[1]? l[0] : l[1]) * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatchig score
	238	r_diff = n_diff / ((l[0] + l[1]) / 2.);
	239	//fprintf(stderr, "===> %f %f <===\n", n_diff, r_diff); for (j = 0; j < 2; ++j) { for (i = 0; i < l[j]; ++i) fputc("ACGTN"[(int)seq[j][i]], stderr); fputc('\n', stderr); }
	240	} else {
	241	n_diff = abs(l[0] - l[1]) * L_DIFF_COEF;
	242	r_diff = 1.;
	243	//fprintf(stderr, "---> (%d,%d) <---\n", l[0], l[1]);
	244	}
	245	if (n_diff < max_n_diff \|\| r_diff < MAX_R_DIFF) {
	246	j = avg[0] < avg[1]? 0 : 1;
	247	if (aggressive \|\| (avg[j] < max_cov && avg[j] / (avg[j^1] + avg[j]) < max_frac)) {
	248	mag_v_del(g, q[j]);
	249	ret = 2;
	250	}
	251	}
	252	free(seq[0]); free(seq[1]);
	253	return ret;
	254	}
	255
	256	void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive)
	257	{
	258	int64_t i, n_examined = 0, n_popped = 0;
	259	int ret;
	260
	261	for (i = 0; i < g->v.n; ++i) {
	262	ret = mag_vh_pop_simple(g, i<<1\|0, max_cov, max_frac, aggressive);
	263	if (ret >= 1) ++n_examined;
	264	if (ret >= 2) ++n_popped;
	265	ret = mag_vh_pop_simple(g, i<<1\|1, max_cov, max_frac, aggressive);
	266	if (ret >= 1) ++n_examined;
	267	if (ret >= 2) ++n_popped;
	268	}
	269	if (fm_verbose >= 3)
	270	fprintf(stderr, "[M::%s] examined %ld bubbles and popped %ld\n", __func__, (long)n_examined, (long)n_popped);
	271	mag_g_merge(g, 0, min_merge_len);
	272	}
	273
	274	/****************
	275	* Open bubbles *
	276	****************/
	277
	278	void mag_v_pop_open(mag_t g, magv_t p, int min_elen)
	279	{
	280	int i, j, k, l, dir, max_l, l_qry;
	281	magv_t q, t;
	282	ku128_v r, s;
	283	uint8_t *seq;
	284	int8_t mat[16];
	285
	286	if (p->len < 0 \|\| p->len >= min_elen) return;
	287	//if (p->nei[0].n && p->nei[1].n) return; // FIXME: between this and the next line, which is better?
	288	if (p->nei[0].n + p->nei[1].n != 1) return;
	289	dir = p->nei[0].n? 0 : 1;
	290	// initialize the scoring system
	291	for (i = k = 0; i < 4; ++i)
	292	for (j = 0; j < 4; ++j)
	293	mat[k++] = i == j? 5 : -4;
	294
	295	s = &p->nei[dir];
	296	for (l = 0; l < s->n; ++l) { // if we use "if (p->nei[0].n + p->nei[1].n != 1)", s->n == 1
	297	uint64_t v;
	298	kswq_t *qry;
	299	if ((int64_t)s->a[l].x < 0) continue;
	300	v = mag_tid2idd(g->h, s->a[l].x);
	301	q = &g->v.a[v>>1];
	302	if (q == p \|\| q->nei[v&1].n == 1) continue;
	303	// get the query ready
	304	max_l = (p->len - s->a[l].y) * 2;
	305	seq = malloc(max_l + 1);
	306	if (dir == 0) { // forward strand
	307	for (j = s->a[l].y, k = 0; j < p->len; ++j)
	308	seq[k++] = p->seq[j] - 1;
	309	} else { // reverse
	310	for (j = p->len - s->a[l].y - 1, k = 0; j >= 0; --j)
	311	seq[k++] = 4 - p->seq[j];
	312	}
	313	l_qry = k;
	314	qry = ksw_qinit(2, l_qry, seq, 4, mat);
	315	//fprintf(stderr, "===> %lld:%lld:%d[%d], %d, %ld <===\n", p->k[0], p->k[1], s->n, l, p->nsr, q->nei[v&1].n);
	316	//for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fputc('\n', stderr);
	317
	318	r = &q->nei[v&1];
	319	for (i = 0; i < r->n; ++i) {
	320	uint64_t w;
	321	kswr_t aln;
	322	if (r->a[i].x == p->k[dir] \|\| (int64_t)r->a[i].x < 0) continue;
	323	w = mag_tid2idd(g->h, r->a[i].x);
	324	// get the target sequence
	325	t = &g->v.a[w>>1];
	326	if (w&1) { // reverse strand
	327	for (j = t->len - r->a[i].y - 1, k = 0; j >= 0 && k < max_l; --j)
	328	seq[k++] = 4 - t->seq[j];
	329	} else {
	330	for (j = r->a[i].y, k = 0; j < t->len && k < max_l; ++j)
	331	seq[k++] = t->seq[j] - 1;
	332	}
	333	aln = ksw_align(0, 0, k, seq, 4, mat, 5, 2, 0, &qry);
	334	//for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fprintf(stderr, "\t%d\t%f\n", aln.score, (l_qry * 5. - aln.score) / (5. + 4.));
	335	if (aln.score >= l_qry * 5 / 2) {
	336	double r_diff, n_diff;
	337	n_diff = (l_qry * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatchig score
	338	r_diff = n_diff / l_qry;
	339	if (n_diff < MAX_N_DIFF \|\| r_diff < MAX_R_DIFF) break;
	340	}
	341	}
	342
	343	if (i != r->n) {
	344	// mark delete in p and delete in q
	345	edge_mark_del(s->a[l]);
	346	for (i = 0; i < r->n; ++i)
	347	if (r->a[i].x == p->k[dir])
	348	edge_mark_del(r->a[i]);
	349	}
	350	free(seq); free(qry);
	351	}
	352
	353	for (i = 0; i < s->n; ++i)
	354	if (!edge_is_del(s->a[i])) break;
	355	if (i == s->n) mag_v_del(g, p); // p is not connected to any other vertices
	356	}
	357
	358	void mag_g_pop_open(mag_t *g, int min_elen)
	359	{
	360	int64_t i;
	361	for (i = 0; i < g->v.n; ++i)
	362	mag_v_pop_open(g, &g->v.a[i], min_elen);
	363	if (fm_verbose >= 3)
	364	fprintf(stderr, "[M:%s] popped open bubbles\n", __func__);
	365	mag_g_merge(g, 0, 0);
	366	}

+42

-0

third_party/fermi-lite-0.1/example.c less more

	0	#include <unistd.h>
	1	#include <stdlib.h>
	2	#include <stdio.h>
	3	#include "fml.h"
	4
	5	int main(int argc, char *argv[])
	6	{
	7	fml_opt_t opt;
	8	int c, n_seqs, n_utg;
	9	bseq1_t *seqs;
	10	fml_utg_t *utg;
	11
	12	fml_opt_init(&opt);
	13	while ((c = getopt(argc, argv, "Ae:l:r:t:c:")) >= 0) {
	14	if (c == 'e') opt.ec_k = atoi(optarg);
	15	else if (c == 'l') opt.min_asm_ovlp = atoi(optarg);
	16	else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg);
	17	else if (c == 'A') opt.mag_opt.flag \|= MAG_F_AGGRESSIVE;
	18	else if (c == 't') opt.n_threads = atoi(optarg);
	19	else if (c == 'c') {
	20	char *p;
	21	opt.min_cnt = strtol(optarg, &p, 10);
	22	if (*p == ',') opt.max_cnt = strtol(p + 1, &p, 10);
	23	}
	24	}
	25	if (argc == optind) {
	26	fprintf(stderr, "Usage: fml-asm [options] <in.fq>\n");
	27	fprintf(stderr, "Options:\n");
	28	fprintf(stderr, " -e INT k-mer length for error correction (0 for auto; -1 to disable) [%d]\n", opt.ec_k);
	29	fprintf(stderr, " -c INT1[,INT2] range of k-mer & read count thresholds for ec and graph cleaning [%d,%d]\n", opt.min_cnt, opt.max_cnt);
	30	fprintf(stderr, " -l INT min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp);
	31	fprintf(stderr, " -r FLOAT drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1);
	32	fprintf(stderr, " -t INT number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads);
	33	fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes)\n");
	34	return 1;
	35	}
	36	seqs = bseq_read(argv[optind], &n_seqs);
	37	utg = fml_assemble(&opt, n_seqs, seqs, &n_utg);
	38	fml_utg_print(n_utg, utg);
	39	fml_utg_destroy(n_utg, utg);
	40	return 0;
	41	}

+182

-0

third_party/fermi-lite-0.1/fml.h less more

	0	#ifndef FML_H
	1	#define FML_H
	2
	3	#define FML_VERSION "r41"
	4
	5	#include <stdint.h>
	6
	7	typedef struct {
	8	int32_t l_seq;
	9	char seq, qual;
	10	} bseq1_t;
	11
	12	#define MAG_F_AGGRESSIVE 0x20
	13	#define MAG_F_NO_SIMPL 0x80
	14
	15	typedef struct {
	16	int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bvtx, min_merge_len, trim_len, trim_depth;
	17	float min_dratio1, max_bcov, max_bfrac;
	18	} magopt_t;
	19
	20	typedef struct {
	21	int n_threads; // number of threads; don't use multi-threading for small data sets
	22	int ec_k; // k-mer length for error correction; 0 for auto estimate
	23	int min_cnt, max_cnt; // both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt]
	24	int min_asm_ovlp; // min overlap length during assembly
	25	int min_merge_len; // during assembly, don't explicitly merge an overlap if shorter than this value
	26	magopt_t mag_opt; // graph cleaning options
	27	} fml_opt_t;
	28
	29	struct rld_t;
	30	struct mag_t;
	31
	32	typedef struct {
	33	uint32_t tid;
	34	uint32_t len:31, from:1;
	35	} fml_ovlp_t;
	36
	37	typedef struct {
	38	int32_t len; // length of sequence
	39	int32_t nsr; // number of supporting reads
	40	char *seq; // unitig sequence
	41	char *cov; // cov[i]-33 gives per-base coverage at i
	42	int n_ovlp[2]; // number of 5'-end [0] and 3'-end [1] overlaps
	43	fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1]
	44	} fml_utg_t;
	45
	46	#ifdef __cplusplus
	47	extern "C" {
	48	#endif
	49
	50	/************************
	51	* High-level functions *
	52	************************/
	53
	54	/**
	55	* Read all sequences from a FASTA/FASTQ file
	56	*
	57	* @param fn filename; NULL or "-" for stdin
	58	* @param n (out) number of sequences read into RAM
	59	*
	60	* @return array of sequences
	61	*/
	62	bseq1_t bseq_read(const char fn, int *n);
	63
	64	/**
	65	* Initialize default parameters
	66	*
	67	* @param opt (out) pointer to parameters
	68	*/
	69	void fml_opt_init(fml_opt_t *opt);
	70
	71	/**
	72	* Assemble a list of sequences
	73	*
	74	* @param opt parameters
	75	* @param n_seqs number of input sequences
	76	* @param seqs sequences to assemble; FREED on return
	77	* @param n_utg (out) number of unitigs in return
	78	*
	79	* @return array of unitigs
	80	*/
	81	fml_utg_t fml_assemble(const fml_opt_t opt, int n_seqs, bseq1_t seqs, int n_utg);
	82
	83	/**
	84	* Free unitigs
	85	*
	86	* @param n_utg number of unitigs
	87	* @param utg array of unitigs
	88	*/
	89	void fml_utg_destroy(int n_utg, fml_utg_t *utg);
	90
	91	/************************************************
	92	* Mid-level functions called by fml_assemble() *
	93	************************************************/
	94
	95	/**
	96	* Adjust parameters based on input sequences
	97	*
	98	* @param opt parameters to update IN PLACE
	99	* @param n_seqs number of sequences
	100	* @param seqs array of sequences
	101	*/
	102	void fml_opt_adjust(fml_opt_t opt, int n_seqs, const bseq1_t seqs);
	103
	104	/**
	105	* Error correction
	106	*
	107	* @param opt parameters
	108	* @param n number of sequences
	109	* @param seq array of sequences; corrected IN PLACE
	110	*
	111	* @return k-mer coverage
	112	*/
	113	float fml_correct(const fml_opt_t opt, int n, bseq1_t seq);
	114	float fml_fltuniq(const fml_opt_t opt, int n, bseq1_t seq);
	115
	116	/**
	117	* Construct FMD-index
	118	*
	119	* @param opt parameters
	120	* @param n number of sequences
	121	* @param seq array of sequences; FREED on return
	122	*
	123	* @return FMD-index
	124	*/
	125	struct rld_t fml_seq2fmi(const fml_opt_t opt, int n, bseq1_t *seq);
	126
	127	/**
	128	* Generate initial overlap graph
	129	*
	130	* @param opt parameters
	131	* @param e FMD-index; FREED on return
	132	*
	133	* @return overlap graph in the "mag" structure
	134	*/
	135	struct mag_t fml_fmi2mag(const fml_opt_t opt, struct rld_t *e);
	136
	137	/**
	138	* Clean a mag graph
	139	*
	140	* @param opt parameters
	141	* @param g overlap graph; modified IN PLACE
	142	*/
	143	void fml_mag_clean(const fml_opt_t opt, struct mag_t g);
	144
	145	/**
	146	* Convert a graph in mag to fml_utg_t
	147	*
	148	* @param g graph in the "mag" structure; FREED on return
	149	* @param n_utg (out) number of unitigs
	150	*
	151	* @return array of unitigs
	152	*/
	153	fml_utg_t fml_mag2utg(struct mag_t g, int *n_utg);
	154
	155	/**
	156	* Output unitig graph in the mag format
	157	*
	158	* @param n_utgs number of unitigs
	159	* @param utg array of unitigs
	160	*/
	161	void fml_utg_print(int n_utgs, const fml_utg_t *utg);
	162
	163	/**
	164	* Deallocate an FM-index
	165	*
	166	* @param e pointer to the FM-index
	167	*/
	168	void fml_fmi_destroy(struct rld_t *e);
	169
	170	/**
	171	* Deallocate a mag graph
	172	*
	173	* @param g pointer to the mag graph
	174	*/
	175	void fml_mag_destroy(struct mag_t *g);
	176
	177	#ifdef __cplusplus
	178	}
	179	#endif
	180
	181	#endif

+132

-0

third_party/fermi-lite-0.1/htab.c less more

	0	#include <stdio.h>
	1	#include <stdlib.h>
	2	#include <assert.h>
	3	#include "htab.h"
	4	#include "khash.h"
	5
	6	#define _cnt_eq(a, b) ((a)>>14 == (b)>>14)
	7	#define _cnt_hash(a) ((a)>>14)
	8	KHASH_INIT(cnt, uint64_t, char, 0, _cnt_hash, _cnt_eq)
	9	typedef khash_t(cnt) cnthash_t;
	10
		/* Counting hash table split into 1<<l_pre khash sub-tables. */
	11	struct bfc_ch_s {
	12	int k; // value of k given to bfc_ch_init(); returned by bfc_ch_get_k()
	13	cnthash_t **h; // array of 1<<l_pre sub-tables, allocated in bfc_ch_init()
	14	// private
	15	int l_pre; // number of leading key bits that select the sub-table in get_subhash()
	16	};
	17
	18	bfc_ch_t *bfc_ch_init(int k, int l_pre)
	19	{
	20	bfc_ch_t *ch;
	21	int i;
	22	assert(k <= 63);
	23	if (k * 2 - l_pre > BFC_CH_KEYBITS)
	24	l_pre = k * 2 - BFC_CH_KEYBITS;
	25	if (l_pre > BFC_CH_MAXPRE) l_pre = BFC_CH_MAXPRE;
	26	assert(k - l_pre < BFC_CH_KEYBITS);
	27	ch = calloc(1, sizeof(bfc_ch_t));
	28	ch->k = k, ch->l_pre = l_pre;
	29	ch->h = calloc(1<<ch->l_pre, sizeof(void*));
	30	for (i = 0; i < 1<<ch->l_pre; ++i)
	31	ch->h[i] = kh_init(cnt);
	32	return ch;
	33	}
	34
	35	void bfc_ch_destroy(bfc_ch_t *ch)
	36	{
	37	int i;
	38	if (ch == 0) return;
	39	for (i = 0; i < 1<<ch->l_pre; ++i)
	40	kh_destroy(cnt, ch->h[i]);
	41	free(ch->h); free(ch);
	42	}
	43
	44	static inline cnthash_t get_subhash(const bfc_ch_t ch, const uint64_t x[2], uint64_t *key)
	45	{
	46	if (ch->k <= 32) {
	47	int t = ch->k * 2 - ch->l_pre;
	48	uint64_t z = x[0] << ch->k \| x[1];
	49	*key = (z & ((1ULL<<t) - 1)) << 14 \| 1;
	50	return ch->h[z>>t];
	51	} else {
	52	int t = ch->k - ch->l_pre;
	53	int shift = t + ch->k < BFC_CH_KEYBITS? ch->k : BFC_CH_KEYBITS - t;
	54	*key = ((x[0] & ((1ULL<<t) - 1)) << shift ^ x[1]) << 14 \| 1;
	55	return ch->h[x[0]>>t];
	56	}
	57	}
	58
	59	int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced)
	60	{
	61	int absent;
	62	uint64_t key;
	63	cnthash_t *h;
	64	khint_t k;
	65	h = get_subhash(ch, x, &key);
	66	if (__sync_lock_test_and_set(&h->lock, 1)) {
	67	if (forced) // then wait until the hash table is unlocked by the thread using it
	68	while (__sync_lock_test_and_set(&h->lock, 1))
	69	while (h->lock); // lock
	70	else return -1;
	71	}
	72	k = kh_put(cnt, h, key, &absent);
	73	if (absent) {
	74	if (is_high) kh_key(h, k) \|= 1<<8;
	75	} else {
	76	if ((kh_key(h, k) & 0xff) != 0xff) ++kh_key(h, k);
	77	if (is_high && (kh_key(h, k) >> 8 & 0x3f) != 0x3f) kh_key(h, k) += 1<<8;
	78	}
	79	__sync_lock_release(&h->lock); // unlock
	80	return 0;
	81	}
	82
	83	int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2])
	84	{
	85	uint64_t key;
	86	cnthash_t *h;
	87	khint_t itr;
	88	h = get_subhash(ch, x, &key);
	89	itr = kh_get(cnt, h, key);
	90	return itr == kh_end(h)? -1 : kh_key(h, itr) & 0x3fff;
	91	}
	92
	93	int bfc_ch_kmer_occ(const bfc_ch_t ch, const bfc_kmer_t z)
	94	{
	95	uint64_t x[2];
	96	bfc_kmer_hash(ch->k, z->x, x);
	97	return bfc_ch_get(ch, x);
	98	}
	99
	100	uint64_t bfc_ch_count(const bfc_ch_t *ch)
	101	{
	102	int i;
	103	uint64_t cnt = 0;
	104	for (i = 0; i < 1<<ch->l_pre; ++i)
	105	cnt += kh_size(ch->h[i]);
	106	return cnt;
	107	}
	108
	109	int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64])
	110	{
	111	int i, max_i = -1;
	112	uint64_t max;
	113	memset(cnt, 0, 256 * 8);
	114	memset(high, 0, 64 * 8);
	115	for (i = 0; i < 1<<ch->l_pre; ++i) {
	116	khint_t k;
	117	cnthash_t *h = ch->h[i];
	118	for (k = 0; k != kh_end(h); ++k)
	119	if (kh_exist(h, k))
	120	++cnt[kh_key(h, k) & 0xff], ++high[kh_key(h, k)>>8 & 0x3f];
	121	}
	122	for (i = 3, max = 0; i < 256; ++i)
	123	if (cnt[i] > max)
	124	max = cnt[i], max_i = i;
	125	return max_i;
	126	}
	127
	128	int bfc_ch_get_k(const bfc_ch_t *ch)
	129	{
	130	return ch->k;
	131	}

+23

-0

third_party/fermi-lite-0.1/htab.h less more

	0	#ifndef BFC_HTAB_H
	1	#define BFC_HTAB_H
	2
	3	#include <stdint.h>
	4	#include "kmer.h"
	5
	6	#define BFC_CH_KEYBITS 50
	7	#define BFC_CH_MAXPRE 20
	8
	9	struct bfc_ch_s;
	10	typedef struct bfc_ch_s bfc_ch_t;
	11
	12	bfc_ch_t *bfc_ch_init(int k, int l_pre);
	13	void bfc_ch_destroy(bfc_ch_t *ch);
	14	int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced);
	15	int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2]);
	16	uint64_t bfc_ch_count(const bfc_ch_t *ch);
	17	int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64]);
	18	int bfc_ch_get_k(const bfc_ch_t *ch);
	19
	20	int bfc_ch_kmer_occ(const bfc_ch_t ch, const bfc_kmer_t z);
	21
	22	#endif

+21

-0

third_party/fermi-lite-0.1/internal.h less more

	0	#ifndef FML_INTERNAL_H
	1	#define FML_INTERNAL_H
	2
	3	#include "fml.h"
	4
	5	extern unsigned char seq_nt6_table[256];
	6
	7	#ifdef __cplusplus
	8	extern "C" {
	9	#endif
	10
	11	void kt_for(int n_threads, void (func)(void,long,int), void *data, long n);
	12	void seq_reverse(int l, unsigned char *s);
	13	void seq_revcomp6(int l, unsigned char *s);
	14	struct bfc_ch_s fml_count(int n, const bseq1_t seq, int k, int q, int l_pre, int n_threads);
	15
	16	#ifdef __cplusplus
	17	}
	18	#endif
	19
	20	#endif

+614

-0

third_party/fermi-lite-0.1/khash.h less more

	0	/* The MIT License
	1
	2	Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	/*
	26	An example:
	27
	28	#include "khash.h"
	29	KHASH_MAP_INIT_INT(32, char)
	30	int main() {
	31	int ret, is_missing;
	32	khiter_t k;
	33	khash_t(32) *h = kh_init(32);
	34	k = kh_put(32, h, 5, &ret);
	35	kh_value(h, k) = 10;
	36	k = kh_get(32, h, 10);
	37	is_missing = (k == kh_end(h));
	38	k = kh_get(32, h, 5);
	39	kh_del(32, h, k);
	40	for (k = kh_begin(h); k != kh_end(h); ++k)
	41	if (kh_exist(h, k)) kh_value(h, k) = 1;
	42	kh_destroy(32, h);
	43	return 0;
	44	}
	45	*/
	46
	47	/*
	48	2013-05-02 (0.2.8):
	49
	50	* Use quadratic probing. When the capacity is power of 2, stepping function
	51	i*(i+1)/2 guarantees to traverse each bucket. It is better than double
	52	hashing on cache performance and is more robust than linear probing.
	53
	54	In theory, double hashing should be more robust than quadratic probing.
	55	However, my implementation is probably not for large hash tables, because
	56	the second hash function is closely tied to the first hash function,
	57	which reduce the effectiveness of double hashing.
	58
	59	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
	60
	61	2011-12-29 (0.2.7):
	62
	63	* Minor code clean up; no actual effect.
	64
	65	2011-09-16 (0.2.6):
	66
	67	* The capacity is a power of 2. This seems to dramatically improve the
	68	speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
	69
	70	- http://code.google.com/p/ulib/
	71	- http://nothings.org/computer/judy/
	72
	73	* Allow to optionally use linear probing which usually has better
	74	performance for random input. Double hashing is still the default as it
	75	is more robust to certain non-random input.
	76
	77	* Added Wang's integer hash function (not used by default). This hash
	78	function is more robust to certain non-random input.
	79
	80	2011-02-14 (0.2.5):
	81
	82	* Allow to declare global functions.
	83
	84	2009-09-26 (0.2.4):
	85
	86	* Improve portability
	87
	88	2008-09-19 (0.2.3):
	89
	90	* Corrected the example
	91	* Improved interfaces
	92
	93	2008-09-11 (0.2.2):
	94
	95	* Improved speed a little in kh_put()
	96
	97	2008-09-10 (0.2.1):
	98
	99	* Added kh_clear()
	100	* Fixed a compiling error
	101
	102	2008-09-02 (0.2.0):
	103
	104	* Changed to token concatenation which increases flexibility.
	105
	106	2008-08-31 (0.1.2):
	107
	108	* Fixed a bug in kh_get(), which has not been tested previously.
	109
	110	2008-08-31 (0.1.1):
	111
	112	* Added destructor
	113	*/
	114
	115
	116	#ifndef __AC_KHASH_H
	117	#define __AC_KHASH_H
	118
	119	/*!
	120	@header
	121
	122	Generic hash table library.
	123	*/
	124
	125	#define AC_VERSION_KHASH_H "0.2.8"
	126
	127	#include <stdlib.h>
	128	#include <string.h>
	129	#include <limits.h>
	130
	131	/* compiler specific configuration */
	132
	133	#if UINT_MAX == 0xffffffffu
	134	typedef unsigned int khint32_t;
	135	#elif ULONG_MAX == 0xffffffffu
	136	typedef unsigned long khint32_t;
	137	#endif
	138
	139	#if ULONG_MAX == ULLONG_MAX
	140	typedef unsigned long khint64_t;
	141	#else
	142	typedef unsigned long long khint64_t;
	143	#endif
	144
	145	#ifdef _MSC_VER
	146	#define kh_inline __inline
	147	#else
	148	#define kh_inline inline
	149	#endif
	150
	151	typedef khint32_t khint_t;
	152	typedef khint_t khiter_t;
	153
	154	#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
	155	#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
	156	#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
	157	#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
	158	#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
	159	#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
	160	#define __ac_set_isdel_true(flag, i) (flag[i>>4]\|=1ul<<((i&0xfU)<<1))
	161
	162	#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
	163
	164	#ifndef kroundup32
	165	#define kroundup32(x) (--(x), (x)\|=(x)>>1, (x)\|=(x)>>2, (x)\|=(x)>>4, (x)\|=(x)>>8, (x)\|=(x)>>16, ++(x))
	166	#endif
	167
	168	#ifndef kcalloc
	169	#define kcalloc(N,Z) calloc(N,Z)
	170	#endif
	171	#ifndef kmalloc
	172	#define kmalloc(Z) malloc(Z)
	173	#endif
	174	#ifndef krealloc
	175	#define krealloc(P,Z) realloc(P,Z)
	176	#endif
	177	#ifndef kfree
	178	#define kfree(P) free(P)
	179	#endif
	180
	181	#define __KHASH_TYPE(name, khkey_t, khval_t) \
	182	typedef struct kh_##name##_s { \
	183	khint_t n_buckets, size, n_occupied; \
	184	volatile int lock; \
	185	khint32_t *flags; \
	186	khkey_t *keys; \
	187	khval_t *vals; \
	188	} kh_##name##_t;
	189
	190	#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
	191	extern kh_##name##_t *kh_init_##name(void); \
	192	extern void kh_destroy_##name(kh_##name##_t *h); \
	193	extern void kh_clear_##name(kh_##name##_t *h); \
	194	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
	195	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
	196	extern khint_t kh_put_##name(kh_##name##_t h, khkey_t key, int ret); \
	197	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
	198
	199	#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	200	SCOPE kh_##name##_t *kh_init_##name(void) { \
	201	return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
	202	} \
	203	SCOPE void kh_destroy_##name(kh_##name##_t *h) \
	204	{ \
	205	if (h) { \
	206	kfree((void *)h->keys); kfree(h->flags); \
	207	kfree((void *)h->vals); \
	208	kfree(h); \
	209	} \
	210	} \
	211	SCOPE void kh_clear_##name(kh_##name##_t *h) \
	212	{ \
	213	if (h && h->flags) { \
	214	memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
	215	h->size = h->n_occupied = 0; \
	216	} \
	217	} \
	218	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
	219	{ \
	220	if (h->n_buckets) { \
	221	khint_t k, i, last, mask, step = 0; \
	222	mask = h->n_buckets - 1; \
	223	k = __hash_func(key); i = k & mask; \
	224	last = i; \
	225	while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) \|\| !__hash_equal(h->keys[i], key))) { \
	226	i = (i + (++step)) & mask; \
	227	if (i == last) return h->n_buckets; \
	228	} \
	229	return __ac_iseither(h->flags, i)? h->n_buckets : i; \
	230	} else return 0; \
	231	} \
	232	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	233	{ /* This function uses 0.25n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]n_buckets. */ \
	234	khint32_t *new_flags = 0; \
	235	khint_t j = 1; \
	236	{ \
	237	kroundup32(new_n_buckets); \
	238	if (new_n_buckets < 4) new_n_buckets = 4; \
	239	if (h->size >= (new_n_buckets>>1) + (new_n_buckets>>2)) j = 0; /* requested size is too small */ \
	240	else { /* hash table size to be changed (shrink or expand); rehash */ \
	241	new_flags = (khint32_t)kmalloc(__ac_fsize(new_n_buckets) sizeof(khint32_t)); \
	242	if (!new_flags) return -1; \
	243	memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
	244	if (h->n_buckets < new_n_buckets) { /* expand */ \
	245	khkey_t new_keys = (khkey_t)krealloc((void )h->keys, new_n_buckets sizeof(khkey_t)); \
	246	if (!new_keys) return -1; \
	247	h->keys = new_keys; \
	248	if (kh_is_map) { \
	249	khval_t new_vals = (khval_t)krealloc((void )h->vals, new_n_buckets sizeof(khval_t)); \
	250	if (!new_vals) return -1; \
	251	h->vals = new_vals; \
	252	} \
	253	} /* otherwise shrink */ \
	254	} \
	255	} \
	256	if (j) { /* rehashing is needed */ \
	257	for (j = 0; j != h->n_buckets; ++j) { \
	258	if (__ac_iseither(h->flags, j) == 0) { \
	259	khkey_t key = h->keys[j]; \
	260	khval_t val; \
	261	khint_t new_mask; \
	262	new_mask = new_n_buckets - 1; \
	263	if (kh_is_map) val = h->vals[j]; \
	264	__ac_set_isdel_true(h->flags, j); \
	265	while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
	266	khint_t k, i, step = 0; \
	267	k = __hash_func(key); \
	268	i = k & new_mask; \
	269	while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
	270	__ac_set_isempty_false(new_flags, i); \
	271	if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
	272	{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
	273	if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
	274	__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
	275	} else { /* write the element and jump out of the loop */ \
	276	h->keys[i] = key; \
	277	if (kh_is_map) h->vals[i] = val; \
	278	break; \
	279	} \
	280	} \
	281	} \
	282	} \
	283	if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
	284	h->keys = (khkey_t)krealloc((void )h->keys, new_n_buckets * sizeof(khkey_t)); \
	285	if (kh_is_map) h->vals = (khval_t)krealloc((void )h->vals, new_n_buckets * sizeof(khval_t)); \
	286	} \
	287	kfree(h->flags); /* free the working space */ \
	288	h->flags = new_flags; \
	289	h->n_buckets = new_n_buckets; \
	290	h->n_occupied = h->size; \
	291	} \
	292	return 0; \
	293	} \
	294	SCOPE khint_t kh_put_##name(kh_##name##_t h, khkey_t key, int ret) \
	295	{ \
	296	khint_t x; \
	297	if (h->n_occupied >= (h->n_buckets>>2) + (h->n_buckets>>1)) { /* update the hash table */ \
	298	if (h->n_buckets > (h->size<<1)) { \
	299	if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
	300	*ret = -1; return h->n_buckets; \
	301	} \
	302	} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
	303	*ret = -1; return h->n_buckets; \
	304	} \
	305	} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
	306	{ \
	307	khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
	308	x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
	309	if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
	310	else { \
	311	last = i; \
	312	while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) \|\| !__hash_equal(h->keys[i], key))) { \
	313	if (__ac_isdel(h->flags, i)) site = i; \
	314	i = (i + (++step)) & mask; \
	315	if (i == last) { x = site; break; } \
	316	} \
	317	if (x == h->n_buckets) { \
	318	if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
	319	else x = i; \
	320	} \
	321	} \
	322	} \
	323	if (__ac_isempty(h->flags, x)) { /* not present at all */ \
	324	h->keys[x] = key; \
	325	__ac_set_isboth_false(h->flags, x); \
	326	++h->size; ++h->n_occupied; \
	327	*ret = 1; \
	328	} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
	329	h->keys[x] = key; \
	330	__ac_set_isboth_false(h->flags, x); \
	331	++h->size; \
	332	*ret = 2; \
	333	} else ret = 0; / Don't touch h->keys[x] if present and not deleted */ \
	334	return x; \
	335	} \
	336	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
	337	{ \
	338	if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
	339	__ac_set_isdel_true(h->flags, x); \
	340	--h->size; \
	341	} \
	342	}
	343
	344	#define KHASH_DECLARE(name, khkey_t, khval_t) \
	345	__KHASH_TYPE(name, khkey_t, khval_t) \
	346	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
	347
	348	#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	349	__KHASH_TYPE(name, khkey_t, khval_t) \
	350	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
	351
	352	#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	353	KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
	354
	355	/* --- BEGIN OF HASH FUNCTIONS --- */
	356
	357	/*! @function
	358	@abstract Integer hash function
	359	@param key The integer [khint32_t]
	360	@return The hash value [khint_t]
	361	*/
	362	#define kh_int_hash_func(key) (khint32_t)(key)
	363	/*! @function
	364	@abstract Integer comparison function
	365	*/
	366	#define kh_int_hash_equal(a, b) ((a) == (b))
	367	/*! @function
	368	@abstract 64-bit integer hash function
	369	@param key The integer [khint64_t]
	370	@return The hash value [khint_t]
	371	*/
	372	#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
	373	/*! @function
	374	@abstract 64-bit integer comparison function
	375	*/
	376	#define kh_int64_hash_equal(a, b) ((a) == (b))
	377	/*! @function
	378	@abstract const char* hash function
	379	@param s Pointer to a null terminated string
	380	@return The hash value
	381	*/
	382	static kh_inline khint_t __ac_X31_hash_string(const char *s)
	383	{
	384	khint_t h = (khint_t)*s;
	385	if (h) for (++s ; s; ++s) h = (h << 5) - h + (khint_t)s;
	386	return h;
	387	}
	388	/*! @function
	389	@abstract Another interface to const char* hash function
	390	@param key Pointer to a null terminated string [const char*]
	391	@return The hash value [khint_t]
	392	*/
	393	#define kh_str_hash_func(key) __ac_X31_hash_string(key)
	394	/*! @function
	395	@abstract Const char* comparison function
	396	*/
	397	#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
	398
	399	static kh_inline khint_t __ac_Wang_hash(khint_t key)
	400	{
	401	key += ~(key << 15);
	402	key ^= (key >> 10);
	403	key += (key << 3);
	404	key ^= (key >> 6);
	405	key += ~(key << 11);
	406	key ^= (key >> 16);
	407	return key;
	408	}
	/* Alternative integer hash. The parameter name must match the name used in
	 * the expansion; the argument is parenthesised so that the cast applies to
	 * the whole expression passed by the caller. */
	#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key))
	410
	411	/* --- END OF HASH FUNCTIONS --- */
	412
	413	/* Other convenient macros... */
	414
	415	/*!
	416	@abstract Type of the hash table.
	417	@param name Name of the hash table [symbol]
	418	*/
	419	#define khash_t(name) kh_##name##_t
	420
	421	/*! @function
	422	@abstract Initiate a hash table.
	423	@param name Name of the hash table [symbol]
	424	@return Pointer to the hash table [khash_t(name)*]
	425	*/
	426	#define kh_init(name) kh_init_##name()
	427
	428	/*! @function
	429	@abstract Destroy a hash table.
	430	@param name Name of the hash table [symbol]
	431	@param h Pointer to the hash table [khash_t(name)*]
	432	*/
	433	#define kh_destroy(name, h) kh_destroy_##name(h)
	434
	435	/*! @function
	436	@abstract Reset a hash table without deallocating memory.
	437	@param name Name of the hash table [symbol]
	438	@param h Pointer to the hash table [khash_t(name)*]
	439	*/
	440	#define kh_clear(name, h) kh_clear_##name(h)
	441
	442	/*! @function
	443	@abstract Resize a hash table.
	444	@param name Name of the hash table [symbol]
	445	@param h Pointer to the hash table [khash_t(name)*]
	446	@param s New size [khint_t]
	447	*/
	448	#define kh_resize(name, h, s) kh_resize_##name(h, s)
	449
	450	/*! @function
	451	@abstract Insert a key to the hash table.
	452	@param name Name of the hash table [symbol]
	453	@param h Pointer to the hash table [khash_t(name)*]
	454	@param k Key [type of keys]
	455	@param r Extra return code: 0 if the key is present in the hash table;
	456	1 if the bucket is empty (never used); 2 if the element in
	457	the bucket has been deleted [int*]
	458	@return Iterator to the inserted element [khint_t]
	459	*/
	460	#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
	461
	462	/*! @function
	463	@abstract Retrieve a key from the hash table.
	464	@param name Name of the hash table [symbol]
	465	@param h Pointer to the hash table [khash_t(name)*]
	466	@param k Key [type of keys]
	467	@return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
	468	*/
	469	#define kh_get(name, h, k) kh_get_##name(h, k)
	470
	471	/*! @function
	472	@abstract Remove a key from the hash table.
	473	@param name Name of the hash table [symbol]
	474	@param h Pointer to the hash table [khash_t(name)*]
	475	@param k Iterator to the element to be deleted [khint_t]
	476	*/
	477	#define kh_del(name, h, k) kh_del_##name(h, k)
	478
	479	/*! @function
	480	@abstract Test whether a bucket contains data.
	481	@param h Pointer to the hash table [khash_t(name)*]
	482	@param x Iterator to the bucket [khint_t]
	483	@return 1 if containing data; 0 otherwise [int]
	484	*/
	485	#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
	486
	487	/*! @function
	488	@abstract Get key given an iterator
	489	@param h Pointer to the hash table [khash_t(name)*]
	490	@param x Iterator to the bucket [khint_t]
	491	@return Key [type of keys]
	492	*/
	493	#define kh_key(h, x) ((h)->keys[x])
	494
	495	/*! @function
	496	@abstract Get value given an iterator
	497	@param h Pointer to the hash table [khash_t(name)*]
	498	@param x Iterator to the bucket [khint_t]
	499	@return Value [type of values]
	500	@discussion For hash sets, calling this results in segfault.
	501	*/
	502	#define kh_val(h, x) ((h)->vals[x])
	503
	504	/*! @function
	505	@abstract Alias of kh_val()
	506	*/
	507	#define kh_value(h, x) ((h)->vals[x])
	508
	509	/*! @function
	510	@abstract Get the start iterator
	511	@param h Pointer to the hash table [khash_t(name)*]
	512	@return The start iterator [khint_t]
	513	*/
	514	#define kh_begin(h) (khint_t)(0)
	515
	516	/*! @function
	517	@abstract Get the end iterator
	518	@param h Pointer to the hash table [khash_t(name)*]
	519	@return The end iterator [khint_t]
	520	*/
	521	#define kh_end(h) ((h)->n_buckets)
	522
	523	/*! @function
	524	@abstract Get the number of elements in the hash table
	525	@param h Pointer to the hash table [khash_t(name)*]
	526	@return Number of elements in the hash table [khint_t]
	527	*/
	528	#define kh_size(h) ((h)->size)
	529
	530	/*! @function
	531	@abstract Get the number of buckets in the hash table
	532	@param h Pointer to the hash table [khash_t(name)*]
	533	@return Number of buckets in the hash table [khint_t]
	534	*/
	535	#define kh_n_buckets(h) ((h)->n_buckets)
	536
	537	/*! @function
	538	@abstract Iterate over the entries in the hash table
	539	@param h Pointer to the hash table [khash_t(name)*]
	540	@param kvar Variable to which key will be assigned
	541	@param vvar Variable to which value will be assigned
	542	@param code Block of code to execute
	543	*/
	544	#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
	545	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
	546	if (!kh_exist(h,__i)) continue; \
	547	(kvar) = kh_key(h,__i); \
	548	(vvar) = kh_val(h,__i); \
	549	code; \
	550	} }
	551
	552	/*! @function
	553	@abstract Iterate over the values in the hash table
	554	@param h Pointer to the hash table [khash_t(name)*]
	555	@param vvar Variable to which value will be assigned
	556	@param code Block of code to execute
	557	*/
	558	#define kh_foreach_value(h, vvar, code) { khint_t __i; \
	559	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
	560	if (!kh_exist(h,__i)) continue; \
	561	(vvar) = kh_val(h,__i); \
	562	code; \
	563	} }
	564
	565	/* More convenient interfaces */
	566
	567	/*! @function
	568	@abstract Instantiate a hash set containing integer keys
	569	@param name Name of the hash table [symbol]
	570	*/
	571	#define KHASH_SET_INIT_INT(name) \
	572	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
	573
	574	/*! @function
	575	@abstract Instantiate a hash map containing integer keys
	576	@param name Name of the hash table [symbol]
	577	@param khval_t Type of values [type]
	578	*/
	579	#define KHASH_MAP_INIT_INT(name, khval_t) \
	580	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
	581
	582	/*! @function
	583	@abstract Instantiate a hash set containing 64-bit integer keys
	584	@param name Name of the hash table [symbol]
	585	*/
	586	#define KHASH_SET_INIT_INT64(name) \
	587	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
	588
	589	/*! @function
	590	@abstract Instantiate a hash map containing 64-bit integer keys
	591	@param name Name of the hash table [symbol]
	592	@param khval_t Type of values [type]
	593	*/
	594	#define KHASH_MAP_INIT_INT64(name, khval_t) \
	595	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
	596
	597	typedef const char *kh_cstr_t;
	598	/*! @function
	599	@abstract Instantiate a hash set containing const char* keys
	600	@param name Name of the hash table [symbol]
	601	*/
	602	#define KHASH_SET_INIT_STR(name) \
	603	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
	604
	605	/*! @function
	606	@abstract Instantiate a hash map containing const char* keys
	607	@param name Name of the hash table [symbol]
	608	@param khval_t Type of values [type]
	609	*/
	610	#define KHASH_MAP_INIT_STR(name, khval_t) \
	611	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
	612
	613	#endif /* __AC_KHASH_H */

+106

-0

third_party/fermi-lite-0.1/kmer.h less more

	0	#ifndef BFC_KMER_H
	1	#define BFC_KMER_H
	2
	3	#include <stdint.h>
	4
	5	typedef struct {
	6	uint64_t x[4];
	7	} bfc_kmer_t;
	8
	/*
	 * Append one base c (2-bit code) to a k-mer held as four bit planes.
	 * x[0] and x[1] take the low and high bit of c at bit 0 after shifting left;
	 * x[2] and x[3] take the complemented low and high bit at bit k-1 after
	 * shifting right, so they hold the complemented bases in reverse order.
	 * NOTE(review): the shifts assume 0 < k < 64 -- confirm against callers.
	 */
	static inline void bfc_kmer_append(int k, uint64_t x[4], int c)
	{ // IMPORTANT: 0 <= c < 4
		uint64_t mask = (1ULL<<k) - 1;
		x[0] = (x[0]<<1 | (c&1)) & mask;
		x[1] = (x[1]<<1 | (c>>1)) & mask;
		x[2] = x[2]>>1 | (1ULL^(c&1))<<(k-1);
		x[3] = x[3]>>1 | (1ULL^c>>1) <<(k-1);
	}
	17
	/*
	 * Overwrite a single base of a k-mer in place with base c (2-bit code).
	 * Bit d is replaced in x[0]/x[1]; the mirrored bit k-1-d is replaced with
	 * the complemented value in x[2]/x[3].
	 * NOTE(review): the shifts assume 0 < k < 64 -- confirm against callers.
	 */
	static inline void bfc_kmer_change(int k, uint64_t x[4], int d, int c) // d-bp from the 3'-end of k-mer; 0<=d<k
	{ // IMPORTANT: 0 <= c < 4
		uint64_t t = ~(1ULL<<d);                  /* clears bit d */
		x[0] = (uint64_t) (c&1)<<d | (x[0]&t);
		x[1] = (uint64_t)(c>>1)<<d | (x[1]&t);
		t = ~(1ULL<<(k-1-d));                     /* clears the mirrored bit */
		x[2] = (uint64_t)(1^(c&1))<<(k-1-d) | (x[2]&t);
		x[3] = (uint64_t)(1^ c>>1)<<(k-1-d) | (x[3]&t);
	}
	27
	28	// Thomas Wang's integer hash functions. See <https://gist.github.com/lh3/59882d6b96166dfc3d8d> for a snapshot.
	/* Mix a key with alternating multiply/add and xor-shift rounds, reducing the
	 * value with `mask` after every arithmetic round. */
	static inline uint64_t bfc_hash_64(uint64_t key, uint64_t mask)
	{
		uint64_t h = key;
		h = (~h + (h << 21)) & mask;   /* same as (h << 21) - h - 1 */
		h ^= h >> 24;
		h = (h * 265) & mask;          /* h + (h << 3) + (h << 8) */
		h ^= h >> 14;
		h = (h * 21) & mask;           /* h + (h << 2) + (h << 4) */
		h ^= h >> 28;
		h = (h + (h << 31)) & mask;
		return h;
	}
	40
	/* Undo the rounds of bfc_hash_64() in reverse order, using the same mask. */
	static inline uint64_t bfc_hash_64_inv(uint64_t key, uint64_t mask)
	{
		uint64_t t;

		/* undo: key = key + (key << 31) */
		t = key - (key << 31);
		key = (key - (t << 31)) & mask;

		/* undo: key = key ^ (key >> 28) */
		t = key ^ (key >> 28);
		key ^= t >> 28;

		/* undo: key *= 21 (multiply by the modular inverse) */
		key = (key * 14933078535860113213ull) & mask;

		/* undo: key = key ^ (key >> 14) */
		t = key ^ (key >> 14);
		t = key ^ (t >> 14);
		t = key ^ (t >> 14);
		key ^= t >> 14;

		/* undo: key *= 265 (multiply by the modular inverse) */
		key = (key * 15244667743933553977ull) & mask;

		/* undo: key = key ^ (key >> 24) */
		t = key ^ (key >> 24);
		key ^= t >> 24;

		/* undo: key = (~key) + (key << 21) */
		t = ~key;
		t = ~(key - (t << 21));
		t = ~(key - (t << 21));
		key = ~(key - (t << 21)) & mask;

		return key;
	}
	77
	/*
	 * Hash a k-mer stored as four bit planes.
	 * u selects the plane pair x[0],x[1] (u == 0) or x[2],x[3] (u == 1) by
	 * comparing bit k/2 of x[1] and x[3]. On return h[0] holds
	 * (hash0 + hash1) & mask and h[1] holds hash1; the returned value packs
	 * (hash0 ^ hash1) above the low k bits, which hold (hash0 + hash1) & mask.
	 * NOTE(review): the shifts assume 0 < k < 64 -- confirm against callers.
	 */
	static inline uint64_t bfc_kmer_hash(int k, const uint64_t x[4], uint64_t h[2])
	{
		int t = k>>1, u = ((x[1]>>t&1) > (x[3]>>t&1)); // the middle base is always different
		uint64_t mask = (1ULL<<k) - 1, ret;
		h[0] = bfc_hash_64((x[u<<1|0] + x[u<<1|1]) & mask, mask);
		h[1] = bfc_hash_64(h[0] ^ x[u<<1|1], mask);
		ret = (h[0] ^ h[1]) << k | ((h[0] + h[1]) & mask);
		h[0] = (h[0] + h[1]) & mask;
		return ret;
	}
	88
	/* Recover two bit planes y[0], y[1] from the pair h[] written by
	 * bfc_kmer_hash(): h[0] is the masked sum of both hashes, h[1] the second. */
	static inline void bfc_kmer_hash_inv(int k, const uint64_t h[2], uint64_t y[2])
	{
		const uint64_t mask = (1ULL<<k) - 1;
		const uint64_t first = (h[0] - h[1]) & mask;   /* first hash value */

		y[1] = bfc_hash_64_inv(h[1], mask) ^ first;
		y[0] = (bfc_hash_64_inv(first, mask) - y[1]) & mask;
	}
	95
	/*
	 * Render a k-mer as a NUL-terminated string of A/C/G/T.
	 * y[0] and y[1] are the low-bit and high-bit planes; bit l describes the
	 * base written to buf[k-1-l]. buf must hold at least k+1 bytes.
	 * Returns buf.
	 */
	static inline char *bfc_kmer_2str(int k, const uint64_t y[2], char *buf)
	{
		int l;
		for (l = 0; l < k; ++l)
			buf[k - 1 - l] = "ACGT"[(y[1]>>l&1)<<1 | (y[0]>>l&1)];
		buf[k] = 0;
		return buf;
	}
	104
	105	#endif

+248

-0

third_party/fermi-lite-0.1/kseq.h less more

	0	/* The MIT License
	1
	2	Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	/* Last Modified: 05MAR2012 */
	26
	27	#ifndef AC_KSEQ_H
	28	#define AC_KSEQ_H
	29
	30	#include <ctype.h>
	31	#include <string.h>
	32	#include <stdlib.h>
	33
	34	#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
	35	#define KS_SEP_TAB 1 // isspace() && !' '
	36	#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
	37	#define KS_SEP_MAX 2
	38
	39	#define __KS_TYPE(type_t) \
	40	typedef struct __kstream_t { \
	41	int begin, end; \
	42	int is_eof:2, bufsize:30; \
	43	type_t f; \
	44	unsigned char *buf; \
	45	} kstream_t;
	46
	47	#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
	48	#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
	49
	/*
	 * Generates ks_init() and ks_destroy() for a stream over handle type type_t.
	 * ks_init() allocates the stream and a __bufsize-byte buffer and returns
	 * NULL if either allocation fails; ks_destroy() accepts NULL.
	 */
	#define __KS_BASIC(SCOPE, type_t, __bufsize) \
		SCOPE kstream_t *ks_init(type_t f) \
		{ \
			kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
			if (!ks) return 0; \
			ks->f = f; ks->bufsize = __bufsize; \
			ks->buf = (unsigned char*)malloc(__bufsize); \
			if (!ks->buf) { free(ks); return 0; } \
			return ks; \
		} \
		SCOPE void ks_destroy(kstream_t *ks) \
		{ \
			if (!ks) return; \
			free(ks->buf); \
			free(ks); \
		}
	64
	/*
	 * Generates ks_getc() and ks_getuntil() on top of the reader __read.
	 * ks_getc() returns the next byte, or -1 at end of input. A zero or
	 * negative __read result is treated as end of input so that a failed
	 * read never hands back a stale buffer byte.
	 * ks_getuntil() is ks_getuntil2() with append == 0.
	 */
	#define __KS_INLINED(__read) \
		static inline int ks_getc(kstream_t *ks) \
		{ \
			if (ks->is_eof && ks->begin >= ks->end) return -1; \
			if (ks->begin >= ks->end) { \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end < ks->bufsize) ks->is_eof = 1; \
				if (ks->end <= 0) return -1; \
			} \
			return (int)ks->buf[ks->begin++]; \
		} \
		static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
		{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
	79
	80	#ifndef KSTRING_T
	81	#define KSTRING_T kstring_t
	82	typedef struct __kstring_t {
	83	size_t l, m;
	84	char *s;
	85	} kstring_t;
	86	#endif
	87
	#ifndef kroundup32
	/* Round x up in place to the next power of two (x unchanged if it already is
	 * one) by smearing the highest set bit of x-1 into every lower position. */
	#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
	#endif
	91
	/*
	 * Generates ks_getuntil2(): read from ks into str up to (not including) the
	 * next delimiter. delimiter is KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a
	 * literal character code greater than KS_SEP_MAX. If dret is non-NULL it
	 * receives the delimiter byte that ended the read (0 if none was seen).
	 * With append != 0 the data is added after the current contents of str.
	 * Returns the resulting str->l, or -1 if the stream was already exhausted.
	 */
	#define __KS_GETUNTIL(SCOPE, __read) \
		SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
		{ \
			if (dret) *dret = 0; \
			str->l = append? str->l : 0; \
			if (ks->begin >= ks->end && ks->is_eof) return -1; \
			for (;;) { \
				int i; \
				if (ks->begin >= ks->end) { /* buffer consumed: refill or stop */ \
					if (!ks->is_eof) { \
						ks->begin = 0; \
						ks->end = __read(ks->f, ks->buf, ks->bufsize); \
						if (ks->end < ks->bufsize) ks->is_eof = 1; \
						if (ks->end == 0) break; \
					} else break; \
				} \
				if (delimiter == KS_SEP_LINE) { \
					for (i = ks->begin; i < ks->end; ++i) \
						if (ks->buf[i] == '\n') break; \
				} else if (delimiter > KS_SEP_MAX) { \
					for (i = ks->begin; i < ks->end; ++i) \
						if (ks->buf[i] == delimiter) break; \
				} else if (delimiter == KS_SEP_SPACE) { \
					for (i = ks->begin; i < ks->end; ++i) \
						if (isspace(ks->buf[i])) break; \
				} else if (delimiter == KS_SEP_TAB) { \
					for (i = ks->begin; i < ks->end; ++i) \
						if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
				} else i = 0; /* never come to here! */ \
				if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* grow, leaving room for the NUL */ \
					str->m = str->l + (i - ks->begin) + 1; \
					kroundup32(str->m); \
					str->s = (char*)realloc(str->s, str->m); \
				} \
				memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
				str->l = str->l + (i - ks->begin); \
				ks->begin = i + 1; \
				if (i < ks->end) { /* stopped on a delimiter, not on the end of the buffer */ \
					if (dret) *dret = ks->buf[i]; \
					break; \
				} \
			} \
			if (str->s == 0) { \
				str->m = 1; \
				str->s = (char*)calloc(1, 1); \
			} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
			str->s[str->l] = '\0'; \
			return str->l; \
		}
	141
	142	#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	143	__KS_TYPE(type_t) \
	144	__KS_BASIC(SCOPE, type_t, __bufsize) \
	145	__KS_GETUNTIL(SCOPE, __read) \
	146	__KS_INLINED(__read)
	147
	148	#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
	149
	/* Declares the stream type and the externally defined stream functions;
	 * the prototypes must match the definitions generated by KSTREAM_INIT2. */
	#define KSTREAM_DECLARE(type_t, __read) \
		__KS_TYPE(type_t) \
		extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
		extern kstream_t *ks_init(type_t f); \
		extern void ks_destroy(kstream_t *ks); \
		__KS_INLINED(__read)
	156
	157	/******************
	158	* FASTA/Q parser *
	159	******************/
	160
	161	#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
	162
	/*
	 * Generates kseq_init() and kseq_destroy().
	 * kseq_init() allocates a zeroed record and its stream and returns NULL if
	 * an allocation fails; kseq_destroy() accepts NULL and frees the four
	 * string buffers, the stream and the record.
	 */
	#define __KSEQ_BASIC(SCOPE, type_t) \
		SCOPE kseq_t *kseq_init(type_t fd) \
		{ \
			kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
			if (!s) return 0; \
			s->f = ks_init(fd); \
			if (!s->f) { free(s); return 0; } \
			return s; \
		} \
		SCOPE void kseq_destroy(kseq_t *ks) \
		{ \
			if (!ks) return; \
			free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
			ks_destroy(ks->f); \
			free(ks); \
		}
	177
	178	/* Return value:
	179	>=0 length of the sequence (normal)
	180	-1 end-of-file
	181	-2 truncated quality string
	182	*/
	/* Generates kseq_read(): parse the next FASTA/FASTQ record into seq.
	 * Returns the sequence length, -1 at end of file, or -2 when the quality
	 * string is missing or its length differs from the sequence length. */
	#define __KSEQ_READ(SCOPE) \
		SCOPE int kseq_read(kseq_t *seq) \
		{ \
			int c; \
			kstream_t *ks = seq->f; \
			if (seq->last_char == 0) { /* then jump to the next header line */ \
				while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
				if (c == -1) return -1; /* end of file */ \
				seq->last_char = c; \
			} /* else: the first header char has been read in the previous call */ \
			seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
			if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
			if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
			if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
				seq->seq.m = 256; \
				seq->seq.s = (char*)malloc(seq->seq.m); \
			} \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
				if (c == '\n') continue; /* skip empty lines */ \
				seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
				ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
			} \
			if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
			if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
				seq->seq.m = seq->seq.l + 2; \
				kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
				seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
			} \
			seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
			if (c != '+') return seq->seq.l; /* FASTA */ \
			if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
				seq->qual.m = seq->seq.m; \
				seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
			} \
			while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
			if (c == -1) return -2; /* error: no quality string */ \
			while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
			seq->last_char = 0;	/* we have not come to the next header line */ \
			if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
			return seq->seq.l; \
		}
	224
	225	#define __KSEQ_TYPE(type_t) \
	226	typedef struct { \
	227	kstring_t name, comment, seq, qual; \
	228	int last_char; \
	229	kstream_t *f; \
	230	} kseq_t;
	231
	232	#define KSEQ_INIT2(SCOPE, type_t, __read) \
	233	KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
	234	__KSEQ_TYPE(type_t) \
	235	__KSEQ_BASIC(SCOPE, type_t) \
	236	__KSEQ_READ(SCOPE)
	237
	238	#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
	239
	240	#define KSEQ_DECLARE(type_t) \
	241	__KS_TYPE(type_t) \
	242	__KSEQ_TYPE(type_t) \
	243	extern kseq_t *kseq_init(type_t fd); \
	244	void kseq_destroy(kseq_t *ks); \
	245	int kseq_read(kseq_t *seq);
	246
	247	#endif

+309

-0

third_party/fermi-lite-0.1/ksort.h less more

	0	/* The MIT License
	1
	2	Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	/*
	26	2011-04-10 (0.1.6):
	27
	28	* Added sample
	29
	30	2011-03 (0.1.5):
	31
	32	* Added shuffle/permutation
	33
	34	2008-11-16 (0.1.4):
	35
	36	* Fixed a bug in introsort() that happens in rare cases.
	37
	38	2008-11-05 (0.1.3):
	39
	40	* Fixed a bug in introsort() for complex comparisons.
	41
	42	* Fixed a bug in mergesort(). The previous version is not stable.
	43
	44	2008-09-15 (0.1.2):
	45
	46	* Accelerated introsort. On my Mac (not on another Linux machine),
	47	my implementation is as fast as std::sort on random input.
	48
	49	* Added combsort and in introsort, switch to combsort if the
	50	recursion is too deep.
	51
	52	2008-09-13 (0.1.1):
	53
	54	* Added k-small algorithm
	55
	56	2008-09-05 (0.1.0):
	57
	58	* Initial version
	59
	60	*/
	61
	62	#ifndef AC_KSORT_H
	63	#define AC_KSORT_H
	64
	65	#include <stdlib.h>
	66	#include <string.h>
	67
	/* One pending sub-range for ks_introsort_*: pointers to its first and last
	 * element and the depth budget saved when the range was pushed. */
	typedef struct {
		void *left, *right;
		int depth;
	} ks_isort_stack_t;
	72
	73	#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
	74
	/*
	 * KSORT_INIT(name, type_t, __sort_lt) generates a family of functions over
	 * arrays of type_t, ordered by the strict "less than" predicate
	 * __sort_lt(a, b):
	 *   ks_mergesort_<name>  bottom-up merge sort; temp may be NULL, in which
	 *                        case a scratch array of n elements is allocated
	 *   ks_heapdown_<name>   sift element i down within a heap of n elements
	 *   ks_heapup_<name>     sift the last of n elements up
	 *   ks_heapmake_<name>   build a heap in place
	 *   ks_heapsort_<name>   sort an array that is already a heap
	 *   ks_combsort_<name>   comb sort finished by insertion sort
	 *   ks_introsort_<name>  quicksort with an explicit stack that falls back to
	 *                        comb sort when the depth budget runs out
	 *   ks_ksmall_<name>     return the kk-th smallest element (0 <= kk < n);
	 *                        reorders arr
	 *   ks_shuffle_<name>    shuffle using drand48()
	 *   ks_sample_<name>     marked untested by its author
	 * Pointer dereferences matter throughout: comparisons and swaps operate on
	 * the elements (*i, *j), never on the pointers themselves.
	 */
	#define KSORT_INIT(name, type_t, __sort_lt) \
		void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
		{ \
			type_t *a2[2], *a, *b; \
			int curr, shift; \
			\
			a2[0] = array; \
			a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
			for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
				a = a2[curr]; b = a2[1-curr]; \
				if (shift == 0) { /* first pass: order adjacent pairs */ \
					type_t *p = b, *i, *eb = a + n; \
					for (i = a; i < eb; i += 2) { \
						if (i == eb - 1) *p++ = *i; \
						else { \
							if (__sort_lt(*(i+1), *i)) { \
								*p++ = *(i+1); *p++ = *i; \
							} else { \
								*p++ = *i; *p++ = *(i+1); \
							} \
						} \
					} \
				} else { /* merge neighbouring runs of length step */ \
					size_t i, step = 1ul<<shift; \
					for (i = 0; i < n; i += step<<1) { \
						type_t *p, *j, *k, *ea, *eb; \
						if (n < i + step) { \
							ea = a + n; eb = a; \
						} else { \
							ea = a + i + step; \
							eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
						} \
						j = a + i; k = a + i + step; p = b + i; \
						while (j < ea && k < eb) { \
							if (__sort_lt(*k, *j)) *p++ = *k++; \
							else *p++ = *j++; \
						} \
						while (j < ea) *p++ = *j++; \
						while (k < eb) *p++ = *k++; \
					} \
				} \
				curr = 1 - curr; \
			} \
			if (curr == 1) { /* result ended up in the scratch array: copy back */ \
				type_t *p = a2[0], *i = a2[1], *eb = array + n; \
				for (; p < eb; ++i) *p++ = *i; \
			} \
			if (temp == 0) free(a2[1]); \
		} \
		void ks_heapdown_##name(size_t i, size_t n, type_t l[]) \
		{ \
			size_t k = i; \
			type_t tmp = l[i]; \
			while ((k = (k << 1) + 1) < n) { \
				if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
				if (__sort_lt(l[k], tmp)) break; \
				l[i] = l[k]; i = k; \
			} \
			l[i] = tmp; \
		} \
		void ks_heapup_##name(size_t n, type_t l[]) \
		{ \
			size_t i, k; \
			type_t tmp; \
			if (n == 0) return; /* n - 1 would wrap around */ \
			k = n - 1; \
			tmp = l[k]; \
			while (k) { \
				i = (k - 1) >> 1; \
				if (__sort_lt(tmp, l[i])) break; \
				l[k] = l[i]; k = i; \
			} \
			l[k] = tmp; \
		} \
		void ks_heapmake_##name(size_t lsize, type_t l[]) \
		{ \
			size_t i; \
			for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
				ks_heapdown_##name(i, lsize, l); \
		} \
		void ks_heapsort_##name(size_t lsize, type_t l[]) \
		{ \
			size_t i; \
			if (lsize < 2) return; /* lsize - 1 would wrap around for an empty array */ \
			for (i = lsize - 1; i > 0; --i) { \
				type_t tmp; \
				tmp = *l; *l = l[i]; l[i] = tmp; ks_heapdown_##name(0, i, l); \
			} \
		} \
		static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
		{ \
			type_t *i, *j, swap_tmp; \
			for (i = s + 1; i < t; ++i) \
				for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
					swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
				} \
		} \
		void ks_combsort_##name(size_t n, type_t a[]) \
		{ \
			const double shrink_factor = 1.2473309501039786540366528676643; \
			int do_swap; \
			size_t gap = n; \
			type_t tmp, *i, *j; \
			do { \
				if (gap > 2) { \
					gap = (size_t)(gap / shrink_factor); \
					if (gap == 9 || gap == 10) gap = 11; \
				} \
				do_swap = 0; \
				for (i = a; i < a + n - gap; ++i) { \
					j = i + gap; \
					if (__sort_lt(*j, *i)) { \
						tmp = *i; *i = *j; *j = tmp; \
						do_swap = 1; \
					} \
				} \
			} while (do_swap || gap > 2); \
			if (gap != 1) __ks_insertsort_##name(a, a + n); \
		} \
		void ks_introsort_##name(size_t n, type_t a[]) \
		{ \
			int d; \
			ks_isort_stack_t *top, *stack; \
			type_t rp, swap_tmp; \
			type_t *s, *t, *i, *j, *k; \
			\
			if (n < 1) return; \
			else if (n == 2) { \
				if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
				return; \
			} \
			for (d = 2; 1ul<<d < n; ++d); \
			stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
			top = stack; s = a; t = a + (n-1); d <<= 1; \
			while (1) { \
				if (s < t) { \
					if (--d == 0) { /* depth budget exhausted: finish this range with comb sort */ \
						ks_combsort_##name(t - s + 1, s); \
						t = s; \
						continue; \
					} \
					i = s; j = t; k = i + ((j-i)>>1) + 1; \
					if (__sort_lt(*k, *i)) { \
						if (__sort_lt(*k, *j)) k = j; \
					} else k = __sort_lt(*j, *i)? i : j; \
					rp = *k; \
					if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
					for (;;) { \
						do ++i; while (__sort_lt(*i, rp)); \
						do --j; while (i <= j && __sort_lt(rp, *j)); \
						if (j <= i) break; \
						swap_tmp = *i; *i = *j; *j = swap_tmp; \
					} \
					swap_tmp = *i; *i = *t; *t = swap_tmp; \
					if (i-s > t-i) { \
						if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
						s = t-i > 16? i+1 : t; \
					} else { \
						if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
						t = i-s > 16? i-1 : s; \
					} \
				} else { \
					if (top == stack) { \
						free(stack); \
						__ks_insertsort_##name(a, a+n); \
						return; \
					} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
				} \
			} \
		} \
		/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
		/* 0 <= kk < n */ \
		type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
		{ \
			type_t *low, *high, *k, *ll, *hh, *mid; \
			low = arr; high = arr + n - 1; k = arr + kk; \
			for (;;) { \
				if (high <= low) return *k; \
				if (high == low + 1) { \
					if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
					return *k; \
				} \
				mid = low + (high - low) / 2; \
				if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
				if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
				KSORT_SWAP(type_t, *mid, *(low+1)); \
				ll = low + 1; hh = high; \
				for (;;) { \
					do ++ll; while (__sort_lt(*ll, *low)); \
					do --hh; while (__sort_lt(*low, *hh)); \
					if (hh < ll) break; \
					KSORT_SWAP(type_t, *ll, *hh); \
				} \
				KSORT_SWAP(type_t, *low, *hh); \
				if (hh <= k) low = ll; \
				if (hh >= k) high = hh - 1; \
			} \
		} \
		void ks_shuffle_##name(size_t n, type_t a[]) \
		{ \
			int i, j; \
			for (i = n; i > 1; --i) { \
				type_t tmp; \
				j = (int)(drand48() * i); \
				tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
			} \
		} \
		void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \
		{ /* reference: http://code.activestate.com/recipes/272884/ */ \
			int i, k, pop = n; \
			for (i = (int)r, k = 0; i >= 0; --i) { \
				double z = 1., x = drand48(); \
				type_t tmp; \
				while (x < z) z -= z * i / (pop--); \
				if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \
				++k; \
			} \
		}
	290
	/* Dispatch wrappers: each pastes `name` onto the routine prefix, so
	 * ks_introsort(foo, n, a) expands to ks_introsort_foo(n, a). */
	291	#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
	292	#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
	293	#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
	294	#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
	295	#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
	296	#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
	297	#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
	298	#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
	299
	/* Ready-made "less than" predicates: operator< and strcmp() ordering. */
	300	#define ks_lt_generic(a, b) ((a) < (b))
	301	#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
	302
	/* Element type used by the string instantiation below. */
	303	typedef const char *ksstr_t;
	304
	/* Shorthand instantiations: GENERIC uses the type name itself as the
	 * suffix; STR instantiates for ksstr_t under the suffix `str`. */
	305	#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
	306	#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
	307
	308	#endif

+169

-0

third_party/fermi-lite-0.1/kstring.h less more

	0	/* The MIT License
	1
	2	Copyright (c) by Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	#ifndef KSTRING_H
	26	#define KSTRING_H
	27
	28	#include <stdlib.h>
	29	#include <string.h>
	30	#include <stdint.h>
	31
	#ifndef kroundup32
	/* Round x up, in place, to the next power of two (a value that already is
	 * a power of two is left unchanged). The bit smearing covers 32 bits.
	 * x must be a modifiable lvalue and is evaluated several times. */
	#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
	#endif
	35
	/* Growable character buffer. The kput*() helpers in this header keep
	 * s[l] == 0 after every append and grow the allocation through m. */
	36	#ifndef KSTRING_T
	37	#define KSTRING_T kstring_t
	38	typedef struct __kstring_t {
	39	size_t l, m;
	40	char *s;
	41	} kstring_t;
	42	#endif
	43
	/* Auxiliary state passed to kstrtok(). */
	44	typedef struct {
	45	uint64_t tab[4];
	46	int sep, finished;
	47	const char *p; // end of the current token
	48	} ks_tokaux_t;
	49
	50	#ifdef __cplusplus
	51	extern "C" {
	52	#endif
	53
	54	int ksprintf(kstring_t s, const char fmt, ...);
	55	int ksprintf_fast(kstring_t s, const char fmt, ...);
	56	int ksplit_core(char s, int delimiter, int _max, int **_offsets);
	57	char kstrstr(const char str, const char pat, int *_prep);
	58	char kstrnstr(const char str, const char pat, int n, int *_prep);
	59	void kmemmem(const void _str, int n, const void _pat, int m, int *_prep);
	60
	61	/* kstrtok() is similar to strtok_r() except that str is not
	62	* modified and both str and sep can be NULL. For efficiency, it is
	63	* actually recommended to set both to NULL in the subsequent calls
	64	* if sep is not changed. */
	65	char kstrtok(const char str, const char sep, ks_tokaux_t aux);
	66
	67	#ifdef __cplusplus
	68	}
	69	#endif
	70
	71	static inline void ks_resize(kstring_t *s, size_t size)
	72	{
	73	if (s->m < size) {
	74	s->m = size;
	75	kroundup32(s->m);
	76	s->s = (char*)realloc(s->s, s->m);
	77	}
	78	}
	79
	80	static inline int kputsn(const char p, int l, kstring_t s)
	81	{
	82	if (s->l + l + 1 >= s->m) {
	83	s->m = s->l + l + 2;
	84	kroundup32(s->m);
	85	s->s = (char*)realloc(s->s, s->m);
	86	}
	87	memcpy(s->s + s->l, p, l);
	88	s->l += l;
	89	s->s[s->l] = 0;
	90	return l;
	91	}
	92
	93	static inline int kputs(const char p, kstring_t s)
	94	{
	95	return kputsn(p, strlen(p), s);
	96	}
	97
	98	static inline int kputc(int c, kstring_t *s)
	99	{
	100	if (s->l + 1 >= s->m) {
	101	s->m = s->l + 2;
	102	kroundup32(s->m);
	103	s->s = (char*)realloc(s->s, s->m);
	104	}
	105	s->s[s->l++] = c;
	106	s->s[s->l] = 0;
	107	return c;
	108	}
	109
	110	static inline int kputw(int c, kstring_t *s)
	111	{
	112	char buf[16];
	113	int l, x;
	114	if (c == 0) return kputc('0', s);
	115	for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
	116	if (c < 0) buf[l++] = '-';
	117	if (s->l + l + 1 >= s->m) {
	118	s->m = s->l + l + 2;
	119	kroundup32(s->m);
	120	s->s = (char*)realloc(s->s, s->m);
	121	}
	122	for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
	123	s->s[s->l] = 0;
	124	return 0;
	125	}
	126
	127	static inline int kputuw(unsigned c, kstring_t *s)
	128	{
	129	char buf[16];
	130	int l, i;
	131	unsigned x;
	132	if (c == 0) return kputc('0', s);
	133	for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
	134	if (s->l + l + 1 >= s->m) {
	135	s->m = s->l + l + 2;
	136	kroundup32(s->m);
	137	s->s = (char*)realloc(s->s, s->m);
	138	}
	139	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
	140	s->s[s->l] = 0;
	141	return 0;
	142	}
	143
	144	static inline int kputl(long c, kstring_t *s)
	145	{
	146	char buf[32];
	147	long l, x;
	148	if (c == 0) return kputc('0', s);
	149	for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
	150	if (c < 0) buf[l++] = '-';
	151	if (s->l + l + 1 >= s->m) {
	152	s->m = s->l + l + 2;
	153	kroundup32(s->m);
	154	s->s = (char*)realloc(s->s, s->m);
	155	}
	156	for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
	157	s->s[s->l] = 0;
	158	return 0;
	159	}
	160
	161	static inline int ksplit(kstring_t s, int delimiter, int *n)
	162	{
	163	int max = 0, *offsets = 0;
	164	*n = ksplit_core(s->s, delimiter, &max, &offsets);
	165	return offsets;
	166	}
	167
	168	#endif

+352

-0

third_party/fermi-lite-0.1/ksw.c less more

	0	/* The MIT License
	1
	2	Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	#include <stdlib.h>
	26	#include <stdint.h>
	27	#include <emmintrin.h>
	28	#include "ksw.h"
	29
	/* Branch-prediction hints; they reduce to the bare expression when the
	 * compiler does not define __GNUC__. */
	30	#ifdef __GNUC__
	31	#define LIKELY(x) __builtin_expect((x),1)
	32	#define UNLIKELY(x) __builtin_expect((x),0)
	33	#else
	34	#define LIKELY(x) (x)
	35	#define UNLIKELY(x) (x)
	36	#endif
	37
	/* Default result copied into r at the start of ksw_u8()/ksw_i16():
	 * first field 0, the remaining six fields -1. */
	38	const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
	39
	/* Query profile built by ksw_qinit(). The five vector pointers all point
	 * into the same allocation as the struct itself. */
	struct _kswq_t {
		int qlen, slen; /* query length; number of __m128i segments per row */
		uint8_t shift, mdiff, max, size; /* size: bytes per score (1 or 2) */
		__m128i *qp, *H0, *H1, *E, *Hmax;
	};
	45
	46	/**
	47	* Initialize the query data structure
	48	*
	49	* @param size Number of bytes used to store a score; valid valures are 1 or 2
	50	* @param qlen Length of the query sequence
	51	* @param query Query sequence
	52	* @param m Size of the alphabet
	53	* @param mat Scoring matrix in a one-dimension array
	54	*
	55	* @return Query data structure
	56	*/
	57	kswq_t ksw_qinit(int size, int qlen, const uint8_t query, int m, const int8_t *mat)
	58	{
	59	kswq_t *q;
	60	int slen, a, tmp, p;
	61
	62	size = size > 1? 2 : 1;
	63	p = 8 * (3 - size); // # values per __m128i
	64	slen = (qlen + p - 1) / p; // segmented length
	65	q = (kswq_t)malloc(sizeof(kswq_t) + 256 + 16 slen * (m + 4)); // a single block of memory
	66	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
	67	q->H0 = q->qp + slen * m;
	68	q->H1 = q->H0 + slen;
	69	q->E = q->H1 + slen;
	70	q->Hmax = q->E + slen;
	71	q->slen = slen; q->qlen = qlen; q->size = size;
	72	// compute shift
	73	tmp = m * m;
	74	for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
	75	if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
	76	if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
	77	}
	78	q->max = q->mdiff;
	79	q->shift = 256 - q->shift; // NB: q->shift is uint8_t
	80	q->mdiff += q->shift; // this is the difference between the min and max scores
	81	// An example: p=8, qlen=19, slen=3 and segmentation:
	82	// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
	83	if (size == 1) {
	84	int8_t t = (int8_t)q->qp;
	85	for (a = 0; a < m; ++a) {
	86	int i, k, nlen = slen * p;
	87	const int8_t ma = mat + a m;
	88	for (i = 0; i < slen; ++i)
	89	for (k = i; k < nlen; k += slen) // p iterations
	90	*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
	91	}
	92	} else {
	93	int16_t t = (int16_t)q->qp;
	94	for (a = 0; a < m; ++a) {
	95	int i, k, nlen = slen * p;
	96	const int8_t ma = mat + a m;
	97	for (i = 0; i < slen; ++i)
	98	for (k = i; k < nlen; k += slen) // p iterations
	99	*t++ = (k >= qlen? 0 : ma[query[k]]);
	100	}
	101	}
	102	return q;
	103	}
	104
	105	kswr_t ksw_u8(kswq_t q, int tlen, const uint8_t target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
	106	{
	107	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
	108	uint64_t *b;
	109	__m128i zero, gapoe, gape, shift, H0, H1, E, Hmax;
	110	kswr_t r;
	111
	112	#define __max_16(ret, xx) do { \
	113	(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
	114	(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
	115	(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
	116	(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
	117	(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
	118	} while (0)
	119
	120	// initialization
	121	r = g_defr;
	122	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
	123	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
	124	m_b = n_b = 0; b = 0;
	125	zero = _mm_set1_epi32(0);
	126	gapoe = _mm_set1_epi8(_gapo + _gape);
	127	gape = _mm_set1_epi8(_gape);
	128	shift = _mm_set1_epi8(q->shift);
	129	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
	130	slen = q->slen;
	131	for (i = 0; i < slen; ++i) {
	132	_mm_store_si128(E + i, zero);
	133	_mm_store_si128(H0 + i, zero);
	134	_mm_store_si128(Hmax + i, zero);
	135	}
	136	// the core loop
	137	for (i = 0; i < tlen; ++i) {
	138	int j, k, cmp, imax;
	139	__m128i e, h, f = zero, max = zero, S = q->qp + target[i] slen; // s is the 1st score vector
	140	h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
	141	h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
	142	for (j = 0; LIKELY(j < slen); ++j) {
	143	/* SW cells are computed in the following order:
	144	* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
	145	* E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
	146	* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
	147	*/
	148	// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
	149	h = _mm_adds_epu8(h, _mm_load_si128(S + j));
	150	h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
	151	e = _mm_load_si128(E + j); // e=E'(i,j)
	152	h = _mm_max_epu8(h, e);
	153	h = _mm_max_epu8(h, f); // h=H'(i,j)
	154	max = _mm_max_epu8(max, h); // set max
	155	_mm_store_si128(H1 + j, h); // save to H'(i,j)
	156	// now compute E'(i+1,j)
	157	h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
	158	e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
	159	e = _mm_max_epu8(e, h); // e=E'(i+1,j)
	160	_mm_store_si128(E + j, e); // save to E'(i+1,j)
	161	// now compute F'(i,j+1)
	162	f = _mm_subs_epu8(f, gape);
	163	f = _mm_max_epu8(f, h);
	164	// get H'(i-1,j) and prepare for the next j
	165	h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
	166	}
	167	// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
	168	for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
	169	f = _mm_slli_si128(f, 1);
	170	for (j = 0; LIKELY(j < slen); ++j) {
	171	h = _mm_load_si128(H1 + j);
	172	h = _mm_max_epu8(h, f); // h=H'(i,j)
	173	_mm_store_si128(H1 + j, h);
	174	h = _mm_subs_epu8(h, gapoe);
	175	f = _mm_subs_epu8(f, gape);
	176	cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
	177	if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
	178	}
	179	}
	180	end_loop16:
	181	//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
	182	__max_16(imax, max); // imax is the maximum number in max
	183	if (imax >= minsc) { // write the b array; this condition adds branching unfornately
	184	if (n_b == 0 \|\| (int32_t)b[n_b-1] + 1 != i) { // then append
	185	if (n_b == m_b) {
	186	m_b = m_b? m_b<<1 : 8;
	187	b = (uint64_t)realloc(b, 8 m_b);
	188	}
	189	b[n_b++] = (uint64_t)imax<<32 \| i;
	190	} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 \| i; // modify the last
	191	}
	192	if (imax > gmax) {
	193	gmax = imax; te = i; // te is the end position on the target
	194	for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
	195	_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
	196	if (gmax + q->shift >= 255 \|\| gmax >= endsc) break;
	197	}
	198	S = H1; H1 = H0; H0 = S; // swap H0 and H1
	199	}
	200	r.score = gmax + q->shift < 255? gmax : 255;
	201	r.te = te;
	202	if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
	203	int max = -1, low, high, qlen = slen * 16;
	204	uint8_t t = (uint8_t)Hmax;
	205	for (i = 0; i < qlen; ++i, ++t)
	206	if ((int)t > max) max = t, r.qe = i / 16 + i % 16 * slen;
	207	//printf("%d,%d\n", max, gmax);
	208	if (b) {
	209	i = (r.score + q->max - 1) / q->max;
	210	low = te - i; high = te + i;
	211	for (i = 0; i < n_b; ++i) {
	212	int e = (int32_t)b[i];
	213	if ((e < low \|\| e > high) && b[i]>>32 > (uint32_t)r.score2)
	214	r.score2 = b[i]>>32, r.te2 = e;
	215	}
	216	}
	217	}
	218	free(b);
	219	return r;
	220	}
	221
	222	kswr_t ksw_i16(kswq_t q, int tlen, const uint8_t target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
	223	{
	224	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
	225	uint64_t *b;
	226	__m128i zero, gapoe, gape, H0, H1, E, Hmax;
	227	kswr_t r;
	228
	229	#define __max_8(ret, xx) do { \
	230	(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
	231	(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
	232	(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
	233	(ret) = _mm_extract_epi16((xx), 0); \
	234	} while (0)
	235
	236	// initialization
	237	r = g_defr;
	238	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
	239	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
	240	m_b = n_b = 0; b = 0;
	241	zero = _mm_set1_epi32(0);
	242	gapoe = _mm_set1_epi16(_gapo + _gape);
	243	gape = _mm_set1_epi16(_gape);
	244	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
	245	slen = q->slen;
	246	for (i = 0; i < slen; ++i) {
	247	_mm_store_si128(E + i, zero);
	248	_mm_store_si128(H0 + i, zero);
	249	_mm_store_si128(Hmax + i, zero);
	250	}
	251	// the core loop
	252	for (i = 0; i < tlen; ++i) {
	253	int j, k, imax;
	254	__m128i e, h, f = zero, max = zero, S = q->qp + target[i] slen; // s is the 1st score vector
	255	h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
	256	h = _mm_slli_si128(h, 2);
	257	for (j = 0; LIKELY(j < slen); ++j) {
	258	h = _mm_adds_epi16(h, *S++);
	259	e = _mm_load_si128(E + j);
	260	h = _mm_max_epi16(h, e);
	261	h = _mm_max_epi16(h, f);
	262	max = _mm_max_epi16(max, h);
	263	_mm_store_si128(H1 + j, h);
	264	h = _mm_subs_epu16(h, gapoe);
	265	e = _mm_subs_epu16(e, gape);
	266	e = _mm_max_epi16(e, h);
	267	_mm_store_si128(E + j, e);
	268	f = _mm_subs_epu16(f, gape);
	269	f = _mm_max_epi16(f, h);
	270	h = _mm_load_si128(H0 + j);
	271	}
	272	for (k = 0; LIKELY(k < 16); ++k) {
	273	f = _mm_slli_si128(f, 2);
	274	for (j = 0; LIKELY(j < slen); ++j) {
	275	h = _mm_load_si128(H1 + j);
	276	h = _mm_max_epi16(h, f);
	277	_mm_store_si128(H1 + j, h);
	278	h = _mm_subs_epu16(h, gapoe);
	279	f = _mm_subs_epu16(f, gape);
	280	if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
	281	}
	282	}
	283	end_loop8:
	284	__max_8(imax, max);
	285	if (imax >= minsc) {
	286	if (n_b == 0 \|\| (int32_t)b[n_b-1] + 1 != i) {
	287	if (n_b == m_b) {
	288	m_b = m_b? m_b<<1 : 8;
	289	b = (uint64_t)realloc(b, 8 m_b);
	290	}
	291	b[n_b++] = (uint64_t)imax<<32 \| i;
	292	} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 \| i; // modify the last
	293	}
	294	if (imax > gmax) {
	295	gmax = imax; te = i;
	296	for (j = 0; LIKELY(j < slen); ++j)
	297	_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
	298	if (gmax >= endsc) break;
	299	}
	300	S = H1; H1 = H0; H0 = S;
	301	}
	302	r.score = gmax; r.te = te;
	303	{
	304	int max = -1, low, high, qlen = slen * 8;
	305	uint16_t t = (uint16_t)Hmax;
	306	for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
	307	if ((int)t > max) max = t, r.qe = i / 8 + i % 8 * slen;
	308	if (b) {
	309	i = (r.score + q->max - 1) / q->max;
	310	low = te - i; high = te + i;
	311	for (i = 0; i < n_b; ++i) {
	312	int e = (int32_t)b[i];
	313	if ((e < low \|\| e > high) && b[i]>>32 > (uint32_t)r.score2)
	314	r.score2 = b[i]>>32, r.te2 = e;
	315	}
	316	}
	317	}
	318	free(b);
	319	return r;
	320	}
	321
	/* Reverse the first l bytes of s in place; a no-op when l < 2. */
	static void revseq(int l, uint8_t *s)
	{
		int lo = 0, hi = l - 1;
		while (lo < hi) { /* swap the outermost pair and move inwards */
			uint8_t tmp = s[lo];
			s[lo] = s[hi];
			s[hi] = tmp;
			++lo; --hi;
		}
	}
	328
	329	kswr_t ksw_align(int qlen, uint8_t query, int tlen, uint8_t target, int m, const int8_t mat, int gapo, int gape, int xtra, kswq_t *qry)
	330	{
	331	int size;
	332	kswq_t *q;
	333	kswr_t r, rr;
	334	kswr_t (func)(kswq_t, int, const uint8_t*, int, int, int);
	335
	336	q = (qry && qry)? qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
	337	if (qry && qry == 0) qry = q;
	338	func = q->size == 2? ksw_i16 : ksw_u8;
	339	size = q->size;
	340	r = func(q, tlen, target, gapo, gape, xtra);
	341	if (qry == 0) free(q);
	342	if ((xtra&KSW_XSTART) == 0 \|\| ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
	343	revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
	344	q = ksw_qinit(size, r.qe + 1, query, m, mat);
	345	rr = func(q, tlen, target, gapo, gape, KSW_XSTOP \| r.score);
	346	revseq(r.qe + 1, query); revseq(r.te + 1, target);
	347	free(q);
	348	if (r.score == rr.score)
	349	r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
	350	return r;
	351	}

+71

-0

third_party/fermi-lite-0.1/ksw.h less more

	0	#ifndef __AC_KSW_H
	1	#define __AC_KSW_H
	2
	3	#include <stdint.h>
	4
	5	#define KSW_XBYTE 0x10000
	6	#define KSW_XSTOP 0x20000
	7	#define KSW_XSUBO 0x40000
	8	#define KSW_XSTART 0x80000
	9
	10	struct _kswq_t;
	11	typedef struct _kswq_t kswq_t;
	12
	13	typedef struct {
	14	int score; // best score
	15	int te, qe; // target end and query end
	16	int score2, te2; // second best score and ending position on the target
	17	int tb, qb; // target start and query start
	18	} kswr_t;
	19
	20	#ifdef __cplusplus
	21	extern "C" {
	22	#endif
	23
	24	/**
	25	* Aligning two sequences
	26	*
	27	* @param qlen length of the query sequence (typically <tlen)
	28	* @param query query sequence with 0 <= query[i] < m
	29	* @param tlen length of the target sequence
	30	* @param target target sequence
	31	* @param m number of residue types
	32	* @param mat m*m scoring matrix in one-dimensional array
	33	* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
	34	* @param gape gap extension penalty
	35	* @param xtra extra information (see below)
	36	* @param qry query profile (see below)
	37	*
	38	* @return alignment information in a struct; unset values to -1
	39	*
	40	* When xtra==0, ksw_align() uses a signed two-byte integer to store a
	41	* score and only finds the best score and the end positions. The 2nd best
	42	* score or the start positions are not attempted. The default behavior can
	43	* be tuned by setting KSW_X* flags:
	44	*
	45	* KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs,
	46	* kswr_t::score will be set to 255
	47	*
	48	* KSW_XSUBO: track the 2nd best score and the ending position on the
	49	* target if the 2nd best is higher than (xtra&0xffff)
	50	*
	51	* KSW_XSTOP: stop if the maximum score is above (xtra&0xffff)
	52	*
	53	* KSW_XSTART: find the start positions
	54	*
	55	* When *qry==NULL, ksw_align() will compute and allocate the query profile
	56	* and when the function returns, *qry will point to the profile, which can
	57	* be deallocated simply by free(). If one query is aligned against multiple
	58	* target sequences, *qry should be set to NULL during the first call and
	59	* freed after the last call. Note that qry can equal 0. In this case, the
	60	* query profile will be deallocated in ksw_align().
	61	*/
	62	kswr_t ksw_align(int qlen, uint8_t query, int tlen, uint8_t target, int m, const int8_t mat, int gapo, int gape, int xtra, kswq_t *qry);
	63
	64	kswq_t ksw_qinit(int size, int qlen, const uint8_t query, int m, const int8_t *mat);
	65
	66	#ifdef __cplusplus
	67	}
	68	#endif
	69
	70	#endif

+65

-0

third_party/fermi-lite-0.1/kthread.c less more

	0	#include <pthread.h>
	1	#include <stdlib.h>
	2	#include <limits.h>
	3
	4	/************
	5	* kt_for() *
	6	************/
	7
	struct kt_for_t;

	/* Per-thread state: a back pointer to the shared job and the next index
	 * this worker will take. */
	typedef struct {
		struct kt_for_t *t;
		long i;
	} ktf_worker_t;

	/* Shared description of one kt_for() job. */
	typedef struct kt_for_t {
		int n_threads;
		long n;                        /* number of work items */
		ktf_worker_t *w;               /* one entry per thread */
		void (*func)(void*,long,int);  /* called as func(data, item, thread index) */
		void *data;
	} kt_for_t;
	22
	23	static inline long steal_work(kt_for_t *t)
	24	{
	25	int i, min_i = -1;
	26	long k, min = LONG_MAX;
	27	for (i = 0; i < t->n_threads; ++i)
	28	if (min > t->w[i].i) min = t->w[i].i, min_i = i;
	29	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
	30	return k >= t->n? -1 : k;
	31	}
	32
	33	static void ktf_worker(void data)
	34	{
	35	ktf_worker_t w = (ktf_worker_t)data;
	36	long i;
	37	for (;;) {
	38	i = __sync_fetch_and_add(&w->i, w->t->n_threads);
	39	if (i >= w->t->n) break;
	40	w->t->func(w->t->data, i, w - w->t->w);
	41	}
	42	while ((i = steal_work(w->t)) >= 0)
	43	w->t->func(w->t->data, i, w - w->t->w);
	44	pthread_exit(0);
	45	}
	46
	47	void kt_for(int n_threads, void (func)(void,long,int), void *data, long n)
	48	{
	49	if (n_threads > 1) {
	50	int i;
	51	kt_for_t t;
	52	pthread_t *tid;
	53	t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
	54	t.w = (ktf_worker_t)alloca(n_threads sizeof(ktf_worker_t));
	55	tid = (pthread_t)alloca(n_threads sizeof(pthread_t));
	56	for (i = 0; i < n_threads; ++i)
	57	t.w[i].t = &t, t.w[i].i = i;
	58	for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
	59	for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
	60	} else {
	61	long j;
	62	for (j = 0; j < n; ++j) func(data, j, 0);
	63	}
	64	}

+110

-0

third_party/fermi-lite-0.1/kvec.h less more

	0	/* The MIT License
	1
	2	Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	/*
	26	An example:
	27
	28	#include "kvec.h"
	29	int main() {
	30	kvec_t(int) array;
	31	kv_init(array);
	32	kv_push(int, array, 10); // append
	33	kv_a(int, array, 20) = 5; // dynamic
	34	kv_A(array, 20) = 4; // static
	35	kv_destroy(array);
	36	return 0;
	37	}
	38	*/
	39
	40	/*
	41	2008-09-22 (0.1.0):
	42
	43	* The initial version.
	44
	45	*/
	46
	47	#ifndef AC_KVEC_H
	48	#define AC_KVEC_H
	49
	50	#include <stdlib.h>
	51
	/* Round x up, in place, to the next power of two (32-bit bit smearing). */
	#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))

	/* A vector is an anonymous struct: n = elements in use, m = capacity. */
	#define kvec_t(type) struct { size_t n, m; type *a; }
	#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
	#define kv_destroy(v) free((v).a)
	#define kv_A(v, i) ((v).a[(i)])
	#define kv_pop(v) ((v).a[--(v).n])
	#define kv_size(v) ((v).n)
	#define kv_max(v) ((v).m)

	/* Grow the capacity to at least s (rounded up to a power of two). */
	#define kv_resize(type, v, s) do { \
			if ((v).m < (s)) { \
				(v).m = (s); \
				kv_roundup32((v).m); \
				(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
			} \
		} while (0)

	/* Copy the elements of v0 into v1, growing v1 when needed.
	 * Uses memcpy(), so the including file must provide <string.h>. */
	#define kv_copy(type, v1, v0) do { \
			if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
			(v1).n = (v0).n; \
			memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
		} while (0)

	/* Append x; the capacity doubles (starting at 2) when full. */
	#define kv_push(type, v, x) do { \
			if ((v).n == (v).m) { \
				(v).m = (v).m? (v).m<<1 : 2; \
				(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
			} \
			(v).a[(v).n++] = (x); \
		} while (0)

	/* Append one uninitialised slot and store its address in *p. */
	#define kv_pushp(type, v, p) do { \
			if ((v).n == (v).m) { \
				(v).m = (v).m? (v).m<<1 : 2; \
				(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
			} \
			*(p) = &(v).a[(v).n++]; \
		} while (0)

	/* Element i as an assignment target, growing the vector on demand.
	 * Afterwards n is at least i + 1, so element i is always inside the
	 * vector. The expansion is a bare comma expression: use it as a
	 * statement-level assignment target, as in the example above. */
	#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \
			((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
			 (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
			: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
			: 0), (v).a[(i)]

	/* Reverse the elements from index `start` to the end, in place. */
	#define kv_reverse(type, v, start) do { \
			if ((v).m > 0 && (v).n > (start)) { \
				size_t __i, __end = (v).n - (start); \
				type *__a = (v).a + (start); \
				for (__i = 0; __i < __end>>1; ++__i) { \
					type __t = __a[__end - 1 - __i]; \
					__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
				} \
			} \
		} while (0)
	108
	109	#endif

+619

-0

third_party/fermi-lite-0.1/mag.c less more

	0	/* remaining problems:
	1
	2	1. multiedges due to tandem repeats
	3	*/
	4
	5	#include <math.h>
	6	#include <zlib.h>
	7	#include <stdio.h>
	8	#include <assert.h>
	9	#include "mag.h"
	10	#include "kvec.h"
	11	#include "internal.h"
	12	#include "kseq.h"
	13	KSEQ_DECLARE(gzFile)
	14
	15	#include "khash.h"
	16	KHASH_INIT2(64,, khint64_t, uint64_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
	17
	18	typedef khash_t(64) hash64_t;
	19
	/* Sort predicates for ku128_t. xlt: ascending x, ties broken by larger y
	 * first. ylt: descending y, compared as signed 64-bit values. */
	#define ku128_xlt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y > (b).y))
	#define ku128_ylt(a, b) ((int64_t)(a).y > (int64_t)(b).y)
	22	#include "ksort.h"
	23	KSORT_INIT(128x, ku128_t, ku128_xlt)
	24	KSORT_INIT(128y, ku128_t, ku128_ylt)
	25	KSORT_INIT_GENERIC(uint64_t)
	26
	/* Deleted-arc marker: x == (uint64_t)-2 and y == 0. An arc counts as
	 * deleted as soon as either of the two conditions holds. */
	#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
	#define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0)
	29
	30	static int fm_verbose = 1;
	31
	32	/*********************
	33	* Vector operations *
	34	*********************/
	35
	36	static inline void v128_clean(ku128_v *r)
	37	{
	38	int i, j;
	39	for (i = j = 0; i < r->n; ++i)
	40	if (!edge_is_del(r->a[i])) { // keep this arc
	41	if (j != i) r->a[j++] = r->a[i];
	42	else ++j;
	43	}
	44	r->n = j;
	45	}
	46
	/* Exported (non-static) entry point that forwards to v128_clean(). */
	47	void mag_v128_clean(ku128_v *r)
	48	{
	49	v128_clean(r);
	50	}
	51
	52	static inline void v128_rmdup(ku128_v *r)
	53	{
	54	int l, cnt;
	55	uint64_t x;
	56	if (r->n > 1) ks_introsort(128x, r->n, r->a);
	57	for (l = cnt = 0; l < r->n; ++l) // jump to the first node to be retained
	58	if (edge_is_del(r->a[l])) ++cnt;
	59	else break;
	60	if (l == r->n) { // no good arcs
	61	r->n = 0;
	62	return;
	63	}
	64	x = r->a[l].x;
	65	for (++l; l < r->n; ++l) { // mark duplicated node
	66	if (edge_is_del(r->a[l]) \|\| r->a[l].x == x)
	67	edge_mark_del(r->a[l]), ++cnt;
	68	else x = r->a[l].x;
	69	}
	70	if (cnt) v128_clean(r);
	71	}
	72
	73	static inline void v128_cap(ku128_v *r, int max)
	74	{
	75	int i, thres;
	76	if (r->n <= max) return;
	77	ks_introsort(128y, r->n, r->a);
	78	thres = r->a[max].y;
	79	for (i = 0; i < r->n; ++i)
	80	if (r->a[i].y == thres) break;
	81	r->n = i;
	82	}
	83
	84	/*************************************************
	85	* Mapping between vertex id and interval end id *
	86	*************************************************/
	87
	88	void mag_g_build_hash(mag_t *g)
	89	{
	90	long i;
	91	int j, ret;
	92	hash64_t *h;
	93	h = kh_init(64);
	94	for (i = 0; i < g->v.n; ++i) {
	95	const magv_t *p = &g->v.a[i];
	96	for (j = 0; j < 2; ++j) {
	97	khint_t k = kh_put(64, h, p->k[j], &ret);
	98	if (ret == 0) {
	99	if (fm_verbose >= 2)
	100	fprintf(stderr, "[W::%s] terminal %ld is duplicated.\n", __func__, (long)p->k[j]);
	101	kh_val(h, k) = (uint64_t)-1;
	102	} else kh_val(h, k) = i<<1\|j;
	103	}
	104	}
	105	g->h = h;
	106	}
	107
	108	static inline uint64_t tid2idd(hash64_t *h, uint64_t tid)
	109	{
	110	khint_t k = kh_get(64, h, tid);
	111	assert(k != kh_end(h));
	112	return kh_val(h, k);
	113	}
	114
	/* Same lookup as tid2idd(), with the hash passed as an untyped pointer. */
	115	uint64_t mag_tid2idd(void *h, uint64_t tid) // exported version
	116	{
	117	return tid2idd(h, tid);
	118	}
	119
	120	void mag_g_amend(mag_t *g)
	121	{
	122	int i, j, l, ll;
	123	for (i = 0; i < g->v.n; ++i) {
	124	magv_t *p = &g->v.a[i];
	125	ku128_v *r;
	126	for (j = 0; j < 2; ++j) {
	127	for (l = 0; l < p->nei[j].n; ++l) {
	128	khint_t k;
	129	uint64_t z, x = p->nei[j].a[l].x;
	130	k = kh_get(64, g->h, x);
	131	if (k == kh_end((hash64_t*)g->h)) { // neighbor is not in the hash table; likely due to tip removal
	132	edge_mark_del(p->nei[j].a[l]);
	133	continue;
	134	} else z = kh_val((hash64_t*)g->h, k);
	135	r = &g->v.a[z>>1].nei[z&1];
	136	for (ll = 0, z = p->k[j]; ll < r->n; ++ll)
	137	if (r->a[ll].x == z) break;
	138	if (ll == r->n) // not in neighbor's neighor
	139	edge_mark_del(p->nei[j].a[l]);
	140	}
	141	v128_rmdup(&p->nei[j]);
	142	}
	143	}
	144	}
	145
	146	/*********************************
	147	* Graph I/O initialization etc. *
	148	*********************************/
	149
	150	void mag_v_write(const magv_t p, kstring_t out)
	151	{
	152	int j, k;
	153	if (p->len <= 0) return;
	154	out->l = 0;
	155	kputc('@', out); kputl(p->k[0], out); kputc(':', out); kputl(p->k[1], out);
	156	kputc('\t', out); kputw(p->nsr, out);
	157	for (j = 0; j < 2; ++j) {
	158	const ku128_v *r = &p->nei[j];
	159	kputc('\t', out);
	160	for (k = 0; k < r->n; ++k) {
	161	if (edge_is_del(r->a[k])) continue;
	162	kputl(r->a[k].x, out); kputc(',', out); kputw((int32_t)r->a[k].y, out);
	163	kputc(';', out);
	164	}
	165	if (p->nei[j].n == 0) kputc('.', out);
	166	}
	167	kputc('\n', out);
	168	ks_resize(out, out->l + 2 * p->len + 5);
	169	for (j = 0; j < p->len; ++j)
	170	out->s[out->l++] = "ACGT"[(int)p->seq[j] - 1];
	171	out->s[out->l] = 0;
	172	kputsn("\n+\n", 3, out);
	173	kputsn(p->cov, p->len, out);
	174	kputc('\n', out);
	175	}
	176
	177	void mag_g_print(const mag_t *g)
	178	{
	179	int i;
	180	kstring_t out;
	181	out.l = out.m = 0; out.s = 0;
	182	for (i = 0; i < g->v.n; ++i) {
	183	if (g->v.a[i].len < 0) continue;
	184	mag_v_write(&g->v.a[i], &out);
	185	fwrite(out.s, 1, out.l, stdout);
	186	}
	187	free(out.s);
	188	fflush(stdout);
	189	}
	190
	191	/**************************
	192	* Basic graph operations *
	193	**************************/
	194
	195	void mag_v_destroy(magv_t *v)
	196	{
	197	free(v->nei[0].a); free(v->nei[1].a);
	198	free(v->seq); free(v->cov);
	199	memset(v, 0, sizeof(magv_t));
	200	v->len = -1;
	201	}
	202
	203	void mag_g_destroy(mag_t *g)
	204	{
	205	int i;
	206	kh_destroy(64, g->h);
	207	for (i = 0; i < g->v.n; ++i)
	208	mag_v_destroy(&g->v.a[i]);
	209	free(g->v.a);
	210	free(g);
	211	}
	212
	213	void mag_v_copy_to_empty(magv_t dst, const magv_t src) // NB: memory leak if dst is allocated
	214	{
	215	memcpy(dst, src, sizeof(magv_t));
	216	dst->max_len = dst->len + 1;
	217	kroundup32(dst->max_len);
	218	dst->seq = calloc(dst->max_len, 1); memcpy(dst->seq, src->seq, src->len);
	219	dst->cov = calloc(dst->max_len, 1); memcpy(dst->cov, src->cov, src->len);
	220	kv_init(dst->nei[0]); kv_copy(ku128_t, dst->nei[0], src->nei[0]);
	221	kv_init(dst->nei[1]); kv_copy(ku128_t, dst->nei[1], src->nei[1]);
	222	}
	223
	224	void mag_eh_add(mag_t *g, uint64_t u, uint64_t v, int ovlp) // add v to u
	225	{
	226	ku128_v *r;
	227	ku128_t *q;
	228	uint64_t idd;
	229	int i;
	230	if ((int64_t)u < 0) return;
	231	idd = tid2idd(g->h, u);
	232	r = &g->v.a[idd>>1].nei[idd&1];
	233	for (i = 0; i < r->n; ++i) // no multi-edges
	234	if (r->a[i].x == v) return;
	235	kv_pushp(ku128_t, *r, &q);
	236	q->x = v; q->y = ovlp;
	237	}
	238
	239	void mag_eh_markdel(mag_t *g, uint64_t u, uint64_t v) // mark deletion of v from u
	240	{
	241	int i;
	242	uint64_t idd;
	243	if ((int64_t)u < 0) return;
	244	idd = tid2idd(g->h, u);
	245	ku128_v *r = &g->v.a[idd>>1].nei[idd&1];
	246	for (i = 0; i < r->n; ++i)
	247	if (r->a[i].x == v) edge_mark_del(r->a[i]);
	248	}
	249
	250	void mag_v_del(mag_t g, magv_t p)
	251	{
	252	int i, j;
	253	khint_t k;
	254	if (p->len < 0) return;
	255	for (i = 0; i < 2; ++i) {
	256	ku128_v *r = &p->nei[i];
	257	for (j = 0; j < r->n; ++j)
	258	if (!edge_is_del(r->a[j]) && r->a[j].x != p->k[0] && r->a[j].x != p->k[1])
	259	mag_eh_markdel(g, r->a[j].x, p->k[i]);
	260	}
	261	for (i = 0; i < 2; ++i) {
	262	k = kh_get(64, g->h, p->k[i]);
	263	kh_del(64, g->h, k);
	264	}
	265	mag_v_destroy(p);
	266	}
	267
	268	void mag_v_transdel(mag_t g, magv_t p, int min_ovlp)
	269	{
	270	if (p->nei[0].n && p->nei[1].n) {
	271	int i, j, ovlp;
	272	for (i = 0; i < p->nei[0].n; ++i) {
	273	if (edge_is_del(p->nei[0].a[i]) \|\| p->nei[0].a[i].x == p->k[0] \|\| p->nei[0].a[i].x == p->k[1]) continue; // due to p->p loop
	274	for (j = 0; j < p->nei[1].n; ++j) {
	275	if (edge_is_del(p->nei[1].a[j]) \|\| p->nei[1].a[j].x == p->k[0] \|\| p->nei[1].a[j].x == p->k[1]) continue;
	276	ovlp = (int)(p->nei[0].a[i].y + p->nei[1].a[j].y) - p->len;
	277	if (ovlp >= min_ovlp) {
	278	mag_eh_add(g, p->nei[0].a[i].x, p->nei[1].a[j].x, ovlp);
	279	mag_eh_add(g, p->nei[1].a[j].x, p->nei[0].a[i].x, ovlp);
	280	}
	281	}
	282	}
	283	}
	284	mag_v_del(g, p);
	285	}
	286
	287	void mag_v_flip(mag_t g, magv_t p)
	288	{
	289	ku128_v t;
	290	khint_t k;
	291	hash64_t h = (hash64_t)g->h;
	292
	293	seq_revcomp6(p->len, (uint8_t*)p->seq);
	294	seq_reverse(p->len, (uint8_t*)p->cov);
	295	p->k[0] ^= p->k[1]; p->k[1] ^= p->k[0]; p->k[0] ^= p->k[1];
	296	t = p->nei[0]; p->nei[0] = p->nei[1]; p->nei[1] = t;
	297	k = kh_get(64, h, p->k[0]);
	298	assert(k != kh_end(h));
	299	kh_val(h, k) ^= 1;
	300	k = kh_get(64, h, p->k[1]);
	301	assert(k != kh_end(h));
	302	kh_val(h, k) ^= 1;
	303	}
	304
	305	/*********************
	306	* Unambiguous merge *
	307	*********************/
	308
	309	int mag_vh_merge_try(mag_t g, magv_t p, int min_merge_len) // merge p's neighbor to the right-end of p
	310	{
	311	magv_t *q;
	312	khint_t kp, kq;
	313	int i, j, new_l;
	314	hash64_t h = (hash64_t)g->h;
	315
	316	// check if an unambiguous merge can be performed
	317	if (p->nei[1].n != 1) return -1; // multiple or no neighbor; do not merge
	318	if ((int64_t)p->nei[1].a[0].x < 0) return -2; // deleted neighbor
	319	if ((int)p->nei[1].a[0].y < min_merge_len) return -5;
	320	kq = kh_get(64, g->h, p->nei[1].a[0].x);
	321	assert(kq != kh_end(h)); // otherwise the neighbor is non-existant
	322	q = &g->v.a[kh_val((hash64_t*)g->h, kq)>>1];
	323	if (p == q) return -3; // we have a loop p->p. We cannot merge in this case
	324	if (q->nei[kh_val(h, kq)&1].n != 1) return -4; // the neighbor q has multiple neighbors. cannot be an unambiguous merge
	325
	326	// we can perform a merge; do further consistency check (mostly check bugs)
	327	if (kh_val(h, kq)&1) mag_v_flip(g, q); // a "><" bidirectional arc; flip q
	328	kp = kh_get(64, g->h, p->k[1]); assert(kp != kh_end(h)); // get the iterator to p
	329	kh_del(64, g->h, kp); kh_del(64, g->h, kq); // remove the two ends of the arc in the hash table
	330	assert(p->k[1] == q->nei[0].a[0].x && q->k[0] == p->nei[1].a[0].x); // otherwise inconsistent topology
	331	assert(p->nei[1].a[0].y == q->nei[0].a[0].y); // the overlap length must be the same
	332	assert(p->len >= p->nei[1].a[0].y && q->len >= p->nei[1].a[0].y); // and the overlap is shorter than both vertices
	333
	334	// update the read count and sequence length
	335	p->nsr += q->nsr;
	336	new_l = p->len + q->len - p->nei[1].a[0].y;
	337	if (new_l + 1 > p->max_len) { // then double p->seq and p->cov
	338	p->max_len = new_l + 1;
	339	kroundup32(p->max_len);
	340	p->seq = realloc(p->seq, p->max_len);
	341	p->cov = realloc(p->cov, p->max_len);
	342	}
	343	// merge seq and cov
	344	for (i = p->len - p->nei[1].a[0].y, j = 0; j < q->len; ++i, ++j) { // write seq and cov
	345	p->seq[i] = q->seq[j];
	346	if (i < p->len) {
	347	if ((int)p->cov[i] + (q->cov[j] - 33) > 126) p->cov[i] = 126;
	348	else p->cov[i] += q->cov[j] - 33;
	349	} else p->cov[i] = q->cov[j];
	350	}
	351	p->seq[new_l] = p->cov[new_l] = 0;
	352	p->len = new_l;
	353	// merge neighbors
	354	free(p->nei[1].a);
	355	p->nei[1] = q->nei[1]; p->k[1] = q->k[1];
	356	q->nei[1].a = 0; // to avoid freeing p->nei[1] by mag_v_destroy() below
	357	// update the hash table for the right end of p
	358	kp = kh_get(64, g->h, p->k[1]);
	359	assert(kp != kh_end((hash64_t*)g->h));
	360	kh_val(h, kp) = (p - g->v.a)<<1 \| 1;
	361	// clean up q
	362	mag_v_destroy(q);
	363	return 0;
	364	}
	365
	366	void mag_g_merge(mag_t *g, int rmdup, int min_merge_len)
	367	{
	368	int i;
	369	uint64_t n = 0;
	370	for (i = 0; i < g->v.n; ++i) { // remove multiedges; FIXME: should we do that?
	371	if (rmdup) {
	372	v128_rmdup(&g->v.a[i].nei[0]);
	373	v128_rmdup(&g->v.a[i].nei[1]);
	374	} else {
	375	v128_clean(&g->v.a[i].nei[0]);
	376	v128_clean(&g->v.a[i].nei[1]);
	377	}
	378	}
	379	for (i = 0; i < g->v.n; ++i) {
	380	magv_t *p = &g->v.a[i];
	381	if (p->len < 0) continue;
	382	while (mag_vh_merge_try(g, p, min_merge_len) == 0) ++n;
	383	mag_v_flip(g, p);
	384	while (mag_vh_merge_try(g, p, min_merge_len) == 0) ++n;
	385	}
	386	if (fm_verbose >= 3)
	387	fprintf(stderr, "[M::%s] unambiguously merged %ld pairs of vertices\n", __func__, (long)n);
	388	}
	389
	390	/*****************************
	391	* Easy graph simplification *
	392	*****************************/
	393
	394	typedef magv_t *magv_p;
	395
	396	#define mag_vlt1(a, b) ((a)->nsr < (b)->nsr \|\| ((a)->nsr == (b)->nsr && (a)->len < (b)->len))
	397	KSORT_INIT(vlt1, magv_p, mag_vlt1)
	398
	399	#define mag_vlt2(a, b) ((a)->nei[0].n + (a)->nei[1].n < (b)->nei[0].n + (b)->nei[1].n)
	400	KSORT_INIT(vlt2, magv_p, mag_vlt2)
	401
	402	int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr)
	403	{
	404	int i;
	405	kvec_t(magv_p) a = {0,0,0};
	406
	407	for (i = 0; i < g->v.n; ++i) {
	408	magv_t *p = &g->v.a[i];
	409	if (p->len < 0 \|\| (p->nei[0].n > 0 && p->nei[1].n > 0)) continue;
	410	if (p->len >= min_len \|\| p->nsr >= min_nsr) continue;
	411	kv_push(magv_p, a, p);
	412	}
	413	ks_introsort(vlt1, a.n, a.a);
	414	for (i = 0; i < a.n; ++i) mag_v_del(g, a.a[i]);
	415	free(a.a);
	416	if (fm_verbose >= 3)
	417	fprintf(stderr, "[M::%s] removed %ld tips (min_len=%d, min_nsr=%d)\n", __func__, a.n, min_len, min_nsr);
	418	return a.n;
	419	}
	420
	421	int mag_g_rm_vint(mag_t *g, int min_len, int min_nsr, int min_ovlp)
	422	{
	423	int i;
	424	kvec_t(magv_p) a = {0,0,0};
	425
	426	for (i = 0; i < g->v.n; ++i) {
	427	magv_t *p = &g->v.a[i];
	428	if (p->len >= 0 && p->len < min_len && p->nsr < min_nsr)
	429	kv_push(magv_p, a, p);
	430	}
	431	ks_introsort(vlt1, a.n, a.a);
	432	for (i = 0; i < a.n; ++i) mag_v_transdel(g, a.a[i], min_ovlp);
	433	free(a.a);
	434	if (fm_verbose >= 3)
	435	fprintf(stderr, "[M::%s] removed %ld internal vertices (min_len=%d, min_nsr=%d)\n", __func__, a.n, min_len, min_nsr);
	436	return a.n;
	437	}
	438
	439	void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr)
	440	{
	441	int i, j, k;
	442	kvec_t(magv_p) a = {0,0,0};
	443	uint64_t n_marked = 0;
	444
	445	for (i = 0; i < g->v.n; ++i) {
	446	magv_t *p = &g->v.a[i];
	447	if (p->len < 0) continue;
	448	if ((p->nei[0].n == 0 \|\| p->nei[1].n == 0) && p->len < min_len && p->nsr < min_nsr)
	449	continue; // skip tips
	450	kv_push(magv_p, a, p);
	451	}
	452	ks_introsort(vlt1, a.n, a.a);
	453
	454	for (i = a.n - 1; i >= 0; --i) {
	455	magv_t *p = a.a[i];
	456	for (j = 0; j < 2; ++j) {
	457	ku128_v *r = &p->nei[j];
	458	int max_ovlp = min_ovlp, max_k = -1;
	459	if (r->n == 0) continue; // no overlapping reads
	460	for (k = 0; k < r->n; ++k) // get the max overlap length
	461	if (max_ovlp < r->a[k].y)
	462	max_ovlp = r->a[k].y, max_k = k;
	463	if (max_k >= 0) { // test if max_k is a tip
	464	uint64_t x = tid2idd(g->h, r->a[max_k].x);
	465	magv_t *q = &g->v.a[x>>1];
	466	if (q->len >= 0 && (q->nei[0].n == 0 \|\| q->nei[1].n == 0) && q->len < min_len && q->nsr < min_nsr)
	467	max_ovlp = min_ovlp;
	468	}
	469	for (k = 0; k < r->n; ++k) {
	470	if (edge_is_del(r->a[k])) continue;
	471	if (r->a[k].y < min_ovlp \|\| (double)r->a[k].y / max_ovlp < min_ratio) {
	472	mag_eh_markdel(g, r->a[k].x, p->k[j]); // FIXME: should we check if r->a[k] is p itself?
	473	edge_mark_del(r->a[k]);
	474	++n_marked;
	475	}
	476	}
	477	}
	478	}
	479	free(a.a);
	480	if (fm_verbose >= 3)
	481	fprintf(stderr, "[M::%s] removed %ld edges\n", __func__, (long)n_marked);
	482	}
	483
	484	/*********************************************
	485	* A-statistics and simplistic flow analysis *
	486	*********************************************/
	487
	488	#define A_THRES 20.
	489	#define A_MIN_SUPP 5
	490
	491	double mag_cal_rdist(mag_t *g)
	492	{
	493	magv_v *v = &g->v;
	494	int j;
	495	uint64_t *srt;
	496	double rdist = -1.;
	497	int64_t i, sum_n_all, sum_n, sum_l;
	498
	499	srt = calloc(v->n, 8);
	500	for (i = 0, sum_n_all = 0; i < v->n; ++i) {
	501	srt[i] = (uint64_t)v->a[i].nsr<<32 \| i;
	502	sum_n_all += v->a[i].nsr;
	503	}
	504	ks_introsort_uint64_t(v->n, srt);
	505
	506	for (j = 0; j < 2; ++j) {
	507	sum_n = sum_l = 0;
	508	for (i = v->n - 1; i >= 0; --i) {
	509	const magv_t *p = &v->a[srt[i]<<32>>32];
	510	int tmp1, tmp2;
	511	tmp1 = tmp2 = 0;
	512	if (p->nei[0].n) ++tmp1, tmp2 += p->nei[0].a[0].y;
	513	if (p->nei[1].n) ++tmp1, tmp2 += p->nei[1].a[0].y;
	514	if (tmp1) tmp2 /= tmp1;
	515	if (rdist > 0.) {
	516	double A = (p->len - tmp1) / rdist - p->nsr * M_LN2;
	517	if (A < A_THRES) continue;
	518	}
	519	sum_n += p->nsr;
	520	sum_l += p->len - tmp1;
	521	if (sum_n >= sum_n_all * 0.5) break;
	522	}
	523	rdist = (double)sum_l / sum_n;
	524	}
	525	if (fm_verbose >= 3) {
	526	fprintf(stderr, "[M::%s] average read distance %.3f.\n", __func__, rdist);
	527	fprintf(stderr, "[M::%s] approximate genome size: %.0f (inaccurate!)\n", __func__, rdist * sum_n_all);
	528	}
	529
	530	free(srt);
	531	return rdist;
	532	}
	533
	534	/**************
	535	* Key portal *
	536	**************/
	537
	538	void mag_init_opt(magopt_t *o)
	539	{
	540	memset(o, 0, sizeof(magopt_t));
	541	o->trim_len = 0;
	542	o->trim_depth = 6;
	543
	544	o->min_elen = 300;
	545	o->min_ovlp = 0;
	546	o->min_merge_len = 0;
	547	o->min_ensr = 4;
	548	o->min_insr = 3;
	549	o->min_dratio1 = 0.7;
	550
	551	o->max_bcov = 10.;
	552	o->max_bfrac = 0.15;
	553	o->max_bvtx = 64;
	554	o->max_bdist = 512;
	555	}
	556
	557	void mag_g_clean(mag_t g, const magopt_t opt)
	558	{
	559	int j;
	560
	561	if (g->min_ovlp < opt->min_ovlp) g->min_ovlp = opt->min_ovlp;
	562	for (j = 2; j <= opt->min_ensr; ++j)
	563	mag_g_rm_vext(g, opt->min_elen, j);
	564	mag_g_merge(g, 0, opt->min_merge_len);
	565	mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr);
	566	mag_g_merge(g, 1, opt->min_merge_len);
	567	for (j = 2; j <= opt->min_ensr; ++j)
	568	mag_g_rm_vext(g, opt->min_elen, j);
	569	mag_g_merge(g, 0, opt->min_merge_len);
	570	if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen);
	571	if (!(opt->flag & MAG_F_NO_SIMPL)) mag_g_simplify_bubble(g, opt->max_bvtx, opt->max_bdist);
	572	mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->flag & MAG_F_AGGRESSIVE);
	573	mag_g_rm_vint(g, opt->min_elen, opt->min_insr, g->min_ovlp);
	574	mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr);
	575	mag_g_merge(g, 1, opt->min_merge_len);
	576	mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
	577	mag_g_merge(g, 0, opt->min_merge_len);
	578	if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen);
	579	mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
	580	mag_g_merge(g, 0, opt->min_merge_len);
	581	}
	582
	583	void mag_v_trim_open(mag_t g, magv_t v, int trim_len, int trim_depth)
	584	{
	585	int i, j, tl[2];
	586	if (v->nei[0].n > 0 && v->nei[1].n > 0) return; // no open end; do nothing
	587	if (v->nei[0].n == 0 && v->nei[1].n == 0 && v->len < trim_len * 3) { // disconnected short vertex
	588	mag_v_del(g, v);
	589	return;
	590	}
	591	for (j = 0; j < 2; ++j) {
	592	ku128_v *r = &v->nei[!j];
	593	int max_ovlp = 0;
	594	for (i = 0; i < r->n; ++i)
	595	max_ovlp = max_ovlp > r->a[i].y? max_ovlp : r->a[i].y;
	596	tl[j] = v->len - max_ovlp < trim_len? v->len - max_ovlp : trim_len;
	597	}
	598	if (v->nei[0].n == 0) {
	599	for (i = 0; i < tl[0] && v->cov[i] - 33 < trim_depth; ++i);
	600	tl[0] = i;
	601	v->len -= i;
	602	memmove(v->seq, v->seq + tl[0], v->len);
	603	memmove(v->cov, v->cov + tl[0], v->len);
	604	}
	605	if (v->nei[1].n == 0) {
	606	for (i = v->len - 1; i >= v->len - tl[1] && v->cov[i] - 33 < trim_depth; --i);
	607	tl[1] = v->len - 1 - i;
	608	v->len -= tl[1];
	609	}
	610	}
	611
	612	void mag_g_trim_open(mag_t g, const magopt_t opt)
	613	{
	614	int i;
	615	if (opt->trim_len == 0) return;
	616	for (i = 0; i < g->v.n; ++i)
	617	mag_v_trim_open(g, &g->v.a[i], opt->trim_len, opt->trim_depth);
	618	}

+69

-0

third_party/fermi-lite-0.1/mag.h less more

	0	#ifndef FM_MOG_H
	1	#define FM_MOG_H
	2
	3	#include <stdint.h>
	4	#include <stdlib.h>
	5	#include "kstring.h"
	6	#include "fml.h"
	7
	#ifndef KINT_DEF
	#define KINT_DEF
	/* A pair of 64-bit integers. */
	typedef struct {
		uint64_t x;
		uint64_t y;
	} ku128_t;

	/* Array of uint64_t; n and m are presumably the used and allocated element counts (kvec layout) -- confirm. */
	typedef struct {
		size_t n;
		size_t m;
		uint64_t *a;
	} ku64_v;

	/* Array of ku128_t with the same n/m/a layout. */
	typedef struct {
		size_t n;
		size_t m;
		ku128_t *a;
	} ku128_v;
	#endif
	14
	15	typedef struct {
	16	int len, nsr; // length; number supporting reads
	17	uint32_t max_len;// allocated seq/cov size
	18	uint64_t k[2]; // bi-interval
	19	ku128_v nei[2]; // neighbors
	20	char seq, cov; // sequence and coverage
	21	void *ptr; // additional information
	22	} magv_t;
	23
	24	typedef struct { size_t n, m; magv_t *a; } magv_v;
	25
	26	typedef struct mag_t {
	27	magv_v v;
	28	float rdist; // read distance
	29	int min_ovlp; // minimum overlap seen from the graph
	30	void *h;
	31	} mag_t;
	32
	33	struct mogb_aux;
	34	typedef struct mogb_aux mogb_aux_t;
	35
	36	#ifdef __cplusplus
	37	extern "C" {
	38	#endif
	39
	40	void mag_init_opt(magopt_t *o);
	41	void mag_g_clean(mag_t g, const magopt_t opt);
	42
	43	void mag_g_destroy(mag_t *g);
	44	void mag_g_amend(mag_t *g);
	45	void mag_g_build_hash(mag_t *g);
	46	void mag_g_print(const mag_t *g);
	47	int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr);
	48	void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr);
	49	void mag_g_merge(mag_t *g, int rmdup, int min_merge_len);
	50	void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist);
	51	void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive);
	52	void mag_g_pop_open(mag_t *g, int min_elen);
	53	void mag_g_trim_open(mag_t g, const magopt_t opt);
	54
	55	void mag_v_copy_to_empty(magv_t dst, const magv_t src); // NB: memory leak if dst is allocated
	56	void mag_v_del(mag_t g, magv_t p);
	57	void mag_v_write(const magv_t p, kstring_t out);
	58	void mag_v_pop_open(mag_t g, magv_t p, int min_elen);
	59
	60	uint64_t mag_tid2idd(void *h, uint64_t tid);
	61	void mag_v128_clean(ku128_v *r);
	62	double mag_cal_rdist(mag_t *g);
	63
	64	#ifdef __cplusplus
	65	}
	66	#endif
	67
	68	#endif

+274

-0

third_party/fermi-lite-0.1/misc.c less more

	0	#include <assert.h>
	1	#include "internal.h"
	2	#include "kstring.h"
	3	#include "rle.h"
	4	#include "mrope.h"
	5	#include "rld0.h"
	6	#include "mag.h"
	7	#include "kvec.h"
	8	#include "fml.h"
	9	#include "htab.h"
	10
	/*
	 * Byte -> nucleotide code lookup: A/a = 1, C/c = 2, G/g = 3, T/t = 4 and
	 * every other byte value = 5. One row per 16 byte values.
	 */
	unsigned char seq_nt6_table[256] = {
		/* 0x00 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x10 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x20 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x30 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x40 */ 5, 1, 5, 2,  5, 5, 5, 3,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x50 */ 5, 5, 5, 5,  4, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x60 */ 5, 1, 5, 2,  5, 5, 5, 3,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x70 */ 5, 5, 5, 5,  4, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x80 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0x90 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0xa0 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0xb0 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0xc0 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0xd0 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0xe0 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
		/* 0xf0 */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5
	};
	29
	30	void fml_opt_init(fml_opt_t *opt)
	31	{
	32	opt->n_threads = 1;
	33	opt->ec_k = 0;
	34	opt->min_cnt = 4;
	35	opt->max_cnt = 8;
	36	opt->min_asm_ovlp = 33;
	37	opt->min_merge_len = 0;
	38	mag_init_opt(&opt->mag_opt);
	39	opt->mag_opt.flag = MAG_F_NO_SIMPL;
	40	}
	41
	42	void fml_opt_adjust(fml_opt_t opt, int n_seqs, const bseq1_t seqs)
	43	{
	44	int i, log_len;
	45	uint64_t tot_len = 0;
	46	if (opt->n_threads < 1) opt->n_threads = 1;
	47	for (i = 0; i < n_seqs; ++i) tot_len += seqs[i].l_seq; // compute total length
	48	for (log_len = 10; log_len < 32; ++log_len) // compute ceil(log2(tot_len))
	49	if (1ULL<<log_len > tot_len) break;
	50	if (opt->ec_k == 0) opt->ec_k = (log_len + 12) / 2;
	51	if (opt->ec_k%2 == 0) ++opt->ec_k;
	52	opt->mag_opt.min_elen = (int)((double)tot_len / n_seqs * 2.5 + .499);
	53	}
	54
	/*
	 * Return 1 when the first l bytes of s pair up symmetrically so that
	 * s[i] + s[l-1-i] == 5 for every i in the first half, 0 otherwise.
	 * Odd lengths always yield 0.
	 */
	static inline int is_rev_same(int l, const char *s)
	{
		int left, half;

		if (l & 1)
			return 0;
		half = l >> 1;
		left = 0;
		while (left < half && s[left] + s[l - 1 - left] == 5)
			++left;
		return left == half;
	}
	63
	64	struct rld_t fml_fmi_gen(int n, bseq1_t seq, int is_mt)
	65	{
	66	mrope_t *mr;
	67	kstring_t str = {0,0,0};
	68	mritr_t itr;
	69	rlditr_t di;
	70	const uint8_t *block;
	71	rld_t *e = 0;
	72	int k;
	73
	74	mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO);
	75	for (k = 0; k < n; ++k) {
	76	int i;
	77	bseq1_t *s = &seq[k];
	78	if (s->l_seq == 0) continue;
	79	free(s->qual);
	80	for (i = 0; i < s->l_seq; ++i)
	81	s->seq[i] = seq_nt6_table[(int)s->seq[i]];
	82	for (i = 0; i < s->l_seq; ++i)
	83	if (s->seq[i] == 5) break;
	84	if (i < s->l_seq) {
	85	free(s->seq);
	86	continue;
	87	}
	88	if (is_rev_same(s->l_seq, s->seq))
	89	--s->l_seq, s->seq[s->l_seq] = 0;
	90	seq_reverse(s->l_seq, (uint8_t*)s->seq);
	91	kputsn(s->seq, s->l_seq + 1, &str);
	92	seq_revcomp6(s->l_seq, (uint8_t*)s->seq);
	93	kputsn(s->seq, s->l_seq + 1, &str);
	94	free(s->seq);
	95	}
	96	free(seq);
	97	mr_insert_multi(mr, str.l, (uint8_t*)str.s, is_mt);
	98	free(str.s);
	99
	100	e = rld_init(6, 3);
	101	rld_itr_init(e, &di, 0);
	102	mr_itr_first(mr, &itr, 1);
	103	while ((block = mr_itr_next_block(&itr)) != 0) {
	104	const uint8_t q = block + 2, end = block + 2 + *rle_nptr(block);
	105	while (q < end) {
	106	int c = 0;
	107	int64_t l;
	108	rle_dec1(q, c, l);
	109	rld_enc(e, &di, l, c);
	110	}
	111	}
	112	rld_enc_finish(e, &di);
	113
	114	mr_destroy(mr);
	115	return e;
	116	}
	117
	118	struct rld_t fml_seq2fmi(const fml_opt_t opt, int n, bseq1_t *seq)
	119	{
	120	return fml_fmi_gen(n, seq, opt->n_threads > 1? 1 : 0);
	121	}
	122
	123	void fml_fmi_destroy(rld_t *e)
	124	{
	125	rld_destroy(e);
	126	}
	127
	128	void fml_mag_clean(const fml_opt_t opt, struct mag_t g)
	129	{
	130	magopt_t o = opt->mag_opt;
	131	o.min_merge_len = opt->min_merge_len;
	132	mag_g_merge(g, 1, opt->min_merge_len);
	133	mag_g_clean(g, &o);
	134	mag_g_trim_open(g, &o);
	135	}
	136
	/* Public wrapper: free graph g and everything it owns via mag_g_destroy(). */
	void fml_mag_destroy(struct mag_t *g)
	{
		struct mag_t *graph = g;

		mag_g_destroy(graph);
	}
	141
	142	#include "khash.h"
	143	KHASH_DECLARE(64, uint64_t, uint64_t)
	144
	145	#define edge_is_del(_x) ((_x).x == (uint64_t)-2 \|\| (_x).y == 0) // from mag.c
	146
	147	fml_utg_t fml_mag2utg(struct mag_t g, int *n)
	148	{
	149	size_t i, j;
	150	fml_utg_t *utg;
	151	khash_t(64) *h;
	152	khint_t k;
	153
	154	h = kh_init(64);
	155	for (i = j = 0; i < g->v.n; ++i) {
	156	int absent;
	157	magv_t *p = &g->v.a[i];
	158	if (p->len < 0) continue;
	159	k = kh_put(64, h, p->k[0], &absent);
	160	kh_val(h, k) = j<<1 \| 0;
	161	k = kh_put(64, h, p->k[1], &absent);
	162	kh_val(h, k) = j<<1 \| 1;
	163	++j;
	164	}
	165	*n = j;
	166	kh_destroy(64, g->h);
	167
	168	utg = (fml_utg_t)calloc(n, sizeof(fml_utg_t));
	169	for (i = j = 0; i < g->v.n; ++i) {
	170	magv_t *p = &g->v.a[i];
	171	fml_utg_t *q;
	172	int from, a, b;
	173	if (p->len < 0) continue;
	174	q = &utg[j++];
	175	q->len = p->len, q->nsr = p->nsr;
	176	q->seq = p->seq, q->cov = p->cov;
	177	for (a = 0; a < q->len; ++a)
	178	q->seq[a] = "$ACGTN"[(int)q->seq[a]];
	179	q->seq[q->len] = q->cov[q->len] = 0;
	180	for (from = 0; from < 2; ++from) {
	181	ku128_v *r = &p->nei[from];
	182	for (b = q->n_ovlp[from] = 0; b < r->n; ++b)
	183	if (!edge_is_del(r->a[b])) ++q->n_ovlp[from];
	184	}
	185	q->ovlp = (fml_ovlp_t*)calloc(q->n_ovlp[0] + q->n_ovlp[1], sizeof(fml_ovlp_t));
	186	for (from = a = 0; from < 2; ++from) {
	187	ku128_v *r = &p->nei[from];
	188	for (b = 0; b < r->n; ++b) {
	189	ku128_t *s = &r->a[b];
	190	fml_ovlp_t *t;
	191	if (edge_is_del(*s)) continue;
	192	t = &q->ovlp[a++];
	193	k = kh_get(64, h, s->x);
	194	assert(k != kh_end(h));
	195	t->tid = kh_val(h, k);
	196	t->len = s->y;
	197	t->from = from;
	198	}
	199	free(p->nei[from].a);
	200	}
	201	}
	202	kh_destroy(64, h);
	203	free(g->v.a);
	204	free(g);
	205	return utg;
	206	}
	207
	208	void fml_utg_print(int n, const fml_utg_t *utg)
	209	{
	210	int i, j, l;
	211	kstring_t out = {0,0,0};
	212	for (i = 0; i < n; ++i) {
	213	const fml_utg_t *u = &utg[i];
	214	out.l = 0;
	215	kputc('@', &out); kputw(i<<1\|0, &out); kputc(':', &out); kputw(i<<1\|1, &out);
	216	kputc('\t', &out); kputw(u->nsr, &out);
	217	kputc('\t', &out);
	218	for (j = 0; j < u->n_ovlp[0]; ++j) {
	219	kputw(u->ovlp[j].tid, &out); kputc(',', &out);
	220	kputw(u->ovlp[j].len, &out); kputc(';', &out);
	221	}
	222	if (u->n_ovlp[0] == 0) kputc('.', &out);
	223	kputc('\t', &out);
	224	for (; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
	225	kputw(u->ovlp[j].tid, &out); kputc(',', &out);
	226	kputw(u->ovlp[j].len, &out); kputc(';', &out);
	227	}
	228	if (u->n_ovlp[1] == 0) kputc('.', &out);
	229	kputc('\n', &out);
	230	l = out.l;
	231	kputsn(u->seq, u->len, &out);
	232	kputsn("\n+\n", 3, &out);
	233	kputsn(u->cov, u->len, &out);
	234	kputc('\n', &out);
	235	fputs(out.s, stdout);
	236	}
	237	free(out.s);
	238	}
	239
	240	void fml_utg_destroy(int n, fml_utg_t *utg)
	241	{
	242	int i;
	243	for (i = 0; i < n; ++i) {
	244	free(utg[i].seq);
	245	free(utg[i].cov);
	246	free(utg[i].ovlp);
	247	}
	248	free(utg);
	249	}
	250
	251	#define MAG_MIN_NSR_COEF .1
	252
	253	fml_utg_t fml_assemble(const fml_opt_t opt0, int n_seqs, bseq1_t seqs, int n_utg)
	254	{
	255	rld_t *e;
	256	mag_t *g;
	257	fml_utg_t *utg;
	258	fml_opt_t opt = *opt0;
	259	float kcov;
	260
	261	fml_opt_adjust(&opt, n_seqs, seqs);
	262	if (opt.ec_k >= 0) fml_correct(&opt, n_seqs, seqs);
	263	kcov = fml_fltuniq(&opt, n_seqs, seqs);
	264	e = fml_seq2fmi(&opt, n_seqs, seqs);
	265	g = fml_fmi2mag(&opt, e);
	266	opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > kcov * MAG_MIN_NSR_COEF? opt.mag_opt.min_ensr : (int)(kcov * MAG_MIN_NSR_COEF + .499);
	267	opt.mag_opt.min_ensr = opt.mag_opt.min_ensr < opt0->max_cnt? opt.mag_opt.min_ensr : opt0->max_cnt;
	268	opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > opt0->min_cnt? opt.mag_opt.min_ensr : opt0->min_cnt;
	269	opt.mag_opt.min_insr = opt.mag_opt.min_ensr - 1;
	270	fml_mag_clean(&opt, g);
	271	utg = fml_mag2utg(g, n_utg);
	272	return utg;
	273	}

+307

-0

third_party/fermi-lite-0.1/mrope.c less more

	0	#include <stdlib.h>
	1	#include <string.h>
	2	#include <assert.h>
	3	#include <unistd.h>
	4	#include <pthread.h>
	5	#include <stdio.h>
	6	#include <time.h>
	7	#include "mrope.h"
	8
	9	/*******************************
	10	* Single-string insertion *
	11	*******************************/
	12
	13	mrope_t *mr_init(int max_nodes, int block_len, int sorting_order)
	14	{
	15	int a;
	16	mrope_t *r;
	17	assert(sorting_order >= 0 && sorting_order <= 2);
	18	r = calloc(1, sizeof(mrope_t));
	19	r->so = sorting_order;
	20	r->thr_min = 1000;
	21	for (a = 0; a != 6; ++a)
	22	r->r[a] = rope_init(max_nodes, block_len);
	23	return r;
	24	}
	25
	26	void mr_destroy(mrope_t *r)
	27	{
	28	int a;
	29	for (a = 0; a != 6; ++a)
	30	if (r->r[a]) rope_destroy(r->r[a]);
	31	free(r);
	32	}
	33
	34	int mr_thr_min(mrope_t *r, int thr_min)
	35	{
	36	if (thr_min > 0)
	37	r->thr_min = thr_min;
	38	return r->thr_min;
	39	}
	40
	41	int64_t mr_insert1(mrope_t r, const uint8_t str)
	42	{
	43	int64_t tl[6], tu[6], l, u;
	44	const uint8_t *p;
	45	int b, is_srt = (r->so != MR_SO_IO), is_comp = (r->so == MR_SO_RCLO);
	46	for (u = 0, b = 0; b != 6; ++b) u += r->r[b]->c[0];
	47	l = is_srt? 0 : u;
	48	for (p = str, b = 0; p; b = p++) {
	49	int a;
	50	if (l != u) {
	51	int64_t cnt = 0;
	52	rope_rank2a(r->r[b], l, u, tl, tu);
	53	if (is_comp && *p != 5) {
	54	for (a = 4; a > *p; --a) l += tu[a] - tl[a];
	55	l += tu[0] - tl[0];
	56	} else for (a = 0; a < *p; ++a) l += tu[a] - tl[a];
	57	rope_insert_run(r->r[b], l, *p, 1, 0);
	58	while (--b >= 0) cnt += r->r[b]->c[*p];
	59	l = cnt + tl[p]; u = cnt + tu[p];
	60	} else {
	61	l = rope_insert_run(r->r[b], l, *p, 1, 0);
	62	while (--b >= 0) l += r->r[b]->c[*p];
	63	u = l;
	64	}
	65	}
	66	return rope_insert_run(r->r[b], l, 0, 1, 0);
	67	}
	68
	69	void mr_rank2a(const mrope_t mr, int64_t x, int64_t y, int64_t cx, int64_t *cy)
	70	{
	71	int a, b;
	72	int64_t z, c[6], l;
	73	memset(c, 0, 48);
	74	for (a = 0, z = 0; a < 6; ++a) {
	75	const int64_t *ca = mr->r[a]->c;
	76	l = ca[0] + ca[1] + ca[2] + ca[3] + ca[4] + ca[5];
	77	if (z + l >= x) break;
	78	for (b = 0; b < 6; ++b) c[b] += ca[b];
	79	z += l;
	80	}
	81	assert(a != 6);
	82	if (y >= 0 && z + l >= y) { // [x,y) is in the same bucket
	83	rope_rank2a(mr->r[a], x - z, y - z, cx, cy);
	84	for (b = 0; b < 6; ++b)
	85	cx[b] += c[b], cy[b] += c[b];
	86	return;
	87	}
	88	if (x != z) rope_rank1a(mr->r[a], x - z, cx);
	89	else memset(cx, 0, 48);
	90	for (b = 0; b < 6; ++b)
	91	cx[b] += c[b], c[b] += mr->r[a]->c[b];
	92	if (y < 0) return;
	93	for (++a, z += l; a < 6; ++a) {
	94	const int64_t *ca = mr->r[a]->c;
	95	l = ca[0] + ca[1] + ca[2] + ca[3] + ca[4] + ca[5];
	96	if (z + l >= y) break;
	97	for (b = 0; b < 6; ++b) c[b] += ca[b];
	98	z += l;
	99	}
	100	assert(a != 6);
	101	if (y != z + l) rope_rank1a(mr->r[a], y - z, cy);
	102	else for (b = 0; b < 6; ++b) cy[b] = mr->r[a]->c[b];
	103	for (b = 0; b < 6; ++b) cy[b] += c[b];
	104	}
	105
	106	/**********************
	107	* Mrope iterator *
	108	**********************/
	109
	110	void mr_itr_first(mrope_t r, mritr_t i, int to_free)
	111	{
	112	i->a = 0; i->r = r; i->to_free = to_free;
	113	rope_itr_first(i->r->r[0], &i->i);
	114	}
	115
	116	const uint8_t mr_itr_next_block(mritr_t i)
	117	{
	118	const uint8_t *s;
	119	if (i->a >= 6) return 0;
	120	while ((s = rope_itr_next_block(&i->i)) == 0) {
	121	if (i->to_free) {
	122	rope_destroy(i->r->r[i->a]);
	123	i->r->r[i->a] = 0;
	124	}
	125	if (++i->a == 6) return 0;
	126	rope_itr_first(i->r->r[i->a], &i->i);
	127	}
	128	return i->a == 6? 0 : s;
	129	}
	130
	131	/*****************************************
	132	* Inserting multiple strings in RLO *
	133	*****************************************/
	134
	/* Per-string state used while inserting many strings at once. */
	typedef struct {
		uint64_t l;       // lower end of the string's current interval
		uint64_t u:61;    // upper end of the interval
		uint64_t c:3;     // symbol to insert next
		const uint8_t *p; // read position inside the string
	} triple64_t;

	typedef const uint8_t *cstr_t;

	/* Map symbols 1..4 to 5 - c; every other value is returned unchanged. */
	#define rope_comp6(c) ((c) >= 1 && (c) <= 4? 5 - (c) : (c))
	144
	145	static void mr_insert_multi_aux(rope_t rope, int64_t m, triple64_t a, int is_comp)
	146	{
	147	int64_t k, beg;
	148	rpcache_t cache;
	149	memset(&cache, 0, sizeof(rpcache_t));
	150	for (k = 0; k != m; ++k) // set the base to insert
	151	a[k].c = *a[k].p++;
	152	for (k = 1, beg = 0; k <= m; ++k) {
	153	if (k == m \|\| a[k].u != a[k-1].u) {
	154	int64_t x, i, l = a[beg].l, u = a[beg].u, tl[6], tu[6], c[6];
	155	int start, end, step, b;
	156	if (l == u && k == beg + 1) { // special case; still works without the following block
	157	a[beg].l = a[beg].u = rope_insert_run(rope, l, a[beg].c, 1, &cache);
	158	beg = k;
	159	continue;
	160	} else if (l == u) {
	161	memset(tl, 0, 48);
	162	memset(tu, 0, 48);
	163	} else rope_rank2a(rope, l, u, tl, tu);
	164	memset(c, 0, 48);
	165	for (i = beg; i < k; ++i) ++c[a[i].c];
	166	// insert sentinel
	167	if (c[0]) rope_insert_run(rope, l, 0, c[0], &cache);
	168	// insert A/C/G/T
	169	x = l + c[0] + (tu[0] - tl[0]);
	170	if (is_comp) start = 4, end = 0, step = -1;
	171	else start = 1, end = 5, step = 1;
	172	for (b = start; b != end; b += step) {
	173	int64_t size = tu[b] - tl[b];
	174	if (c[b]) {
	175	tl[b] = rope_insert_run(rope, x, b, c[b], &cache);
	176	tu[b] = tl[b] + size;
	177	}
	178	x += c[b] + size;
	179	}
	180	// insert N
	181	if (c[5]) {
	182	tu[5] -= tl[5];
	183	tl[5] = rope_insert_run(rope, x, 5, c[5], &cache);
	184	tu[5] += tl[5];
	185	}
	186	// update a[]
	187	for (i = beg; i < k; ++i) {
	188	triple64_t *p = &a[i];
	189	p->l = tl[p->c], p->u = tu[p->c];
	190	}
	191	beg = k;
	192	}
	193	}
	194	}
	195
	196	typedef struct {
	197	volatile int *n_fin_workers;
	198	volatile int to_run;
	199	int to_exit;
	200	mrope_t *mr;
	201	int b, is_comp;
	202	int64_t m;
	203	triple64_t *a;
	204	} worker_t;
	205
	206	static void worker(void data)
	207	{
	208	worker_t w = (worker_t)data;
	209	struct timespec req, rem;
	210	req.tv_sec = 0; req.tv_nsec = 1000000;
	211	do {
	212	while (!__sync_bool_compare_and_swap(&w->to_run, 1, 0)) nanosleep(&req, &rem); // wait for the signal from the master thread
	213	if (w->m) mr_insert_multi_aux(w->mr->r[w->b], w->m, w->a, w->is_comp);
	214	__sync_add_and_fetch(w->n_fin_workers, 1);
	215	} while (!w->to_exit);
	216	return 0;
	217	}
	218
	219	void mr_insert_multi(mrope_t mr, int64_t len, const uint8_t s, int is_thr)
	220	{
	221	int64_t k, m, n0;
	222	int b, is_srt = (mr->so != MR_SO_IO), is_comp = (mr->so == MR_SO_RCLO), stop_thr = 0;
	223	volatile int n_fin_workers = 0;
	224	triple64_t a[2], curr, prev, swap;
	225	pthread_t *tid = 0;
	226	worker_t *w = 0;
	227
	228	if (mr->thr_min < 0) mr->thr_min = 0;
	229	assert(len > 0 && s[len-1] == 0);
	230	{ // split into short strings
	231	cstr_t p, q, end = s + len;
	232	for (p = s, m = 0; p != end; ++p) // count #sentinels
	233	if (*p == 0) ++m;
	234	curr = a[0] = malloc(m * sizeof(triple64_t));
	235	prev = a[1] = malloc(m * sizeof(triple64_t));
	236	for (p = q = s, k = 0; p != end; ++p) // find the start of each string
	237	if (*p == 0) prev[k++].p = q, q = p + 1;
	238	}
	239
	240	for (k = n0 = 0; k < 6; ++k) n0 += mr->r[k]->c[0];
	241	for (k = 0; k != m; ++k) {
	242	if (is_srt) prev[k].l = 0, prev[k].u = n0;
	243	else prev[k].l = prev[k].u = n0 + k;
	244	prev[k].c = 0;
	245	}
	246	mr_insert_multi_aux(mr->r[0], m, prev, is_comp); // insert the first (actually the last) column
	247
	248	if (is_thr) {
	249	tid = alloca(4 * sizeof(pthread_t));
	250	w = alloca(4 * sizeof(worker_t));
	251	memset(w, 0, 4 * sizeof(worker_t));
	252	for (b = 0; b < 4; ++b) {
	253	w[b].mr = mr, w[b].b = b + 1, w[b].is_comp = is_comp;
	254	w[b].n_fin_workers = &n_fin_workers;
	255	}
	256	for (b = 0; b < 4; ++b) pthread_create(&tid[b], 0, worker, &w[b]);
	257	}
	258
	259	n0 = 0; // the number of inserted strings
	260	while (m) {
	261	int64_t c[6], ac[6];
	262	triple64_t *q[6];
	263
	264	memset(c, 0, 48);
	265	for (k = n0; k != m; ++k) ++c[prev[k].c]; // counting
	266	for (q[0] = curr + n0, b = 1; b < 6; ++b) q[b] = q[b-1] + c[b-1];
	267	if (n0 + c[0] < m) {
	268	for (k = n0; k != m; ++k) *q[prev[k].c]++ = prev[k]; // sort
	269	for (b = 0; b < 6; ++b) q[b] -= c[b];
	270	}
	271	n0 += c[0];
	272
	273	if (is_thr && !stop_thr) {
	274	struct timespec req, rem;
	275	req.tv_sec = 0; req.tv_nsec = 1000000;
	276	stop_thr = (m - n0 <= mr->thr_min);
	277	for (b = 0; b < 4; ++b) {
	278	w[b].a = q[b+1], w[b].m = c[b+1];
	279	if (stop_thr) w[b].to_exit = 1; // signal the workers to exit
	280	while (!__sync_bool_compare_and_swap(&w[b].to_run, 0, 1)); // signal the workers to start
	281	}
	282	if (c[5]) mr_insert_multi_aux(mr->r[5], c[5], q[5], is_comp); // the master thread processes the "N" bucket
	283	while (!__sync_bool_compare_and_swap(&n_fin_workers, 4, 0)) // wait until all 4 workers finish
	284	nanosleep(&req, &rem);
	285	if (stop_thr && n0 < m)
	286	fprintf(stderr, "[M::%s] Turn off parallelization for this batch as too few strings are left.\n", __func__);
	287	} else {
	288	for (b = 1; b < 6; ++b)
	289	if (c[b]) mr_insert_multi_aux(mr->r[b], c[b], q[b], is_comp);
	290	}
	291	if (n0 == m) break;
	292
	293	memset(ac, 0, 48);
	294	for (b = 1; b < 6; ++b) { // update the intervals to account for buckets ahead
	295	int a;
	296	for (a = 0; a < 6; ++a) ac[a] += mr->r[b-1]->c[a];
	297	for (k = 0; k < c[b]; ++k) {
	298	triple64_t *p = &q[b][k];
	299	p->l += ac[p->c]; p->u += ac[p->c];
	300	}
	301	}
	302	swap = curr, curr = prev, prev = swap;
	303	}
	304	if (is_thr) for (b = 0; b < 4; ++b) pthread_join(tid[b], 0);
	305	free(a[0]); free(a[1]);
	306	}

+114

-0

third_party/fermi-lite-0.1/mrope.h less more

	0	#ifndef MROPE_H_
	1	#define MROPE_H_
	2
	3	#include "rope.h"
	4
	5	#define MR_SO_IO 0
	6	#define MR_SO_RLO 1
	7	#define MR_SO_RCLO 2
	8
	9	typedef struct {
	10	uint8_t so; // sorting order
	11	int thr_min; // when there are fewer sequences than this, disable multi-threading
	12	rope_t *r[6];
	13	} mrope_t; // multi-rope
	14
	15	typedef struct {
	16	mrope_t *r;
	17	int a, to_free;
	18	rpitr_t i;
	19	} mritr_t;
	20
	21	#ifdef __cplusplus
	22	extern "C" {
	23	#endif
	24
	25	/**
	26	* Initiate a multi-rope
	27	*
	28	* @param max_nodes maximum number of nodes in an internal node; use ROPE_DEF_MAX_NODES (64) if unsure
	29	* @param block_len maximum block length in an external node; use ROPE_DEF_BLOCK_LEN (256) if unsure
	30	* @param sorting_order the order in which sequences are added; possible values defined by the MR_SO_* macros
	31	*/
	32	mrope_t *mr_init(int max_nodes, int block_len, int sorting_order);
	33
	34	void mr_destroy(mrope_t *r);
	35
	36	int mr_thr_min(mrope_t *r, int thr_min);
	37
	38	/**
	39	* Insert one string into the index
	40	*
	41	* @param r multi-rope
	42	* @param str the reverse of the input string (important: it is reversed!)
	43	*/
	44	int64_t mr_insert1(mrope_t r, const uint8_t str);
	45
	46	/**
	47	* Insert multiple strings
	48	*
	49	* @param mr multi-rope
	50	* @param len total length of $s
	51	* @param s concatenated, NULL delimited, reversed input strings
	52	* @param is_thr true to use 5 threads
	53	*/
	54	void mr_insert_multi(mrope_t mr, int64_t len, const uint8_t s, int is_thr);
	55
	56	void mr_rank2a(const mrope_t mr, int64_t x, int64_t y, int64_t cx, int64_t *cy);
	57	#define mr_rank1a(mr, x, cx) mr_rank2a(mr, x, -1, cx, 0)
	58
	59	/**
	60	* Put the iterator at the start of the index
	61	*
	62	* @param r multi-rope
	63	* @param i iterator to be initialized
	64	* @param to_free if true, free visited buckets
	65	*/
	66	void mr_itr_first(mrope_t r, mritr_t i, int to_free);
	67
	68	/**
	69	* Iterate to the next block
	70	*
	71	* @param i iterator
	72	*
	73	* @return pointer to the start of a block; see rle.h for decoding the block
	74	*/
	75	const uint8_t mr_itr_next_block(mritr_t i);
	76
	77	#ifdef __cplusplus
	78	}
	79	#endif
	80
	81	static inline int64_t mr_get_c(const mrope_t *mr, int64_t c[6])
	82	{
	83	int a, b;
	84	int64_t tot = 0;
	85	for (a = 0; a < 6; ++a) c[a] = 0;
	86	for (a = 0; a < 6; ++a) {
	87	for (b = 0; b < 6; ++b)
	88	c[b] += mr->r[a]->c[b];
	89	tot += c[b];
	90	}
	91	return tot;
	92	}
	93
	94	static inline int64_t mr_get_ac(const mrope_t *mr, int64_t ac[7])
	95	{
	96	int a;
	97	int64_t c[6], tot;
	98	tot = mr_get_c(mr, c);
	99	for (a = 1, ac[0] = 0; a <= 6; ++a) ac[a] = ac[a-1] + c[a-1];
	100	return tot;
	101	}
	102
	103	static inline int64_t mr_get_tot(const mrope_t *mr)
	104	{
	105	int a, b;
	106	int64_t tot = 0;
	107	for (a = 0; a < 6; ++a)
	108	for (b = 0; b < 6; ++b)
	109	tot += mr->r[a]->c[b];
	110	return tot;
	111	}
	112
	113	#endif

+489

-0

third_party/fermi-lite-0.1/rld0.c less more

	0	#include <stdlib.h>
	1	#include <stdint.h>
	2	#include <stdio.h>
	3	#include <string.h>
	4	#include <assert.h>
	5	#include <unistd.h>
	6	#include <fcntl.h>
	7	#include <sys/mman.h>
	8	#include "rld0.h"
	9
	10	#define RLD_IBITS_PLUS 4
	11
	12	#define rld_file_size(e) ((4 + (e)->asize) * 8 + (e)->n_bytes + 8 * (e)->n_frames * ((e)->asize + 1))
	13
	14	#ifndef xcalloc
	15	#define xcalloc(n, s) calloc(n, s)
	16	#endif
	17	#ifndef xmalloc
	18	#define xmalloc(s) malloc(s)
	19	#endif
	20
	21	/******************
	22	* Delta encoding *
	23	******************/
	24
	25	static const char LogTable256[256] = {
	26	#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
	27	-1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
	28	LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
	29	LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
	30	};
	31
	32	static inline int ilog2_32(uint32_t v)
	33	{
	34	register uint32_t t, tt;
	35	if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt];
	36	return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
	37	}
	38
	39	static inline int ilog2(uint64_t v)
	40	{
	41	return v>>32? 32 + ilog2_32(v>>32) : ilog2_32(v);
	42	}
	43
	44	static inline int64_t rld_delta_enc1(uint64_t x, int *width)
	45	{
	46	int y = ilog2(x);
	47	int z = ilog2_32(y + 1);
	48	*width = (z<<1) + 1 + y;
	49	return (x ^ (uint64_t)1<<y) \| (uint64_t)(y+1)<<y;
	50	}
	51
	52	/***********************************
	53	* Initialization and deallocation *
	54	***********************************/
	55
	56	rld_t *rld_init(int asize, int bbits)
	57	{
	58	rld_t *e;
	59	e = xcalloc(1, sizeof(rld_t));
	60	e->n = 1;
	61	e->z = xmalloc(sizeof(void*));
	62	e->z[0] = xcalloc(RLD_LSIZE, 8);
	63	e->ssize = 1<<bbits;
	64	e->cnt = xcalloc(asize + 1, 8);
	65	e->mcnt = xcalloc(asize + 1, 8);
	66	e->abits = ilog2(asize) + 1;
	67	e->asize = asize;
	68	e->sbits = bbits;
	69	e->asize1 = asize + 1;
	70	e->offset0[0] = (e->asize1*16+63)/64;
	71	e->offset0[1] = (e->asize1*32+63)/64;
	72	e->offset0[2] = e->asize1;
	73	return e;
	74	}
	75
	76	void rld_destroy(rld_t *e)
	77	{
	78	int i = 0;
	79	if (e == 0) return;
	80	if (e->mem) {
	81	close(e->fd);
	82	munmap(e->mem, rld_file_size(e));
	83	} else {
	84	for (i = 0; i < e->n; ++i) free(e->z[i]);
	85	free(e->frame);
	86	}
	87	free(e->z); free(e->cnt); free(e->mcnt); free(e);
	88	}
	89
	90	void rld_itr_init(const rld_t e, rlditr_t itr, uint64_t k)
	91	{
	92	itr->i = e->z + (k >> RLD_LBITS);
	93	itr->shead = *itr->i + k%RLD_LSIZE;
	94	itr->stail = rld_get_stail(e, itr);
	95	itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)];
	96	itr->q = (uint8_t*)itr->p;
	97	itr->r = 64;
	98	itr->c = -1;
	99	itr->l = 0;
	100	}
	101
	102	/************
	103	* Encoding *
	104	************/
	105
	106	static inline void enc_next_block(rld_t e, rlditr_t itr)
	107	{
	108	int i, type;
	109	if (itr->stail + 2 - *itr->i == RLD_LSIZE) {
	110	++e->n;
	111	e->z = realloc(e->z, e->n * sizeof(void*));
	112	itr->i = e->z + e->n - 1;
	113	itr->shead = *itr->i = xcalloc(RLD_LSIZE, 8);
	114	} else itr->shead += e->ssize;
	115	if (e->cnt[0] - e->mcnt[0] < 0x4000) {
	116	uint16_t p = (uint16_t)itr->shead;
	117	for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i];
	118	type = 0;
	119	} else if (e->cnt[0] - e->mcnt[0] < 0x40000000) {
	120	uint32_t p = (uint32_t)itr->shead;
	121	for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i];
	122	type = 1;
	123	} else {
	124	uint64_t p = (uint64_t)itr->shead;
	125	for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i];
	126	type = 2;
	127	}
	128	*itr->shead \|= (uint64_t)type<<62;
	129	itr->p = itr->shead + e->offset0[type];
	130	itr->stail = rld_get_stail(e, itr);
	131	itr->q = (uint8_t*)itr->p;
	132	itr->r = 64;
	133	for (i = 0; i <= e->asize; ++i) e->mcnt[i] = e->cnt[i];
	134	}
	135
	136	static int rld_enc1(rld_t e, rlditr_t itr, int64_t l, uint8_t c)
	137	{
	138	int w;
	139	uint64_t x = rld_delta_enc1(l, &w) << e->abits \| c;
	140	w += e->abits;
	141	if (w >= itr->r && itr->p == itr->stail) enc_next_block(e, itr);
	142	if (w > itr->r) {
	143	w -= itr->r;
	144	*itr->p++ \|= x >> w;
	145	*itr->p = x << (itr->r = 64 - w);
	146	} else itr->r -= w, *itr->p \|= x << itr->r;
	147	e->cnt[0] += l;
	148	e->cnt[c + 1] += l;
	149	return 0;
	150	}
	151
	152	int rld_enc(rld_t e, rlditr_t itr, int64_t l, uint8_t c)
	153	{
	154	if (l == 0) return 0;
	155	if (itr->c != c) {
	156	if (itr->l) rld_enc1(e, itr, itr->l, itr->c);
	157	itr->l = l; itr->c = c;
	158	} else itr->l += l;
	159	return 0;
	160	}
	161
	162	void rld_rank_index(rld_t *e)
	163	{
	164	uint64_t last, n_blks, i, k, *cnt;
	165	int j;
	166
	167	n_blks = e->n_bytes * 8 / 64 / e->ssize + 1;
	168	last = rld_last_blk(e);
	169	cnt = alloca(e->asize * 8);
	170	e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS;
	171	e->n_frames = ((e->mcnt[0] + (1ll<<e->ibits) - 1) >> e->ibits) + 1;
	172	e->frame = xcalloc(e->n_frames * e->asize1, 8);
	173	e->frame[0] = 0;
	174	for (j = 0; j < e->asize; ++j) cnt[j] = 0;
	175	for (i = e->ssize, k = 1; i <= last; i += e->ssize) {
	176	uint64_t sum, *p = rld_seek_blk(e, i);
	177	int type = rld_block_type(*p);
	178	if (type == 0) {
	179	uint16_t q = (uint16_t)p;
	180	for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j];
	181	} else if (type == 1) {
	182	uint32_t q = (uint32_t)p;
	183	for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j] & 0x3fffffff;
	184	} else {
	185	uint64_t q = (uint64_t)p;
	186	for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j];
	187	}
	188	for (j = 0, sum = 0; j < e->asize; ++j) sum += cnt[j];
	189	while (sum >= k<<e->ibits) ++k;
	190	if (k < e->n_frames) {
	191	uint64_t x = k * e->asize1;
	192	e->frame[x] = i;
	193	for (j = 0; j < e->asize; ++j) e->frame[x + j + 1] = cnt[j];
	194	}
	195	}
	196	assert(k >= e->n_frames - 1);
	197	for (k = 1; k < e->n_frames; ++k) { // fill zero cells
	198	uint64_t x = k * e->asize1;
	199	if (e->frame[x] == 0) {
	200	for (j = 0; j <= e->asize; ++j)
	201	e->frame[x + j] = e->frame[x - e->asize1 + j];
	202	}
	203	}
	204	}
	205
	206	uint64_t rld_enc_finish(rld_t e, rlditr_t itr)
	207	{
	208	int i;
	209	if (itr->l) rld_enc1(e, itr, itr->l, itr->c);
	210	enc_next_block(e, itr);
	211	e->n_bytes = (((uint64_t)(e->n - 1) * RLD_LSIZE) + (itr->p - itr->i)) 8;
	212	// recompute e->cnt as the accumulative count; e->mcnt[] keeps the marginal counts
	213	for (e->cnt[0] = 0, i = 1; i <= e->asize; ++i) e->cnt[i] += e->cnt[i - 1];
	214	rld_rank_index(e);
	215	return e->n_bytes;
	216	}
	217
	218	/*****************
	219	* Save and load *
	220	*****************/
	221
	222	int rld_dump(const rld_t e, const char fn)
	223	{
	224	uint64_t k = 0;
	225	int i;
	226	uint32_t a;
	227	FILE *fp;
	228	fp = strcmp(fn, "-")? fopen(fn, "wb") : fdopen(fileno(stdout), "wb");
	229	if (fp == 0) return -1;
	230	a = e->asize<<16 \| e->sbits;
	231	fwrite("RLD\3", 1, 4, fp); // write magic
	232	fwrite(&a, 4, 1, fp); // write sbits and asize
	233	fwrite(&k, 8, 1, fp); // preserve 8 bytes for future uses
	234	fwrite(&e->n_bytes, 8, 1, fp); // n_bytes can always be divided by 8
	235	fwrite(&e->n_frames, 8, 1, fp); // number of frames
	236	fwrite(e->mcnt + 1, 8, e->asize, fp); // write the marginal counts
	237	for (i = 0, k = e->n_bytes / 8; i < e->n - 1; ++i, k -= RLD_LSIZE)
	238	fwrite(e->z[i], 8, RLD_LSIZE, fp);
	239	fwrite(e->z[i], 8, k, fp);
	240	fwrite(e->frame, 8 * e->asize1, e->n_frames, fp);
	241	fclose(fp);
	242	return 0;
	243	}
	244
	245	static rld_t rld_restore_header(const char fn, FILE **_fp)
	246	{
	247	FILE *fp;
	248	rld_t *e;
	249	char magic[4];
	250	uint64_t a[3];
	251	int32_t i, x;
	252
	253	if (strcmp(fn, "-") == 0) *_fp = fp = stdin;
	254	else if ((*_fp = fp = fopen(fn, "rb")) == 0) return 0;
	255	fread(magic, 1, 4, fp);
	256	if (strncmp(magic, "RLD\3", 4)) return 0;
	257	fread(&x, 4, 1, fp);
	258	e = rld_init(x>>16, x&0xffff);
	259	fread(a, 8, 3, fp);
	260	e->n_bytes = a[1]; e->n_frames = a[2];
	261	fread(e->mcnt + 1, 8, e->asize, fp);
	262	for (i = 0; i <= e->asize; ++i) e->cnt[i] = e->mcnt[i];
	263	for (i = 1; i <= e->asize; ++i) e->cnt[i] += e->cnt[i - 1];
	264	e->mcnt[0] = e->cnt[e->asize];
	265	return e;
	266	}
	267
	268	rld_t rld_restore(const char fn)
	269	{
	270	FILE *fp;
	271	rld_t *e;
	272	uint64_t k, n_blks;
	273	int32_t i;
	274
	275	if ((e = rld_restore_header(fn, &fp)) == 0) { // then load as plain DNA rle
	276	uint8_t *buf;
	277	int l;
	278	rlditr_t itr;
	279	buf = malloc(0x10000);
	280	e = rld_init(6, 3);
	281	rld_itr_init(e, &itr, 0);
	282	while ((l = fread(buf, 1, 0x10000, fp)) != 0)
	283	for (i = 0; i < l; ++i)
	284	if (buf[i]>>3) rld_enc(e, &itr, buf[i]>>3, buf[i]&7);
	285	fclose(fp);
	286	free(buf);
	287	rld_enc_finish(e, &itr);
	288	return e;
	289	}
	290	if (e->n_bytes / 8 > RLD_LSIZE) { // allocate enough memory
	291	e->n = (e->n_bytes / 8 + RLD_LSIZE - 1) / RLD_LSIZE;
	292	e->z = realloc(e->z, e->n * sizeof(void*));
	293	for (i = 1; i < e->n; ++i)
	294	e->z[i] = xcalloc(RLD_LSIZE, 8);
	295	}
	296	for (i = 0, k = e->n_bytes / 8; i < e->n - 1; ++i, k -= RLD_LSIZE)
	297	fread(e->z[i], 8, RLD_LSIZE, fp);
	298	fread(e->z[i], 8, k, fp);
	299	e->frame = xmalloc(e->n_frames * e->asize1 * 8);
	300	fread(e->frame, 8 * e->asize1, e->n_frames, fp);
	301	fclose(fp);
	302	n_blks = e->n_bytes * 8 / 64 / e->ssize + 1;
	303	e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS;
	304	return e;
	305	}
	306
	307	rld_t rld_restore_mmap(const char fn)
	308	{
	309	FILE *fp;
	310	rld_t *e;
	311	int i;
	312	int64_t n_blks;
	313
	314	e = rld_restore_header(fn, &fp);
	315	fclose(fp);
	316	free(e->z[0]); free(e->z);
	317	e->n = (e->n_bytes / 8 + RLD_LSIZE - 1) / RLD_LSIZE;
	318	e->z = xcalloc(e->n, sizeof(void*));
	319	e->fd = open(fn, O_RDONLY);
	320	e->mem = (uint64_t*)mmap(0, rld_file_size(e), PROT_READ, MAP_PRIVATE, e->fd, 0);
	321	for (i = 0; i < e->n; ++i) e->z[i] = e->mem + (4 + e->asize) + (size_t)i * RLD_LSIZE;
	322	e->frame = e->mem + (4 + e->asize) + e->n_bytes/8;
	323	n_blks = e->n_bytes * 8 / 64 / e->ssize + 1;
	324	e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS;
	325	return e;
	326	}
	327
	328	/******************
	329	* Computing rank *
	330	******************/
	331
	332	#ifdef _DNA_ONLY
	333	static inline int64_t rld_dec0_fast_dna(const rld_t e, rlditr_t itr, int *c)
	334	{ // This is NOT a replacement of rld_dec0(). It does not do boundary check.
	335	uint64_t x = itr->r == 64? itr->p[0] : itr->p[0] << (64 - itr->r) \| itr->p[1] >> itr->r;
	336	if (x>>63 == 0) {
	337	int64_t y;
	338	int l, w = 0x333333335555779bll>>(x>>59<<2)&0xf;
	339	l = (x >> (64 - w)) - 1;
	340	y = x << w >> (64 - l) \| 1u << l;
	341	w += l;
	342	*c = x << w >> 61;
	343	w += 3;
	344	itr->r -= w;
	345	if (itr->r <= 0) ++itr->p, itr->r += 64;
	346	return y;
	347	} else {
	348	*c = x << 1 >> 61;
	349	itr->r -= 4;
	350	if (itr->r <= 0) ++itr->p, itr->r += 64;
	351	return 1;
	352	}
	353	}
	354	#endif
	355
	356	static inline uint64_t rld_locate_blk(const rld_t e, rlditr_t itr, uint64_t k, uint64_t cnt, uint64_t sum)
	357	{
	358	int j;
	359	uint64_t c = 0, q, z = e->frame + (k>>e->ibits) * e->asize1;
	360	itr->i = e->z + (*z>>RLD_LBITS);
	361	q = itr->p = itr->i + (z&RLD_LMASK);
	362	for (j = 1, sum = 0; j < e->asize1; ++j) sum += (cnt[j-1] = z[j]);
	363	while (1) { // seek to the small block
	364	int type;
	365	q += e->ssize;
	366	if (q - itr->i == RLD_LSIZE) q = ++itr->i;
	367	type = rld_block_type(*q);
	368	c = type == 2? q&0x3fffffffffffffffULL : type == 1? (uint32_t)q : (uint16_t*)q;
	369	if (*sum + c > k) break;
	370	if (type == 0) {
	371	uint16_t p = (uint16_t)q + 1;
	372	#ifdef _DNA_ONLY
	373	cnt[0] += p[0]; cnt[1] += p[1]; cnt[2] += p[2]; cnt[3] += p[3]; cnt[4] += p[4]; cnt[5] += p[5];
	374	#else
	375	for (j = 0; j < e->asize; ++j) cnt[j] += p[j];
	376	#endif
	377	} else if (type == 1) {
	378	uint32_t p = (uint32_t)q + 1;
	379	for (j = 0; j < e->asize; ++j) cnt[j] += p[j] & 0x3fffffff;
	380	} else {
	381	uint64_t p = (uint64_t)q + 1;
	382	for (j = 0; j < e->asize; ++j) cnt[j] += p[j];
	383	}
	384	*sum += c;
	385	itr->p = q;
	386	}
	387	itr->shead = itr->p;
	388	itr->stail = rld_get_stail(e, itr);
	389	itr->p += e->offset0[rld_block_type(*itr->shead)];
	390	itr->q = (uint8_t*)itr->p;
	391	itr->r = 64;
	392	return c + *sum;
	393	}
	394
	395	void rld_rank21(const rld_t e, uint64_t k, uint64_t l, int c, uint64_t ok, uint64_t *ol) // FIXME: can be faster
	396	{
	397	*ok = rld_rank11(e, k, c);
	398	*ol = rld_rank11(e, l, c);
	399	}
	400
	401	int rld_rank1a(const rld_t e, uint64_t k, uint64_t ok)
	402	{
	403	uint64_t z, l;
	404	int a = -1;
	405	rlditr_t itr;
	406	if (k == 0) {
	407	for (a = 0; a < e->asize; ++a) ok[a] = 0;
	408	return -1;
	409	}
	410	rld_locate_blk(e, &itr, k-1, ok, &z);
	411	while (1) {
	412	#ifdef _DNA_ONLY
	413	l = rld_dec0_fast_dna(e, &itr, &a);
	414	#else
	415	l = rld_dec0(e, &itr, &a);
	416	#endif
	417	if (z + l >= k) break;
	418	z += l; ok[a] += l;
	419	}
	420	ok[a] += k - z;
	421	return a;
	422	}
	423
	424	uint64_t rld_rank11(const rld_t *e, uint64_t k, int c)
	425	{
	426	uint64_t *ok;
	427	if (k == (uint64_t)-1) return 0;
	428	ok = alloca(e->asize1 * 8);
	429	rld_rank1a(e, k, ok);
	430	return ok[c];
	431	}
	432
	433	void rld_rank2a(const rld_t e, uint64_t k, uint64_t l, uint64_t ok, uint64_t *ol)
	434	{
	435	uint64_t z, y, len;
	436	rlditr_t itr;
	437	int a = -1;
	438	if (k == 0) {
	439	for (a = 0; a < e->asize; ++a) ok[a] = 0;
	440	rld_rank1a(e, l, ol);
	441	return;
	442	}
	443	y = rld_locate_blk(e, &itr, k-1, ok, &z); // locate the block bracketing k
	444	while (1) { // compute ok[]
	445	#ifdef _DNA_ONLY
	446	len = rld_dec0_fast_dna(e, &itr, &a);
	447	#else
	448	len = rld_dec0(e, &itr, &a);
	449	#endif
	450	if (z + len >= k) break;
	451	z += len; ok[a] += len;
	452	}
	453	if (y > l) { // we do not need to decode other blocks
	454	int b;
	455	for (b = 0; b < e->asize; ++b) ol[b] = ok[b]; // copy ok[] to ol[]
	456	ok[a] += k - z; // finalize ok[a]
	457	if (z + len < l) { // we need to decode the next run
	458	z += len; ol[a] += len;
	459	while (1) {
	460	len = rld_dec0(e, &itr, &a);
	461	if (z + len >= l) break;
	462	z += len; ol[a] += len;
	463	}
	464	}
	465	ol[a] += l - z;
	466	} else { // we have to decode other blocks
	467	ok[a] += k - z;
	468	rld_rank1a(e, l, ol);
	469	}
	470	}
	471
	472	int rld_extend(const rld_t e, const rldintv_t ik, rldintv_t ok[6], int is_back)
	473	{ // TODO: this can be accelerated a little by using rld_rank1a() when ik.x[2]==1
	474	uint64_t tk[6], tl[6];
	475	int i;
	476	rld_rank2a(e, ik->x[!is_back], ik->x[!is_back] + ik->x[2], tk, tl);
	477	for (i = 0; i < 6; ++i) {
	478	ok[i].x[!is_back] = e->cnt[i] + tk[i];
	479	ok[i].x[2] = (tl[i] -= tk[i]);
	480	}
	481	ok[0].x[is_back] = ik->x[is_back];
	482	ok[4].x[is_back] = ok[0].x[is_back] + tl[0];
	483	ok[3].x[is_back] = ok[4].x[is_back] + tl[4];
	484	ok[2].x[is_back] = ok[3].x[is_back] + tl[3];
	485	ok[1].x[is_back] = ok[2].x[is_back] + tl[2];
	486	ok[5].x[is_back] = ok[1].x[is_back] + tl[1];
	487	return 0;
	488	}

+137

-0

third_party/fermi-lite-0.1/rld0.h less more

	0	#ifndef RLDELTA0_H
	1	#define RLDELTA0_H
	2
	3	#define _DNA_ONLY
	4
	5	#include <stdint.h>
	6	#include <stdlib.h>
	7	#include <assert.h>
	8	#include <stdio.h>
	9
	10	#define RLD_LBITS 23
	11	#define RLD_LSIZE (1<<RLD_LBITS)
	12	#define RLD_LMASK (RLD_LSIZE - 1)
	13
	14	typedef struct {
	15	int r, c; // $r: bits remained in the last 64-bit integer; $c: pending symbol
	16	int64_t l; // $l: pending length
	17	uint64_t p, shead, stail, *i;
	18	uint8_t *q;
	19	} rlditr_t;
	20
	21	typedef struct rld_t {
	22	// initialized in the constructor
	23	uint8_t asize, asize1; // alphabet size; asize1=asize+1
	24	int8_t abits; // bits required to store a symbol
	25	int8_t sbits; // bits per small block
	26	int8_t ibits; // modified during indexing; here for a better alignment
	27	int8_t offset0[3]; // 0 for 16-bit blocks; 1 for 32-bit blocks; 2 for 64-bit blocks
	28	int ssize; // ssize = 1<<sbits
	29	// modified during encoding
	30	int n; // number of blocks (unchanged in decoding)
	31	uint64_t n_bytes; // total number of bits (unchanged in decoding)
	32	uint64_t **z; // the actual data (unchanged in decoding)
	33	uint64_t cnt, mcnt; // after enc_finish, cnt keeps the accumulative count and mcnt keeps the marginal
	34	// modified during indexing
	35	uint64_t n_frames;
	36	uint64_t *frame;
	37	//
	38	int fd;
	39	uint64_t *mem; // only used for memory mapped file
	40	} rld_t;
	41
	42	typedef struct {
	43	uint64_t x[3]; // 0: start of the interval, backward; 1: forward; 2: size of the interval
	44	uint64_t info;
	45	} rldintv_t;
	46
	47	#ifdef __cplusplus
	48	extern "C" {
	49	#endif
	50
	51	rld_t *rld_init(int asize, int bbits);
	52	void rld_destroy(rld_t *e);
	53	int rld_dump(const rld_t e, const char fn);
	54	rld_t rld_restore(const char fn);
	55	rld_t rld_restore_mmap(const char fn);
	56
	57	void rld_itr_init(const rld_t e, rlditr_t itr, uint64_t k);
	58	int rld_enc(rld_t e, rlditr_t itr, int64_t l, uint8_t c);
	59	uint64_t rld_enc_finish(rld_t e, rlditr_t itr);
	60
	61	uint64_t rld_rank11(const rld_t *e, uint64_t k, int c);
	62	int rld_rank1a(const rld_t e, uint64_t k, uint64_t ok);
	63	void rld_rank21(const rld_t e, uint64_t k, uint64_t l, int c, uint64_t ok, uint64_t *ol);
	64	void rld_rank2a(const rld_t e, uint64_t k, uint64_t l, uint64_t ok, uint64_t *ol);
	65
	66	int rld_extend(const rld_t e, const rldintv_t ik, rldintv_t ok[6], int is_back);
	67
	68	#ifdef __cplusplus
	69	}
	70	#endif
	71
	72	#define rld_last_blk(e) ((e)->n_bytes>>3>>(e)->sbits<<(e)->sbits)
	73	#define rld_seek_blk(e, k) ((e)->z[(k)>>RLD_LBITS] + ((k)&RLD_LMASK))
	74	#define rld_get_stail(e, itr) ((itr)->shead + (e)->ssize - ((itr)->shead + (e)->ssize - *(itr)->i == RLD_LSIZE? 2 : 1))
	75
	76	#define rld_block_type(x) ((uint64_t)(x)>>62)
	77
	78	static inline int64_t rld_dec0(const rld_t e, rlditr_t itr, int *c)
	79	{
	80	int w;
	81	uint64_t x;
	82	int64_t l, y = 0;
	83	x = itr->p[0] << (64 - itr->r) \| (itr->p != itr->stail && itr->r != 64? itr->p[1] >> itr->r : 0);
	84	if (x>>63 == 0) {
	85	if ((w = 0x333333335555779bll>>(x>>59<<2)&0xf) == 0xb && x>>58 == 0) return 0;
	86	l = (x >> (64 - w)) - 1;
	87	y = x << w >> (64 - l) \| 1u << l;
	88	w += l;
	89	} else w = y = 1;
	90	*c = x << w >> (64 - e->abits);
	91	w += e->abits;
	92	if (itr->r > w) itr->r -= w;
	93	else ++itr->p, itr->r = 64 + itr->r - w;
	94	return y;
	95	}
	96
	97	static inline int64_t rld_dec(const rld_t e, rlditr_t itr, int *_c, int is_free)
	98	{
	99	int64_t l = rld_dec0(e, itr, _c);
	100	if (l == 0 \|\| *_c > e->asize) {
	101	uint64_t last = rld_last_blk(e);
	102	if (itr->p - *itr->i > RLD_LSIZE - e->ssize) {
	103	if (is_free) {
	104	free(itr->i); itr->i = 0;
	105	}
	106	itr->shead = *++itr->i;
	107	} else itr->shead += e->ssize;
	108	if (itr->shead == rld_seek_blk(e, last)) return -1;
	109	itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)];
	110	itr->q = (uint8_t*)itr->p;
	111	itr->stail = rld_get_stail(e, itr);
	112	itr->r = 64;
	113	return rld_dec0(e, itr, _c);
	114	} else return l;
	115	}
	116
	117	// take k symbols from e0 and write it to e
	118	static inline void rld_dec_enc(rld_t e, rlditr_t itr, const rld_t e0, rlditr_t itr0, int64_t k)
	119	{
	120	if (itr0->l >= k) { // there are more pending symbols
	121	rld_enc(e, itr, k, itr0->c);
	122	itr0->l -= k; // l - k symbols remains
	123	} else { // use up all pending symbols
	124	int c = -1; // to please gcc
	125	int64_t l;
	126	rld_enc(e, itr, itr0->l, itr0->c); // write all pending symbols
	127	k -= itr0->l;
	128	for (; k > 0; k -= l) { // we always go into this loop because l0<k
	129	l = rld_dec(e0, itr0, &c, 1);
	130	rld_enc(e, itr, k < l? k : l, c);
	131	}
	132	itr0->l = -k; itr0->c = c;
	133	}
	134	}
	135
	136	#endif

+191

-0

third_party/fermi-lite-0.1/rle.c less more

	0	#include <string.h>
	1	#include <assert.h>
	2	#include <stdlib.h>
	3	#include <stdio.h>
	4	#include "rle.h"
	5
	6	const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 };
	7
	8	// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase
	9	int rle_insert_cached(uint8_t block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int beg, int64_t bc[6])
	10	{
	11	uint16_t nptr = (uint16_t)block;
	12	int diff;
	13
	14	block += 2; // skip the first 2 counting bytes
	15	if (*nptr == 0) {
	16	memset(cnt, 0, 48);
	17	diff = rle_enc1(block, a, rl);
	18	} else {
	19	uint8_t p, end = block + nptr, q;
	20	int64_t pre, z, l = 0, tot, beg_l;
	21	int c = -1, n_bytes = 0, n_bytes2, t = 0;
	22	uint8_t tmp[24];
	23	beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5];
	24	tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
	25	if (x < beg_l) {
	26	beg_l = 0, *beg = 0;
	27	memset(bc, 0, 48);
	28	}
	29	if (x == beg_l) {
	30	p = q = block + (*beg); z = beg_l;
	31	memcpy(cnt, bc, 48);
	32	} else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward
	33	z = beg_l; p = block + (*beg);
	34	memcpy(cnt, bc, 48);
	35	while (z < x) {
	36	rle_dec1(p, c, l);
	37	z += l; cnt[c] += l;
	38	}
	39	for (q = p - 1; *q>>6 == 2; --q);
	40	} else { // backward
	41	memcpy(cnt, ec, 48);
	42	z = tot; p = end;
	43	while (z >= x) {
	44	--p;
	45	if (*p>>6 != 2) {
	46	l \|= p>>7? (int64_t)rle_auxtab[p>>3&7]>>4 << t : *p>>3;
	47	z -= l; cnt[*p&7] -= l;
	48	l = 0; t = 0;
	49	} else {
	50	l \|= (*p&0x3fL) << t;
	51	t += 6;
	52	}
	53	}
	54	q = p;
	55	rle_dec1(p, c, l);
	56	z += l; cnt[c] += l;
	57	}
	58	*beg = q - block;
	59	memcpy(bc, cnt, 48);
	60	bc[c] -= l;
	61	n_bytes = p - q;
	62	if (x == z && a != c && p < end) { // then try the next run
	63	int tc;
	64	int64_t tl;
	65	q = p;
	66	rle_dec1(q, tc, tl);
	67	if (a == tc)
	68	c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl;
	69	}
	70	if (z != x) cnt[c] -= z - x;
	71	pre = x - (z - l); p -= n_bytes;
	72	if (a == c) { // insert to the same run
	73	n_bytes2 = rle_enc1(tmp, c, l + rl);
	74	} else if (x == z) { // at the end; append to the existing run
	75	p += n_bytes; n_bytes = 0;
	76	n_bytes2 = rle_enc1(tmp, a, rl);
	77	} else { // break the current run
	78	n_bytes2 = rle_enc1(tmp, c, pre);
	79	n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl);
	80	n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre);
	81	}
	82	if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed
	83	memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes);
	84	memcpy(p, tmp, n_bytes2);
	85	diff = n_bytes2 - n_bytes;
	86	}
	87	return (*nptr += diff);
	88	}
	89
	90	int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6])
	91	{
	92	int beg = 0;
	93	int64_t bc[6];
	94	memset(bc, 0, 48);
	95	return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc);
	96	}
	97
	98	void rle_split(uint8_t block, uint8_t new_block)
	99	{
	100	int n = (uint16_t)block;
	101	uint8_t end = block + 2 + n, q = block + 2 + (n>>1);
	102	while (*q>>6 == 2) --q;
	103	memcpy(new_block + 2, q, end - q);
	104	(uint16_t)new_block = end - q;
	105	(uint16_t)block = q - block - 2;
	106	}
	107
	108	void rle_count(const uint8_t *block, int64_t cnt[6])
	109	{
	110	const uint8_t q = block + 2, end = q + (uint16_t)block;
	111	while (q < end) {
	112	int c;
	113	int64_t l;
	114	rle_dec1(q, c, l);
	115	cnt[c] += l;
	116	}
	117	}
	118
	119	void rle_print(const uint8_t *block, int expand)
	120	{
	121	const uint16_t p = (const uint16_t)block;
	122	const uint8_t q = block + 2, end = block + 2 + *p;
	123	while (q < end) {
	124	int c;
	125	int64_t l, x;
	126	rle_dec1(q, c, l);
	127	if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]);
	128	else printf("%c%ld", "$ACGTN"[c], (long)l);
	129	}
	130	putchar('\n');
	131	}
	132
	133	void rle_rank2a(const uint8_t block, int64_t x, int64_t y, int64_t cx, int64_t *cy, const int64_t ec[6])
	134	{
	135	int a;
	136	int64_t tot, cnt[6];
	137	const uint8_t *p;
	138
	139	y = y >= x? y : x;
	140	tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
	141	if (tot == 0) return;
	142	if (x <= (tot - y) + (tot>>3)) {
	143	int c = 0;
	144	int64_t l, z = 0;
	145	memset(cnt, 0, 48);
	146	p = block + 2;
	147	while (z < x) {
	148	rle_dec1(p, c, l);
	149	z += l; cnt[c] += l;
	150	}
	151	for (a = 0; a != 6; ++a) cx[a] += cnt[a];
	152	cx[c] -= z - x;
	153	if (cy) {
	154	while (z < y) {
	155	rle_dec1(p, c, l);
	156	z += l; cnt[c] += l;
	157	}
	158	for (a = 0; a != 6; ++a) cy[a] += cnt[a];
	159	cy[c] -= z - y;
	160	}
	161	} else {
	162	#define move_backward(_x) \
	163	while (z >= (_x)) { \
	164	--p; \
	165	if (*p>>6 != 2) { \
	166	l \|= p>>7? (int64_t)rle_auxtab[p>>3&7]>>4 << t : *p>>3; \
	167	z -= l; cnt[*p&7] -= l; \
	168	l = 0; t = 0; \
	169	} else { \
	170	l \|= (*p&0x3fL) << t; \
	171	t += 6; \
	172	} \
	173	} \
	174
	175	int t = 0;
	176	int64_t l = 0, z = tot;
	177	memcpy(cnt, ec, 48);
	178	p = block + 2 + (const uint16_t)block;
	179	if (cy) {
	180	move_backward(y)
	181	for (a = 0; a != 6; ++a) cy[a] += cnt[a];
	182	cy[*p&7] += y - z;
	183	}
	184	move_backward(x)
	185	for (a = 0; a != 6; ++a) cx[a] += cnt[a];
	186	cx[*p&7] += x - z;
	187
	188	#undef move_backward
	189	}
	190	}

+77

-0

third_party/fermi-lite-0.1/rle.h less more

	0	#ifndef RLE6_H_
	1	#define RLE6_H_
	2
	3	#include <stdint.h>
	4
	5	#ifdef __GNUC__
	6	#define LIKELY(x) __builtin_expect((x),1)
	7	#else
	8	#define LIKELY(x) (x)
	9	#endif
	10	#ifdef __cplusplus
	11
	12	extern "C" {
	13	#endif
	14
	15	int rle_insert_cached(uint8_t block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int beg, int64_t bc[6]);
	16	int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]);
	17	void rle_split(uint8_t block, uint8_t new_block);
	18	void rle_count(const uint8_t *block, int64_t cnt[6]);
	19	void rle_rank2a(const uint8_t block, int64_t x, int64_t y, int64_t cx, int64_t *cy, const int64_t ec[6]);
	20	#define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec)
	21
	22	void rle_print(const uint8_t *block, int expand);
	23
	24	#ifdef __cplusplus
	25	}
	26	#endif
	27
	28	/******************
	29	* 43+3 codec *
	30	******************/
	31
	32	extern const uint8_t rle_auxtab[8];
	33
	34	#define RLE_MIN_SPACE 18
	35	#define rle_nptr(block) ((uint16_t*)(block))
	36
	37	// decode one run (c,l) and move the pointer p
	38	#define rle_dec1(p, c, l) do { \
	39	(c) = *(p) & 7; \
	40	if (LIKELY((*(p)&0x80) == 0)) { \
	41	(l) = *(p)++ >> 3; \
	42	} else if (LIKELY(*(p)>>5 == 6)) { \
	43	(l) = (*(p)&0x18L)<<3L \| ((p)[1]&0x3fL); \
	44	(p) += 2; \
	45	} else { \
	46	int n = ((*(p)&0x10) >> 2) + 4; \
	47	(l) = *(p)++ >> 3 & 1; \
	48	while (--n) (l) = ((l)<<6) \| (*(p)++&0x3fL); \
	49	} \
	50	} while (0)
	51
	52	static inline int rle_enc1(uint8_t *p, int c, int64_t l)
	53	{
	54	if (l < 1LL<<4) {
	55	*p = l << 3 \| c;
	56	return 1;
	57	} else if (l < 1LL<<8) {
	58	*p = 0xC0 \| l >> 6 << 3 \| c;
	59	p[1] = 0x80 \| (l & 0x3f);
	60	return 2;
	61	} else if (l < 1LL<<19) {
	62	*p = 0xE0 \| l >> 18 << 3 \| c;
	63	p[1] = 0x80 \| (l >> 12 & 0x3f);
	64	p[2] = 0x80 \| (l >> 6 & 0x3f);
	65	p[3] = 0x80 \| (l & 0x3f);
	66	return 4;
	67	} else {
	68	int i, shift = 36;
	69	*p = 0xF0 \| l >> 42 << 3 \| c;
	70	for (i = 1; i < 8; ++i, shift -= 6)
	71	p[i] = 0x80 \| (l>>shift & 0x3f);
	72	return 8;
	73	}
	74	}
	75
	76	#endif

+219

-0

third_party/fermi-lite-0.1/rope.c less more

	0	#include <stdlib.h>
	1	#include <string.h>
	2	#include <assert.h>
	3	#include <stdio.h>
	4	#include <zlib.h>
	5	#include "rle.h"
	6	#include "rope.h"
	7
	8	/*******************
	9	* Memory Pool *
	10	*******************/
	11
	12	#define MP_CHUNK_SIZE 0x100000 // 1MB per chunk
	13
	14	typedef struct { // memory pool for fast and compact memory allocation (no free)
	15	int size, i, n_elems;
	16	int64_t top, max;
	17	uint8_t **mem;
	18	} mempool_t;
	19
	20	static mempool_t *mp_init(int size)
	21	{
	22	mempool_t *mp;
	23	mp = calloc(1, sizeof(mempool_t));
	24	mp->size = size;
	25	mp->i = mp->n_elems = MP_CHUNK_SIZE / size;
	26	mp->top = -1;
	27	return mp;
	28	}
	29
	30	static void mp_destroy(mempool_t *mp)
	31	{
	32	int64_t i;
	33	for (i = 0; i <= mp->top; ++i) free(mp->mem[i]);
	34	free(mp->mem); free(mp);
	35	}
	36
	37	static inline void mp_alloc(mempool_t mp)
	38	{
	39	if (mp->i == mp->n_elems) {
	40	if (++mp->top == mp->max) {
	41	mp->max = mp->max? mp->max<<1 : 1;
	42	mp->mem = realloc(mp->mem, sizeof(void) mp->max);
	43	}
	44	mp->mem[mp->top] = calloc(mp->n_elems, mp->size);
	45	mp->i = 0;
	46	}
	47	return mp->mem[mp->top] + (mp->i++) * mp->size;
	48	}
	49
	50	/***************
	51	* B+ rope *
	52	***************/
	53
	54	rope_t *rope_init(int max_nodes, int block_len)
	55	{
	56	rope_t *rope;
	57	rope = calloc(1, sizeof(rope_t));
	58	if (block_len < 32) block_len = 32;
	59	rope->max_nodes = (max_nodes+ 1)>>1<<1;
	60	rope->block_len = (block_len + 7) >> 3 << 3;
	61	rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes);
	62	rope->leaf = mp_init(rope->block_len);
	63	rope->root = mp_alloc(rope->node);
	64	rope->root->n = 1;
	65	rope->root->is_bottom = 1;
	66	rope->root->p = mp_alloc(rope->leaf);
	67	return rope;
	68	}
	69
	70	void rope_destroy(rope_t *rope)
	71	{
	72	mp_destroy(rope->node);
	73	mp_destroy(rope->leaf);
	74	free(rope);
	75	}
	76
	77	static inline rpnode_t split_node(rope_t rope, rpnode_t u, rpnode_t v)
	78	{ // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u
	79	int j, i = v - u;
	80	rpnode_t *w; // $w is the sibling of $v
	81	if (u == 0) { // only happens at the root; add a new root
	82	u = v = mp_alloc(rope->node);
	83	v->n = 1; v->p = rope->root; // the new root has the old root as the only child
	84	memcpy(v->c, rope->c, 48);
	85	for (j = 0; j < 6; ++j) v->l += v->c[j];
	86	rope->root = v;
	87	}
	88	if (i != u->n - 1) // then make room for a new node
	89	memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1));
	90	++u->n; w = v + 1;
	91	memset(w, 0, sizeof(rpnode_t));
	92	w->p = mp_alloc(u->is_bottom? rope->leaf : rope->node);
	93	if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node
	94	uint8_t p = (uint8_t)v->p, q = (uint8_t)w->p;
	95	rle_split(p, q);
	96	rle_count(q, w->c);
	97	} else { // $v->p is a node, not a string
	98	rpnode_t p = v->p, q = w->p; // $v and $w are siblings and thus $p and $q are cousins
	99	p->n -= rope->max_nodes>>1;
	100	memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1));
	101	q->n = rope->max_nodes>>1; // NB: this line must below memcpy() as $q->n and $q->is_bottom are modified by memcpy()
	102	q->is_bottom = p->is_bottom;
	103	for (i = 0; i < q->n; ++i)
	104	for (j = 0; j < 6; ++j)
	105	w->c[j] += q[i].c[j];
	106	}
	107	for (j = 0; j < 6; ++j) // compute $w->l and update $v->c
	108	w->l += w->c[j], v->c[j] -= w->c[j];
	109	v->l -= w->l; // update $v->c
	110	return v;
	111	}
	112
	113	int64_t rope_insert_run(rope_t rope, int64_t x, int a, int64_t rl, rpcache_t cache)
	114	{ // insert $a after $x symbols in $rope and the returns rank(a, x)
	115	rpnode_t u = 0, v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket
	116	int64_t y = 0, z = 0, cnt[6];
	117	int n_runs;
	118	do { // top-down update. Searching and node splitting are done together in one pass.
	119	if (p->n == rope->max_nodes) { // node is full; split
	120	v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root
	121	if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v
	122	y += v->l, z += v->c[a], ++v, p = v->p;
	123	}
	124	u = p;
	125	if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend
	126	p += p->n - 1; y += v->l; z += v->c[a];
	127	for (; y >= x; --p) y -= p->l, z -= p->c[a];
	128	++p;
	129	} else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly
	130	assert(p - u < u->n);
	131	if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split
	132	v = p; p = p->p; // descend
	133	} while (!u->is_bottom);
	134	rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts
	135	if (cache) {
	136	if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t));
	137	n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc);
	138	cache->p = (uint8_t*)p;
	139	} else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c);
	140	z += cnt[a];
	141	v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work
	142	if (n_runs + RLE_MIN_SPACE > rope->block_len) {
	143	split_node(rope, u, v);
	144	if (cache) memset(cache, 0, sizeof(rpcache_t));
	145	}
	146	return z;
	147	}
	148
	149	static rpnode_t rope_count_to_leaf(const rope_t rope, int64_t x, int64_t cx[6], int64_t *rest)
	150	{
	151	rpnode_t u, v = 0, *p = rope->root;
	152	int64_t y = 0;
	153	int a;
	154
	155	memset(cx, 0, 48);
	156	do {
	157	u = p;
	158	if (v && x - y > v->l>>1) {
	159	p += p->n - 1; y += v->l;
	160	for (a = 0; a != 6; ++a) cx[a] += v->c[a];
	161	for (; y >= x; --p) {
	162	y -= p->l;
	163	for (a = 0; a != 6; ++a) cx[a] -= p->c[a];
	164	}
	165	++p;
	166	} else {
	167	for (; y + p->l < x; ++p) {
	168	y += p->l;
	169	for (a = 0; a != 6; ++a) cx[a] += p->c[a];
	170	}
	171	}
	172	v = p; p = p->p;
	173	} while (!u->is_bottom);
	174	*rest = x - y;
	175	return v;
	176	}
	177
	178	void rope_rank2a(const rope_t rope, int64_t x, int64_t y, int64_t cx, int64_t *cy)
	179	{
	180	rpnode_t *v;
	181	int64_t rest;
	182	v = rope_count_to_leaf(rope, x, cx, &rest);
	183	if (y < x \|\| cy == 0) {
	184	rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
	185	} else if (rest + (y - x) <= v->l) {
	186	memcpy(cy, cx, 48);
	187	rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c);
	188	} else {
	189	rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
	190	v = rope_count_to_leaf(rope, y, cy, &rest);
	191	rle_rank1a((const uint8_t*)v->p, rest, cy, v->c);
	192	}
	193	}
	194
	195	/*********************
	196	* Rope iterator *
	197	*********************/
	198
	199	void rope_itr_first(const rope_t rope, rpitr_t i)
	200	{
	201	memset(i, 0, sizeof(rpitr_t));
	202	i->rope = rope;
	203	for (i->pa[i->d] = rope->root; !i->pa[i->d]->is_bottom;) // descend to the leftmost leaf
	204	++i->d, i->pa[i->d] = i->pa[i->d - 1]->p;
	205	}
	206
	207	const uint8_t rope_itr_next_block(rpitr_t i)
	208	{
	209	const uint8_t *ret;
	210	assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall
	211	if (i->d < 0) return 0;
	212	ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p;
	213	while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking
	214	if (i->d >= 0)
	215	while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf
	216	++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p;
	217	return ret;
	218	}

+54

-0

third_party/fermi-lite-0.1/rope.h less more

	0	#ifndef ROPE_H_
	1	#define ROPE_H_
	2
	3	#include <stdint.h>
	4	#include <stdio.h>
	5
	6	#define ROPE_MAX_DEPTH 80
	7	#define ROPE_DEF_MAX_NODES 64
	8	#define ROPE_DEF_BLOCK_LEN 512
	9
	10	typedef struct rpnode_s {
	11	struct rpnode_s *p; // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs)
	12	uint64_t l:54, n:9, is_bottom:1; // $n and $is_bottom are only set for the first node in a bucket
	13	int64_t c[6]; // marginal counts
	14	} rpnode_t;
	15
	16	typedef struct {
	17	int32_t max_nodes, block_len; // both MUST BE even numbers
	18	int64_t c[6]; // marginal counts
	19	rpnode_t *root;
	20	void node, leaf; // memory pool
	21	} rope_t;
	22
	23	typedef struct {
	24	const rope_t *rope; // the rope
	25	const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes
	26	int ia[ROPE_MAX_DEPTH]; // index in the parent nodes
	27	int d; // the current depth in the B+-tree
	28	} rpitr_t;
	29
	30	typedef struct {
	31	int beg;
	32	int64_t bc[6];
	33	uint8_t *p;
	34	} rpcache_t;
	35
	36	#ifdef __cplusplus
	37	extern "C" {
	38	#endif
	39
	40	rope_t *rope_init(int max_nodes, int block_len);
	41	void rope_destroy(rope_t *rope);
	42	int64_t rope_insert_run(rope_t rope, int64_t x, int a, int64_t rl, rpcache_t cache);
	43	void rope_rank2a(const rope_t rope, int64_t x, int64_t y, int64_t cx, int64_t *cy);
	44	#define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0)
	45
	46	void rope_itr_first(const rope_t rope, rpitr_t i);
	47	const uint8_t rope_itr_next_block(rpitr_t i);
	48
	49	#ifdef __cplusplus
	50	}
	51	#endif
	52
	53	#endif

third_party/fermi-lite-0.1/test/MT-simu.fq.gz less more

Binary diff not shown

+455

-0

third_party/fermi-lite-0.1/unitig.c less more

	0	#include <assert.h>
	1	#include <string.h>
	2	#include <math.h>
	3	#include "kvec.h"
	4	#include "kstring.h"
	5	#include "rld0.h"
	6	#include "mag.h"
	7	#include "internal.h"
	8
	9	/******************
	10	* From fermi *
	11	******************/
	12
	13	typedef struct { size_t n, m; int32_t *a; } fm32s_v;
	14	typedef struct { size_t n, m; rldintv_t *a; } rldintv_v;
	15
	16	static uint64_t utg_primes[] = { 123457, 234571, 345679, 456791, 567899, 0 };
	17
	18	#define fm6_comp(a) ((a) >= 1 && (a) <= 4? 5 - (a) : (a))
	19	#define fm6_set_intv(e, c, ik) ((ik).x[0] = (e)->cnt[(int)(c)], (ik).x[2] = (e)->cnt[(int)(c)+1] - (e)->cnt[(int)(c)], (ik).x[1] = (e)->cnt[fm6_comp(c)], (ik).info = 0)
	20
	21	int rld_extend0(const rld_t e, const rldintv_t ik, rldintv_t *ok0, int is_back)
	22	{ // FIXME: this can be accelerated a little by using rld_rank1a() when ik.x[2]==1
	23	uint64_t tk[6], tl[6];
	24	rld_rank2a(e, ik->x[!is_back], ik->x[!is_back] + ik->x[2], tk, tl);
	25	ok0->x[!is_back] = tk[0];
	26	ok0->x[is_back] = ik->x[is_back];
	27	ok0->x[2] = tl[0] - tk[0];
	28	return 0;
	29	}
	30
	31	uint64_t fm6_retrieve(const rld_t e, uint64_t x, kstring_t s, rldintv_t k2, int contained)
	32	{
	33	uint64_t k = x, ok[6];
	34	rldintv_t ok2[6];
	35	s->l = 0; *contained = 0;
	36	while (1) {
	37	int c = rld_rank1a(e, k + 1, ok);
	38	k = e->cnt[c] + ok[c] - 1;
	39	if (c == 0) break;
	40	if (s->l > 0) {
	41	if (k2->x[2] == 1) k2->x[0] = k;
	42	else {
	43	rld_extend(e, k2, ok2, 1);
	44	*k2 = ok2[c];
	45	}
	46	} else fm6_set_intv(e, c, *k2);
	47	kputc(c, s);
	48	}
	49	if (k2->x[2] != 1) {
	50	rld_extend(e, k2, ok2, 1);
	51	if (ok2[0].x[2] != k2->x[2]) *contained \|= 1; // left contained
	52	*k2 = ok2[0];
	53	} else k2->x[0] = k;
	54	rld_extend(e, k2, ok2, 0);
	55	if (ok2[0].x[2] != k2->x[2]) *contained \|= 2; // right contained
	56	*k2 = ok2[0];
	57	return k;
	58	}
	59
	60	/*****************
	61	* Main body *
	62	*****************/
	63
	64	#define info_lt(a, b) ((a).info < (b).info)
	65
	66	#include "ksort.h"
	67	KSORT_INIT(infocmp, rldintv_t, info_lt)
	68
	69	static inline void set_bit(uint64_t *bits, uint64_t x)
	70	{
	71	uint64_t *p = bits + (x>>6);
	72	uint64_t z = 1LLU<<(x&0x3f);
	73	__sync_fetch_and_or(p, z);
	74	}
	75
	76	static inline void set_bits(uint64_t bits, const rldintv_t p)
	77	{
	78	uint64_t k;
	79	for (k = 0; k < p->x[2]; ++k) {
	80	set_bit(bits, p->x[0] + k);
	81	set_bit(bits, p->x[1] + k);
	82	}
	83	}
	84
	85	static rldintv_t overlap_intv(const rld_t e, int len, const uint8_t seq, int min, int j, int at5, rldintv_v *p, int inc_sentinel)
	86	{ // requirement: seq[j] matches the end of a read
	87	int c, depth, dir, end;
	88	rldintv_t ik, ok[6];
	89	p->n = 0;
	90	dir = at5? 1 : -1; // at5 is true iff we start from the 5'-end of a read
	91	end = at5? len : -1;
	92	c = seq[j];
	93	fm6_set_intv(e, c, ik);
	94	for (depth = 1, j += dir; j != end; j += dir, ++depth) {
	95	c = at5? fm6_comp(seq[j]) : seq[j];
	96	rld_extend(e, &ik, ok, !at5);
	97	if (!ok[c].x[2]) break; // cannot be extended
	98	if (depth >= min && ok[0].x[2]) {
	99	if (inc_sentinel) {
	100	ok[0].info = j - dir;
	101	kv_push(rldintv_t, *p, ok[0]);
	102	} else {
	103	ik.info = j - dir;
	104	kv_push(rldintv_t, *p, ik);
	105	}
	106	}
	107	ik = ok[c];
	108	}
	109	kv_reverse(rldintv_t, *p, 0); // reverse the array such that the smallest interval comes first
	110	return ik;
	111	}
	112
	113	typedef struct {
	114	const rld_t *e;
	115	int min_match, min_merge_len;
	116	rldintv_v a[2], nei;
	117	fm32s_v cat;
	118	uint64_t used, bend;
	119	kstring_t str;
	120	uint64_t n, sum, sum2, unpaired;
	121	} aux_t;
	122
	123	int fm6_is_contained(const rld_t e, int min_match, const kstring_t s, rldintv_t intv, rldintv_v ovlp)
	124	{ // for s is a sequence in e, test if s is contained in other sequences in e; return intervals right overlapping with s
	125	rldintv_t ik, ok[6];
	126	int ret = 0;
	127	assert(s->l > min_match);
	128	ovlp->n = 0;
	129	ik = overlap_intv(e, s->l, (uint8_t*)s->s, min_match, s->l - 1, 0, ovlp, 0);
	130	rld_extend(e, &ik, ok, 1); assert(ok[0].x[2]);
	131	if (ik.x[2] != ok[0].x[2]) ret = -1; // the sequence is left contained
	132	ik = ok[0];
	133	rld_extend(e, &ik, ok, 0); assert(ok[0].x[2]);
	134	if (ik.x[2] != ok[0].x[2]) ret = -1; // the sequence is right contained
	135	*intv = ok[0];
	136	return ret;
	137	}
	138
	139	int fm6_get_nei(const rld_t e, int min_match, int beg, kstring_t s, rldintv_v *nei, // input and output variables
	140	rldintv_v prev, rldintv_v curr, fm32s_v *cat, // temporary arrays
	141	uint64_t *used) // optional info
	142	{
	143	int ori_l = s->l, j, i, c, rbeg, is_forked = 0;
	144	rldintv_v *swap;
	145	rldintv_t ok[6], ok0;
	146
	147	curr->n = nei->n = cat->n = 0;
	148	if (prev->n == 0) { // when this routine is called for the seed, prev may filled by fm6_is_contained()
	149	overlap_intv(e, s->l - beg, (uint8_t*)s->s + beg, min_match, s->l - beg - 1, 0, prev, 0);
	150	if (prev->n == 0) return -1; // no overlap
	151	for (j = 0; j < prev->n; ++j) prev->a[j].info += beg;
	152	}
	153	kv_resize(int, *cat, prev->m);
	154	for (j = 0; j < prev->n; ++j) cat->a[j] = 0; // only one interval; all point to 0
	155	while (prev->n) {
	156	for (j = 0, curr->n = 0; j < prev->n; ++j) {
	157	rldintv_t *p = &prev->a[j];
	158	if (cat->a[j] < 0) continue;
	159	rld_extend(e, p, ok, 0); // forward extension
	160	if (ok[0].x[2] && ori_l != s->l) { // some (partial) reads end here
	161	rld_extend0(e, &ok[0], &ok0, 1); // backward extension to look for sentinels
	162	if (ok0.x[2]) { // the match is bounded by sentinels - a full-length match
	163	if (ok[0].x[2] == p->x[2] && p->x[2] == ok0.x[2]) { // never consider a read contained in another read
	164	int cat0 = cat->a[j]; // a category approximately corresponds to one neighbor, though not always
	165	assert(j == 0 \|\| cat->a[j] > cat->a[j-1]); // otherwise not irreducible
	166	ok0.info = ori_l - (p->info&0xffffffffU);
	167	for (i = j; i < prev->n && cat->a[i] == cat0; ++i) cat->a[i] = -1; // mask out other intervals of the same cat
	168	kv_push(rldintv_t, *nei, ok0); // keep in the neighbor vector
	169	continue; // no need to go through for(c); do NOT set "used" as this neighbor may be rejected later
	170	} else if (used) set_bits(used, &ok0); // the read is contained in another read; mark it as used
	171	}
	172	} // ~if(ok[0].x[2])
	173	if (cat->a[j] < 0) continue; // no need to proceed if we have finished this path
	174	for (c = 1; c < 5; ++c) // collect extensible intervals
	175	if (ok[c].x[2]) {
	176	rld_extend0(e, &ok[c], &ok0, 1);
	177	if (ok0.x[2]) { // do not extend intervals whose left end is not bounded by a sentinel
	178	ok[c].info = (p->info&0xfffffff0ffffffffLLU) \| (uint64_t)c<<32;
	179	kv_push(rldintv_t, *curr, ok[c]);
	180	}
	181	}
	182	} // ~for(j)
	183	if (curr->n) { // update category
	184	uint32_t last, cat0;
	185	kv_resize(int, *cat, curr->m);
	186	c = curr->a[0].info>>32&0xf;
	187	kputc(fm6_comp(c), s);
	188	ks_introsort(infocmp, curr->n, curr->a);
	189	last = curr->a[0].info >> 32;
	190	cat->a[0] = 0;
	191	curr->a[0].info &= 0xffffffff;
	192	for (j = 1, cat0 = 0; j < curr->n; ++j) { // this loop recalculate cat
	193	if (curr->a[j].info>>32 != last)
	194	last = curr->a[j].info>>32, cat0 = j;
	195	cat->a[j] = cat0;
	196	curr->a[j].info = (curr->a[j].info&0xffffffff) \| (uint64_t)cat0<<36;
	197	}
	198	if (cat0 != 0) is_forked = 1;
	199	}
	200	swap = curr; curr = prev; prev = swap; // swap curr and prev
	201	} // ~while(prev->n)
	202	if (nei->n == 0) return -1; // no overlap
	203	rbeg = ori_l - (uint32_t)nei->a[0].info;
	204	if (nei->n == 1 && is_forked) { // this may happen if there are contained reads; fix this
	205	fm6_set_intv(e, 0, ok0);
	206	for (i = rbeg; i < ori_l; ++i) {
	207	rld_extend(e, &ok0, ok, 0);
	208	ok0 = ok[fm6_comp(s->s[i])];
	209	}
	210	for (i = ori_l; i < s->l; ++i) {
	211	int c0 = -1;
	212	rld_extend(e, &ok0, ok, 0);
	213	for (c = 1, j = 0; c < 5; ++c)
	214	if (ok[c].x[2] && ok[c].x[0] <= nei->a[0].x[0] && ok[c].x[0] + ok[c].x[2] >= nei->a[0].x[0] + nei->a[0].x[2])
	215	++j, c0 = c;
	216	if (j == 0 && ok[0].x[2]) break;
	217	assert(j == 1);
	218	s->s[i] = fm6_comp(c0);
	219	ok0 = ok[c0];
	220	}
	221	s->l = i; s->s[s->l] = 0;
	222	}
	223	if (nei->n > 1) s->l = ori_l, s->s[s->l] = 0;
	224	return rbeg;
	225	}
	226
	227	static int try_right(aux_t a, int beg, kstring_t s)
	228	{
	229	return fm6_get_nei(a->e, a->min_match, beg, s, &a->nei, &a->a[0], &a->a[1], &a->cat, a->used);
	230	}
	231
	232	static int check_left_simple(aux_t a, int beg, int rbeg, const kstring_t s)
	233	{
	234	rldintv_t ok[6];
	235	rldintv_v prev = &a->a[0], curr = &a->a[1], *swap;
	236	int i, j;
	237
	238	overlap_intv(a->e, s->l, (uint8_t*)s->s, a->min_match, rbeg, 1, prev, 1);
	239	for (i = rbeg - 1; i >= beg; --i) {
	240	for (j = 0, curr->n = 0; j < prev->n; ++j) {
	241	rldintv_t *p = &prev->a[j];
	242	rld_extend(a->e, p, ok, 1);
	243	if (ok[0].x[2]) set_bits(a->used, &ok[0]); // some reads end here; they must be contained in a longer read
	244	if (ok[0].x[2] + ok[(int)s->s[i]].x[2] != p->x[2]) return -1; // potential backward bifurcation
	245	kv_push(rldintv_t, *curr, ok[(int)s->s[i]]);
	246	}
	247	swap = curr; curr = prev; prev = swap;
	248	} // ~for(i)
	249	return 0;
	250	}
	251
	252	static int check_left(aux_t a, int beg, int rbeg, const kstring_t s)
	253	{
	254	int i, ret;
	255	rldintv_t tmp;
	256	assert(a->nei.n == 1);
	257	ret = check_left_simple(a, beg, rbeg, s);
	258	if (ret == 0) return 0;
	259	// when ret<0, the back fork may be caused by a contained read. we have to do more to confirm this.
	260	tmp = a->nei.a[0]; // backup the neighbour as it will be overwritten by try_right()
	261	a->a[0].n = a->a[1].n = a->nei.n = 0;
	262	ks_resize(&a->str, s->l - rbeg + 1);
	263	for (i = s->l - 1, a->str.l = 0; i >= rbeg; --i)
	264	a->str.s[a->str.l++] = fm6_comp(s->s[i]);
	265	a->str.s[a->str.l] = 0;
	266	try_right(a, 0, &a->str);
	267	assert(a->nei.n >= 1);
	268	ret = a->nei.n > 1? -1 : 0;
	269	a->nei.n = 1; a->nei.a[0] = tmp; // recover the original neighbour
	270	return ret;
	271	}
	272
	273	static int unitig_unidir(aux_t a, kstring_t s, kstring_t cov, int beg0, uint64_t k0, uint64_t end, int *is_loop)
	274	{
	275	int i, beg = beg0, rbeg, ori_l = s->l, n_reads = 0;
	276	*is_loop = 0;
	277	while ((rbeg = try_right(a, beg, s)) >= 0) { // loop if there is at least one overlap
	278	uint64_t k;
	279	if (a->nei.n > 1) { // forward bifurcation
	280	set_bit(a->bend, *end);
	281	break;
	282	}
	283	if ((k = a->nei.a[0].x[0]) == *end) break; // a loop like b>>c>>a><a; keep the link but stop extension
	284	if (((a->bend[k>>6]>>(k&0x3f)&1) \|\| check_left(a, beg, rbeg, s) < 0)) { // backward bifurcation
	285	set_bit(a->bend, k);
	286	break;
	287	}
	288	if (k == k0) { // a loop like a>>b>>c>>a
	289	*is_loop = 1;
	290	break;
	291	}
	292	if (a->nei.a[0].x[1] == *end) { // a loop like b>>c>>a>>a; cut the last link
	293	a->nei.n = 0;
	294	break;
	295	}
	296	if ((int)a->nei.a[0].info < a->min_merge_len) break; // the overlap is not long enough
	297	*end = a->nei.a[0].x[1];
	298	set_bits(a->used, &a->nei.a[0]); // successful extension
	299	++n_reads;
	300	if (cov->m < s->m) ks_resize(cov, s->m);
	301	cov->l = s->l; cov->s[cov->l] = 0;
	302	for (i = rbeg; i < ori_l; ++i) // update the coverage string
	303	if (cov->s[i] != '~') ++cov->s[i];
	304	for (i = ori_l; i < s->l; ++i) cov->s[i] = '"';
	305	beg = rbeg; ori_l = s->l; a->a[0].n = a->a[1].n = 0; // prepare for the next round of loop
	306	}
	307	cov->l = s->l = ori_l; s->s[ori_l] = cov->s[ori_l] = 0;
	308	return n_reads;
	309	}
	310
	311	static void copy_nei(ku128_v dst, const rldintv_v src)
	312	{
	313	int i;
	314	for (i = 0; i < src->n; ++i) {
	315	ku128_t z;
	316	z.x = src->a[i].x[0]; z.y = src->a[i].info;
	317	kv_push(ku128_t, *dst, z);
	318	}
	319	}
	320
	321	static int unitig1(aux_t a, int64_t seed, kstring_t s, kstring_t cov, uint64_t end[2], ku128_v nei[2], int n_reads)
	322	{
	323	rldintv_t intv0;
	324	int seed_len, ret, is_loop, contained;
	325	int64_t k;
	326	size_t i;
	327
	328	*n_reads = nei[0].n = nei[1].n = 0;
	329	if (a->used[seed>>6]>>(seed&0x3f)&1) return -2; // used
	330	// retrieve the sequence pointed by seed
	331	k = fm6_retrieve(a->e, seed, s, &intv0, &contained);
	332	seq_reverse(s->l, (uint8_t*)s->s);
	333	seed_len = s->l;
	334	// check contained status
	335	if (intv0.x[2] > 1 && k != intv0.x[0]) return -3; // duplicated, but not the first
	336	set_bits(a->used, &intv0);
	337	if (contained) return -3; // contained
	338	// check length, containment and if used before
	339	if (s->l <= a->min_match) return -1; // too short
	340	ret = fm6_is_contained(a->e, a->min_match, s, &intv0, &a->a[0]);
	341	*n_reads = 1;
	342	// initialize the coverage string
	343	if (cov->m < s->m) ks_resize(cov, s->m);
	344	cov->l = s->l; cov->s[cov->l] = 0;
	345	for (i = 0; i < cov->l; ++i) cov->s[i] = '"';
	346	// left-wards extension
	347	end[0] = intv0.x[1]; end[1] = intv0.x[0];
	348	if (a->a[0].n) { // no need to extend to the right if there is no overlap
	349	*n_reads += unitig_unidir(a, s, cov, 0, intv0.x[0], &end[0], &is_loop);
	350	copy_nei(&nei[0], &a->nei);
	351	if (is_loop) {
	352	ku128_t z;
	353	z.x = end[0]; z.y = a->nei.a[0].info;
	354	kv_push(ku128_t, nei[1], z);
	355	return 0;
	356	}
	357	}
	358	// right-wards extension
	359	a->a[0].n = a->a[1].n = a->nei.n = 0;
	360	seq_revcomp6(s->l, (uint8_t*)s->s); // reverse complement for extension in the other direction
	361	seq_reverse(cov->l, (uint8_t*)cov->s); // reverse the coverage
	362	*n_reads += unitig_unidir(a, s, cov, s->l - seed_len, intv0.x[1], &end[1], &is_loop);
	363	copy_nei(&nei[1], &a->nei);
	364	return 0;
	365	}
	366
	367	typedef struct {
	368	long max_l;
	369	aux_t a;
	370	kstring_t str, cov;
	371	magv_t z;
	372	magv_v v;
	373	} thrdat_t;
	374
	375	typedef struct {
	376	uint64_t prime, used, bend, *visited;
	377	const rld_t *e;
	378	thrdat_t *d;
	379	} worker_t;
	380
	381	static void worker(void *data, long _i, int tid)
	382	{
	383	worker_t w = (worker_t)data;
	384	thrdat_t *d = &w->d[tid];
	385	uint64_t i = (w->prime * _i) % w->e->mcnt[1];
	386	if (unitig1(&d->a, i, &d->str, &d->cov, d->z.k, d->z.nei, &d->z.nsr) >= 0) { // then we keep the unitig
	387	uint64_t *p[2], x[2];
	388	magv_t *q;
	389	p[0] = w->visited + (d->z.k[0]>>6); x[0] = 1LLU<<(d->z.k[0]&0x3f);
	390	p[1] = w->visited + (d->z.k[1]>>6); x[1] = 1LLU<<(d->z.k[1]&0x3f);
	391	if ((__sync_fetch_and_or(p[0], x[0])&x[0]) \|\| (__sync_fetch_and_or(p[1], x[1])&x[1])) return;
	392	d->z.len = d->str.l;
	393	if (d->max_l < d->str.m) {
	394	d->max_l = d->str.m;
	395	d->z.seq = realloc(d->z.seq, d->max_l);
	396	d->z.cov = realloc(d->z.cov, d->max_l);
	397	}
	398	memcpy(d->z.seq, d->str.s, d->z.len);
	399	memcpy(d->z.cov, d->cov.s, d->z.len + 1);
	400	kv_pushp(magv_t, d->v, &q);
	401	mag_v_copy_to_empty(q, &d->z);
	402	}
	403	}
	404
	405	mag_t fml_fmi2mag_core(const rld_t e, int min_match, int min_merge_len, int n_threads)
	406	{
	407	extern void kt_for(int n_threads, void (func)(void,long,int), void *data, long n);
	408	worker_t w;
	409	int j;
	410	mag_t *g;
	411
	412	w.used = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8);
	413	w.bend = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8);
	414	w.visited = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8);
	415	w.e = e;
	416	assert(e->mcnt[1] >= n_threads * 2);
	417	w.d = calloc(n_threads, sizeof(thrdat_t));
	418	w.prime = 0;
	419	for (j = 0; utg_primes[j] > 0; ++j)
	420	if (e->mcnt[1] % utg_primes[j] != 0) {
	421	w.prime = utg_primes[j];
	422	break;
	423	}
	424	assert(w.prime);
	425	for (j = 0; j < n_threads; ++j) {
	426	w.d[j].a.e = e; w.d[j].a.min_match = min_match; w.d[j].a.min_merge_len = min_merge_len;
	427	w.d[j].a.used = w.used; w.d[j].a.bend = w.bend;
	428	}
	429	kt_for(n_threads, worker, &w, e->mcnt[1]);
	430	g = (mag_t*)calloc(1, sizeof(mag_t));
	431	for (j = 0; j < n_threads; ++j) {
	432	kv_resize(magv_t, g->v, g->v.n + w.d[j].v.n);
	433	memcpy(g->v.a + g->v.n, w.d[j].v.a, w.d[j].v.n * sizeof(magv_t));
	434	g->v.n += w.d[j].v.n;
	435	free(w.d[j].v.a);
	436	free(w.d[j].a.a[0].a); free(w.d[j].a.a[1].a); free(w.d[j].a.nei.a); free(w.d[j].a.cat.a);
	437	free(w.d[j].z.nei[0].a); free(w.d[j].z.nei[1].a); free(w.d[j].z.seq); free(w.d[j].z.cov);
	438	free(w.d[j].a.str.s); free(w.d[j].str.s); free(w.d[j].cov.s);
	439	}
	440	free(w.d); free(w.used); free(w.bend); free(w.visited);
	441
	442	mag_g_build_hash(g);
	443	mag_g_amend(g);
	444	g->rdist = mag_cal_rdist(g);
	445	return g;
	446	}
	447
	448	mag_t fml_fmi2mag(const fml_opt_t opt, rld_t *e)
	449	{
	450	mag_t *g;
	451	g = fml_fmi2mag_core(e, opt->min_asm_ovlp, opt->min_merge_len, opt->n_threads);
	452	rld_destroy(e);
	453	return g;
	454	}

+23

-0

third_party/minimap-0.2/LICENSE.txt less more

	0	The MIT License
	1
	2	Copyright (c) 2015 Broad Institute
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.

+98

-0

third_party/minimap-0.2/README.md less more

	0	## Introduction
	1
	2	Minimap is an experimental tool to efficiently find multiple approximate
	3	mapping positions between two sets of long sequences, such as between reads and
	4	reference genomes, between genomes and between long noisy reads. By default, it
	5	is tuned to have high sensitivity to 2kb matches around 20% divergence but with
	6	low specificity. Minimap does not generate alignments as of now and because of
	7	this, it is usually tens of times faster than mainstream aligners. With four
	8	CPU cores, minimap can map 1.6Gbp PacBio reads to human in 2.5 minutes, 1Gbp
	9	PacBio E. coli reads to pre-indexed 9.6Gbp bacterial genomes in 3 minutes, to
	10	pre-indexed >100Gbp nt database in ~1 hour (of which ~20 minutes are spent on
	11	loading index from the network filesystem; peak RAM: 10GB), map 2800 bacteria
	12	to themselves in 1 hour, and map 1Gbp E. coli reads against themselves in a
	13	couple of minutes.
	14
	15	Minimap does not replace mainstream aligners, but it can be useful when you
	16	want to quickly identify long approximate matches at moderate divergence among
	17	a huge collection of sequences. For this task, it is much faster than most
	18	existing tools.
	19
	20	## Usage
	21
	22	* Map two sets of long sequences:
	23	```sh
	24	minimap target.fa.gz query.fa.gz > out.mini
	25	```
	26	The output is TAB-delimited with each line consisting of query name, length,
	27	0-based start, end, strand, target name, length, start, end, the number of
	28	matching bases, the number of co-linear minimizers in the match and the
	29	fraction of matching bases.
	30
	31	* All-vs-all PacBio read self-mapping for [miniasm][miniasm]:
	32	```sh
	33	minimap -Sw5 -L100 -m0 reads.fa reads.fa | gzip -1 > reads.paf.gz
	34	```
	35
	36	* Prebuild index and then map:
	37	```sh
	38	minimap -d target.mmi target.fa.gz
	39	minimap -l target.mmi query.fa.gz > out.mini
	40	```
	41	Minimap indexing is very fast (1 minute for human genome; 50 minutes for >100Gbp
	42	nt database retrieved on 2015-09-30), but for huge
	43	repeatedly used databases, prebuilding index is still preferred.
	44
	45	* Map sequences against themselves without diagonal matches:
	46	```sh
	47	minimap -S sequences.fa sequences.fa > self-match.mini
	48	```
	49	The output may still contain overlapping matches in repetitive regions.
	50
	51	## Algorithm Overview
	52
	53	1. Indexing. Collect all [(w,k)-minimizers][mini] in a batch (-I=4
	54	billion bp) of target sequences and store them in a hash table. Mark top
	55	-f=0.1% of most frequent minimizers as repeats. Minimap
	56	uses [invertible hash function][invhash] to avoid taking poly-A as
	57	minimizers.
	58
	59	2. For each query, collect all (w,k)-minimizers and look up the hash table for
	60	matches (q<sub>i</sub>,t<sub>i</sub>,s<sub>i</sub>), where
	61	q<sub>i</sub> is the query position, t<sub>i</sub> the target position
	62	and s<sub>i</sub> indicates whether the minimizer match is on the same
	63	strand.
	64
	65	3. For matches on the same strand, sort by {q<sub>i</sub>-t<sub>i</sub>}
	66	and then cluster matches within a -r=500bp window. Minimap merges
	67	two windows if -m=50% of minimizer matches overlap. For matches on different
	68	strands, sort {q<sub>i</sub>+t<sub>i</sub>} and apply a similar
	69	clustering procedure. This is inspired by the [Hough transformation][hough].
	70
	71	4. For each cluster, sort (q<sub>i</sub>,t<sub>i</sub>) by q<sub>i</sub>
	72	and solve a [longest increasing sequence problem][lis] for t<sub>i</sub>. This
	73	finds the longest co-linear matching chain. Break the chain whenever there
	74	is a gap longer than -g=10000.
	75
	76	5. Output the start and end of the chain if it contains -c=4 or more
	77	minimizer matches and the matching length is no less than -L=40.
	78
	79	6. Go to 1 and rewind to the first record of query if there are more target
	80	sequences; otherwise stop.
	81
	82	To increase sensitivity, we may decrease -w to index more minimizers;
	83	we may also decrease -k, though this may greatly impact performance for
	84	mammalian genomes.
	85
	86	Also note that by default, if the total length of target sequences is less than
	87	4Gbp (1G=1 billion; controlled by -I), minimap creates one index and streams
	88	all the query sequences in one go. The multiple hits of a query sequence are
	89	adjacent to each other in the output. If the total length is greater than
	90	4Gbp, minimap needs to read query sequences multiple times. The multiple hits
	91	of a query may not be adjacent.
	92
	93	[mini]: http://bioinformatics.oxfordjournals.org/content/20/18/3363.abstract
	94	[lis]: https://en.wikipedia.org/wiki/Longest_increasing_subsequence
	95	[hough]: https://en.wikipedia.org/wiki/Hough_transform
	96	[invhash]: https://gist.github.com/lh3/974ced188be2f90422cc
	97	[miniasm]: https://github.com/lh3/miniasm

+65

-0

third_party/minimap-0.2/bseq.c less more

	0	#include <zlib.h>
	1	#include <stdio.h>
	2	#include <stdlib.h>
	3	#include <string.h>
	4	#include <assert.h>
	5	#include "bseq.h"
	6	#include "kseq.h"
	7	KSEQ_INIT(gzFile, gzread)
	8
	9	extern unsigned char seq_nt4_table[256];
	10
	11	struct bseq_file_s {
	12	int is_eof;
	13	gzFile fp;
	14	kseq_t *ks;
	15	};
	16
	17	bseq_file_t bseq_open(const char fn)
	18	{
	19	bseq_file_t *fp;
	20	gzFile f;
	21	f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
	22	if (f == 0) return 0;
	23	fp = (bseq_file_t*)calloc(1, sizeof(bseq_file_t));
	24	fp->fp = f;
	25	fp->ks = kseq_init(fp->fp);
	26	return fp;
	27	}
	28
	29	void bseq_close(bseq_file_t *fp)
	30	{
	31	kseq_destroy(fp->ks);
	32	gzclose(fp->fp);
	33	free(fp);
	34	}
	35
	36	bseq1_t bseq_read(bseq_file_t fp, int chunk_size, int *n_)
	37	{
	38	int size = 0, m, n;
	39	bseq1_t *seqs;
	40	kseq_t *ks = fp->ks;
	41	m = n = 0; seqs = 0;
	42	while (kseq_read(ks) >= 0) {
	43	bseq1_t *s;
	44	assert(ks->seq.l <= INT32_MAX);
	45	if (n >= m) {
	46	m = m? m<<1 : 256;
	47	seqs = (bseq1_t)realloc(seqs, m sizeof(bseq1_t));
	48	}
	49	s = &seqs[n];
	50	s->name = strdup(ks->name.s);
	51	s->seq = strdup(ks->seq.s);
	52	s->l_seq = ks->seq.l;
	53	size += seqs[n++].l_seq;
	54	if (size >= chunk_size) break;
	55	}
	56	if (n == 0) fp->is_eof = 1;
	57	*n_ = n;
	58	return seqs;
	59	}
	60
	61	int bseq_eof(bseq_file_t *fp)
	62	{
	63	return fp->is_eof;
	64	}

+19

-0

third_party/minimap-0.2/bseq.h less more

	0	#ifndef MM_BSEQ_H
	1	#define MM_BSEQ_H
	2
	3	#include <stdint.h>
	4
	/* Opaque reader handle; the layout is private to bseq.c. */
	struct bseq_file_s;
	typedef struct bseq_file_s bseq_file_t;

	/* One sequence record as produced by bseq_read(). */
	typedef struct {
		int l_seq, rid;     /* sequence length; record id (not set by bseq_read) */
		char *name, *seq;   /* heap-allocated NUL-terminated name and bases */
	} bseq1_t;

	/* Open fn (NULL or "-" means stdin); returns 0 on failure. */
	bseq_file_t *bseq_open(const char *fn);
	/* Close the stream and free the handle. */
	void bseq_close(bseq_file_t *fp);
	/* Read records until about chunk_size bases; the count is stored in *n_. */
	bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_);
	/* Non-zero once bseq_read() has returned an empty batch. */
	int bseq_eof(bseq_file_t *fp);
	17
	18	#endif

+52

-0

third_party/minimap-0.2/example.c less more

	0	// To compile:
	1	// gcc -g -O2 example.c libminimap.a -lz
	2
	3	#include <stdlib.h>
	4	#include <assert.h>
	5	#include <stdio.h>
	6	#include <zlib.h>
	7	#include "minimap.h"
	8	#include "kseq.h"
	9	KSEQ_INIT(gzFile, gzread)
	10
	11	int main(int argc, char *argv[])
	12	{
	13	if (argc < 3) {
	14	fprintf(stderr, "Usage: minimap-lite <target.fa> <query.fa>\n");
	15	return 1;
	16	}
	17
	18	// open query file for reading; you may use your favorite FASTA/Q parser
	19	gzFile f = gzopen(argv[2], "r");
	20	assert(f);
	21	kseq_t *ks = kseq_init(f);
	22
	23	// create index for target; we are creating one index for all target sequence
	24	int n_threads = 4, w = 10, k = 15;
	25	mm_idx_t *mi = mm_idx_build(argv[1], w, k, n_threads);
	26	assert(mi);
	27
	28	// mapping
	29	mm_mapopt_t opt;
	30	mm_mapopt_init(&opt); // initialize mapping parameters
	31	mm_tbuf_t *tbuf = mm_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread
	32	while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence
	33	const mm_reg1_t *reg;
	34	int j, n_reg;
	35	// get all hits for the query
	36	reg = mm_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &opt, 0);
	37	// traverse hits and print them out
	38	for (j = 0; j < n_reg; ++j) {
	39	const mm_reg1_t *r = &reg[j];
	40	printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
	41	printf("%s\t%d\t%d\t%d\t%d\t%d\n", mi->name[r->rid], mi->len[r->rid], r->rs, r->re, r->len, r->cnt);
	42	}
	43	}
	44	mm_tbuf_destroy(tbuf);
	45
	46	// deallocate index and close the query file
	47	mm_idx_destroy(mi);
	48	kseq_destroy(ks);
	49	gzclose(f);
	50	return 0;
	51	}

+352

-0

third_party/minimap-0.2/index.c less more

	0	#include <stdlib.h>
	1	#include <assert.h>
	2	#include <stdio.h>
	3	#include "minimap.h"
	4	#include "kvec.h"
	5	#include "khash.h"
	6
	7	#define idx_hash(a) ((a)>>1)
	8	#define idx_eq(a, b) ((a)>>1 == (b)>>1)
	9	KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
	10	typedef khash_t(idx) idxhash_t;
	11
	12	void kt_for(int n_threads, void (func)(void,long,int), void *data, long n);
	13
	14	mm_idx_t *mm_idx_init(int w, int k, int b)
	15	{
	16	mm_idx_t *mi;
	17	if (k2 < b) b = k 2;
	18	if (w < 1) w = 1;
	19	mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
	20	mi->w = w, mi->k = k, mi->b = b;
	21	mi->max_occ = UINT32_MAX;
	22	mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
	23	return mi;
	24	}
	25
	26	void mm_idx_destroy(mm_idx_t *mi)
	27	{
	28	int i;
	29	if (mi == 0) return;
	30	for (i = 0; i < 1<<mi->b; ++i) {
	31	free(mi->B[i].p);
	32	free(mi->B[i].a.a);
	33	kh_destroy(idx, (idxhash_t*)mi->B[i].h);
	34	}
	35	free(mi->B);
	36	if (mi->name)
	37	for (i = 0; i < mi->n; ++i) free(mi->name[i]);
	38	free(mi->len); free(mi->name);
	39	free(mi);
	40	}
	41
	42	const uint64_t mm_idx_get(const mm_idx_t mi, uint64_t minier, int *n)
	43	{
	44	int mask = (1<<mi->b) - 1;
	45	khint_t k;
	46	mm_idx_bucket_t *b = &mi->B[minier&mask];
	47	idxhash_t h = (idxhash_t)b->h;
	48	*n = 0;
	49	if (h == 0) return 0;
	50	k = kh_get(idx, h, minier>>mi->b<<1);
	51	if (k == kh_end(h)) return 0;
	52	if (kh_key(h, k)&1) {
	53	*n = 1;
	54	return &kh_val(h, k);
	55	} else {
	56	*n = (uint32_t)kh_val(h, k);
	57	return &b->p[kh_val(h, k)>>32];
	58	}
	59	}
	60
	61	uint32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
	62	{
	63	int i;
	64	size_t n = 0;
	65	uint32_t thres;
	66	khint_t *a, k;
	67	if (f <= 0.) return UINT32_MAX;
	68	for (i = 0; i < 1<<mi->b; ++i)
	69	if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
	70	a = (uint32_t)malloc(n 4);
	71	for (i = n = 0; i < 1<<mi->b; ++i) {
	72	idxhash_t h = (idxhash_t)mi->B[i].h;
	73	if (h == 0) continue;
	74	for (k = 0; k < kh_end(h); ++k) {
	75	if (!kh_exist(h, k)) continue;
	76	a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
	77	}
	78	}
	79	thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1;
	80	free(a);
	81	return thres;
	82	}
	83
	84	void mm_idx_set_max_occ(mm_idx_t *mi, float f)
	85	{
	86	mi->freq_thres = f;
	87	mi->max_occ = mm_idx_cal_max_occ(mi, f);
	88	}
	89
	90	/*********************************
	91	* Sort and generate hash tables *
	92	*********************************/
	93
	94	static void worker_post(void *g, long i, int tid)
	95	{
	96	int j, start_a, start_p, n, n_keys;
	97	idxhash_t *h;
	98	mm_idx_t mi = (mm_idx_t)g;
	99	mm_idx_bucket_t *b = &mi->B[i];
	100	if (b->a.n == 0) return;
	101
	102	// sort by minimizer
	103	radix_sort_128x(b->a.a, b->a.a + b->a.n);
	104
	105	// count and preallocate
	106	for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
	107	if (j == b->a.n \|\| b->a.a[j].x != b->a.a[j-1].x) {
	108	++n_keys;
	109	if (n > 1) b->n += n;
	110	n = 1;
	111	} else ++n;
	112	}
	113	h = kh_init(idx);
	114	kh_resize(idx, h, n_keys);
	115	b->p = (uint64_t*)calloc(b->n, 8);
	116
	117	// create the hash table
	118	for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
	119	if (j == b->a.n \|\| b->a.a[j].x != b->a.a[j-1].x) {
	120	khint_t itr;
	121	int absent;
	122	mm128_t *p = &b->a.a[j-1];
	123	itr = kh_put(idx, h, p->x>>mi->b<<1, &absent);
	124	assert(absent && j - start_a == n);
	125	if (n == 1) {
	126	kh_key(h, itr) \|= 1;
	127	kh_val(h, itr) = p->y;
	128	} else {
	129	int k;
	130	for (k = 0; k < n; ++k)
	131	b->p[start_p + k] = b->a.a[start_a + k].y;
	132	kh_val(h, itr) = (uint64_t)start_p<<32 \| n;
	133	start_p += n;
	134	}
	135	start_a = j, n = 1;
	136	} else ++n;
	137	}
	138	b->h = h;
	139	assert(b->n == start_p);
	140
	141	// deallocate and clear b->a
	142	free(b->a.a);
	143	b->a.n = b->a.m = 0, b->a.a = 0;
	144	}
	145
	146	static void mm_idx_post(mm_idx_t *mi, int n_threads)
	147	{
	148	kt_for(n_threads, worker_post, mi, 1<<mi->b);
	149	}
	150
	151	/******************
	152	* Generate index *
	153	******************/
	154
	155	#include <string.h>
	156	#include <zlib.h>
	157	#include "bseq.h"
	158
	159	void kt_pipeline(int n_threads, void (func)(void, int, void), void *shared_data, int n_steps);
	160
	161	typedef struct {
	162	int tbatch_size, n_processed, keep_name;
	163	bseq_file_t *fp;
	164	uint64_t ibatch_size, n_read;
	165	mm_idx_t *mi;
	166	} pipeline_t;
	167
	168	typedef struct {
	169	int n_seq;
	170	bseq1_t *seq;
	171	mm128_v a;
	172	} step_t;
	173
	174	static void mm_idx_add(mm_idx_t mi, int n, const mm128_t a)
	175	{
	176	int i, mask = (1<<mi->b) - 1;
	177	for (i = 0; i < n; ++i) {
	178	mm128_v *p = &mi->B[a[i].x&mask].a;
	179	kv_push(mm128_t, *p, a[i]);
	180	}
	181	}
	182
	183	static void worker_pipeline(void shared, int step, void *in)
	184	{
	185	int i;
	186	pipeline_t p = (pipeline_t)shared;
	187	if (step == 0) { // step 0: read sequences
	188	step_t *s;
	189	if (p->n_read > p->ibatch_size) return 0;
	190	s = (step_t*)calloc(1, sizeof(step_t));
	191	s->seq = bseq_read(p->fp, p->tbatch_size, &s->n_seq);
	192	if (s->seq) {
	193	uint32_t old_m = p->mi->n, m, n;
	194	assert((uint64_t)p->n_processed + s->n_seq <= INT32_MAX);
	195	m = n = p->mi->n + s->n_seq;
	196	kroundup32(m); kroundup32(old_m);
	197	if (old_m != m) {
	198	if (p->keep_name)
	199	p->mi->name = (char*)realloc(p->mi->name, m sizeof(char*));
	200	p->mi->len = (int)realloc(p->mi->len, m sizeof(int));
	201	}
	202	for (i = 0; i < s->n_seq; ++i) {
	203	if (p->keep_name) {
	204	assert(strlen(s->seq[i].name) <= 254);
	205	p->mi->name[p->mi->n] = strdup(s->seq[i].name);
	206	}
	207	p->mi->len[p->mi->n++] = s->seq[i].l_seq;
	208	s->seq[i].rid = p->n_processed++;
	209	p->n_read += s->seq[i].l_seq;
	210	}
	211	return s;
	212	} else free(s);
	213	} else if (step == 1) { // step 1: compute sketch
	214	step_t s = (step_t)in;
	215	for (i = 0; i < s->n_seq; ++i) {
	216	bseq1_t *t = &s->seq[i];
	217	mm_sketch(t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, &s->a);
	218	free(t->seq); free(t->name);
	219	}
	220	free(s->seq); s->seq = 0;
	221	return s;
	222	} else if (step == 2) { // dispatch sketch to buckets
	223	step_t s = (step_t)in;
	224	mm_idx_add(p->mi, s->a.n, s->a.a);
	225	free(s->a.a); free(s);
	226	}
	227	return 0;
	228	}
	229
	230	mm_idx_t mm_idx_gen(bseq_file_t fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name)
	231	{
	232	pipeline_t pl;
	233	memset(&pl, 0, sizeof(pipeline_t));
	234	pl.tbatch_size = tbatch_size;
	235	pl.keep_name = keep_name;
	236	pl.ibatch_size = ibatch_size;
	237	pl.fp = fp;
	238	if (pl.fp == 0) return 0;
	239	pl.mi = mm_idx_init(w, k, b);
	240
	241	kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
	242	if (mm_verbose >= 3)
	243	fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
	244
	245	mm_idx_post(pl.mi, n_threads);
	246	if (mm_verbose >= 3)
	247	fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
	248
	249	return pl.mi;
	250	}
	251
	252	mm_idx_t mm_idx_build(const char fn, int w, int k, int n_threads) // a simpler interface
	253	{
	254	bseq_file_t *fp;
	255	mm_idx_t *mi;
	256	fp = bseq_open(fn);
	257	if (fp == 0) return 0;
	258	mi = mm_idx_gen(fp, w, k, MM_IDX_DEF_B, 1<<18, n_threads, UINT64_MAX, 1);
	259	mm_idx_set_max_occ(mi, 0.001);
	260	bseq_close(fp);
	261	return mi;
	262	}
	263
	264	/*************
	265	* index I/O *
	266	*************/
	267
	268	#define MM_IDX_MAGIC "MMI\1"
	269
	270	void mm_idx_dump(FILE fp, const mm_idx_t mi)
	271	{
	272	uint32_t x[6];
	273	int i;
	274	x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n, x[4] = mi->name? 1 : 0, x[5] = mi->max_occ;
	275	fwrite(MM_IDX_MAGIC, 1, 4, fp);
	276	fwrite(x, 4, 6, fp);
	277	fwrite(&mi->freq_thres, sizeof(float), 1, fp);
	278	fwrite(mi->len, 4, mi->n, fp);
	279	if (mi->name) {
	280	for (i = 0; i < mi->n; ++i) {
	281	uint8_t l;
	282	l = strlen(mi->name[i]);
	283	fwrite(&l, 1, 1, fp);
	284	fwrite(mi->name[i], 1, l, fp);
	285	}
	286	}
	287	for (i = 0; i < 1<<mi->b; ++i) {
	288	mm_idx_bucket_t *b = &mi->B[i];
	289	khint_t k;
	290	idxhash_t h = (idxhash_t)b->h;
	291	uint32_t size = h? h->size : 0;
	292	fwrite(&b->n, 4, 1, fp);
	293	fwrite(b->p, 8, b->n, fp);
	294	fwrite(&size, 4, 1, fp);
	295	if (size == 0) continue;
	296	for (k = 0; k < kh_end(h); ++k) {
	297	uint64_t x[2];
	298	if (!kh_exist(h, k)) continue;
	299	x[0] = kh_key(h, k), x[1] = kh_val(h, k);
	300	fwrite(x, 8, 2, fp);
	301	}
	302	}
	303	}
	304
	305	mm_idx_t mm_idx_load(FILE fp)
	306	{
	307	int i;
	308	char magic[4];
	309	uint32_t x[6];
	310	mm_idx_t *mi;
	311	if (fread(magic, 1, 4, fp) != 4) return 0;
	312	if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
	313	if (fread(x, 4, 6, fp) != 6) return 0;
	314	mi = mm_idx_init(x[0], x[1], x[2]);
	315	mi->n = x[3], mi->max_occ = x[5];
	316	fread(&mi->freq_thres, sizeof(float), 1, fp);
	317	mi->len = (int32_t)malloc(mi->n 4);
	318	fread(mi->len, 4, mi->n, fp);
	319	if (x[4]) { // has names
	320	mi->name = (char*)calloc(mi->n, sizeof(char));
	321	for (i = 0; i < mi->n; ++i) {
	322	uint8_t l;
	323	fread(&l, 1, 1, fp);
	324	mi->name[i] = (char*)malloc(l + 1);
	325	fread(mi->name[i], 1, l, fp);
	326	mi->name[i][l] = 0;
	327	}
	328	}
	329	for (i = 0; i < 1<<mi->b; ++i) {
	330	mm_idx_bucket_t *b = &mi->B[i];
	331	uint32_t j, size;
	332	khint_t k;
	333	idxhash_t *h;
	334	fread(&b->n, 4, 1, fp);
	335	b->p = (uint64_t)malloc(b->n 8);
	336	fread(b->p, 8, b->n, fp);
	337	fread(&size, 4, 1, fp);
	338	if (size == 0) continue;
	339	b->h = h = kh_init(idx);
	340	kh_resize(idx, h, size);
	341	for (j = 0; j < size; ++j) {
	342	uint64_t x[2];
	343	int absent;
	344	fread(x, 8, 2, fp);
	345	k = kh_put(idx, h, x[0], &absent);
	346	assert(absent);
	347	kh_val(h, k) = x[1];
	348	}
	349	}
	350	return mi;
	351	}

+128

-0

third_party/minimap-0.2/kdq.h less more

	0	#ifndef __AC_KDQ_H
	1	#define __AC_KDQ_H
	2
	3	#include <stdlib.h>
	4	#include <string.h>
	5
	/*
	 * Declares kdq_<type>_t, a double-ended queue stored in a circular
	 * buffer of 1<<bits elements: `front` is the index of the first
	 * element, `count` the number of stored elements and `mask` equals
	 * (1<<bits)-1.
	 */
	#define __KDQ_TYPE(type) \
		typedef struct { \
			size_t front:58, bits:6; \
			size_t count, mask; \
			type *a; \
		} kdq_##type##_t;

	/* Type name of the queue for `type`. */
	#define kdq_t(type) kdq_##type##_t
	/* Number of stored elements. */
	#define kdq_size(q) ((q)->count)
	/* First element (lvalue); the queue must not be empty. */
	#define kdq_first(q) ((q)->a[(q)->front])
	/* Last element (lvalue); the queue must not be empty. */
	#define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask])
	/* i-th element counted from the front (lvalue). */
	#define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask])
	17
	/*
	 * Defines the queue functions for `type`, each with storage class SCOPE:
	 *   kdq_init_<type>()         allocate an empty queue with room for 4 elements
	 *   kdq_destroy_<type>(q)     free the queue; NULL is ignored
	 *   kdq_resize_<type>(q, b)   change capacity to 1<<b (raised when too small
	 *                             for the stored elements); returns the new bits
	 *   kdq_pushp_/kdq_push_      append at the back (pointer to slot / by value)
	 *   kdq_unshiftp_/kdq_unshift_ insert at the front (pointer to slot / by value)
	 *   kdq_pop_                  remove from the back; returns its slot or 0 if empty
	 *   kdq_shift_                remove from the front; returns its slot or 0 if empty
	 * Pointers returned by pop/shift refer to storage inside the buffer.
	 *
	 * NOTE(review): in kdq_resize the final `new_bits < q->bits` test runs
	 * after q->bits has been set to new_bits, so it is never true and the
	 * buffer is not reallocated when shrinking. Left unchanged on purpose.
	 */
	#define __KDQ_IMPL(type, SCOPE) \
		SCOPE kdq_##type##_t *kdq_init_##type() \
		{ \
			kdq_##type##_t *q; \
			q = (kdq_##type##_t*)calloc(1, sizeof(kdq_##type##_t)); \
			q->bits = 2, q->mask = (1ULL<<q->bits) - 1; \
			q->a = (type*)malloc((1<<q->bits) * sizeof(type)); \
			return q; \
		} \
		SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \
		{ \
			if (q == 0) return; \
			free(q->a); free(q); \
		} \
		SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \
		{ \
			size_t new_size = 1ULL<<new_bits, old_size = 1ULL<<q->bits; \
			if (new_size < q->count) { /* not big enough */ \
				int i; \
				for (i = 0; i < 64; ++i) \
					if (1ULL<<i > q->count) break; \
				new_bits = i, new_size = 1ULL<<new_bits; \
			} \
			if (new_bits == q->bits) return q->bits; /* unchanged */ \
			if (new_bits > q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); \
			if (q->front + q->count <= old_size) { /* unwrapped */ \
				if (q->front + q->count > new_size) /* only happens for shrinking */ \
					memmove(q->a, q->a + new_size, (q->front + q->count - new_size) * sizeof(type)); \
			} else { /* wrapped */ \
				memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \
				q->front = new_size - (old_size - q->front); \
			} \
			q->bits = new_bits, q->mask = (1ULL<<q->bits) - 1; \
			if (new_bits < q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); \
			return q->bits; \
		} \
		SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \
		{ \
			if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
			return &q->a[((q->count++) + q->front) & (q)->mask]; \
		} \
		SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \
		{ \
			if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
			q->a[((q->count++) + q->front) & (q)->mask] = v; \
		} \
		SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \
		{ \
			if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
			++q->count; \
			q->front = q->front? q->front - 1 : (1ULL<<q->bits) - 1; \
			return &q->a[q->front]; \
		} \
		SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \
		{ \
			type *p; \
			p = kdq_unshiftp_##type(q); \
			*p = v; \
		} \
		SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \
		{ \
			return q->count? &q->a[((--q->count) + q->front) & q->mask] : 0; \
		} \
		SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \
		{ \
			type *d = 0; \
			if (q->count == 0) return 0; \
			d = &q->a[q->front++]; \
			q->front &= q->mask; \
			--q->count; \
			return d; \
		}
	90
	/* Emit both the queue type and its functions, with the given storage class. */
	#define KDQ_INIT2(type, SCOPE) \
		__KDQ_TYPE(type) \
		__KDQ_IMPL(type, SCOPE)

	/* Attribute that silences "unused function" warnings on GCC/clang >= 3. */
	#ifndef klib_unused
	#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
	#define klib_unused __attribute__ ((__unused__))
	#else
	#define klib_unused
	#endif
	#endif /* klib_unused */

	/* Usual entry point: file-local inline queue functions for `type`. */
	#define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused)

	/* Declare the type and prototypes only, for use with a non-static KDQ_INIT2 elsewhere. */
	#define KDQ_DECLARE(type) \
		__KDQ_TYPE(type) \
		kdq_##type##_t *kdq_init_##type(); \
		void kdq_destroy_##type(kdq_##type##_t *q); \
		int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \
		type *kdq_pushp_##type(kdq_##type##_t *q); \
		void kdq_push_##type(kdq_##type##_t *q, type v); \
		type *kdq_unshiftp_##type(kdq_##type##_t *q); \
		void kdq_unshift_##type(kdq_##type##_t *q, type v); \
		type *kdq_pop_##type(kdq_##type##_t *q); \
		type *kdq_shift_##type(kdq_##type##_t *q);

	/* Type-generic front ends that dispatch to the per-type functions. */
	#define kdq_init(type) kdq_init_##type()
	#define kdq_destroy(type, q) kdq_destroy_##type(q)
	#define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits)
	#define kdq_pushp(type, q) kdq_pushp_##type(q)
	#define kdq_push(type, q, v) kdq_push_##type(q, v)
	#define kdq_pop(type, q) kdq_pop_##type(q)
	#define kdq_unshiftp(type, q) kdq_unshiftp_##type(q)
	#define kdq_unshift(type, q, v) kdq_unshift_##type(q, v)
	#define kdq_shift(type, q) kdq_shift_##type(q)
	126
	127	#endif

+619

-0

third_party/minimap-0.2/khash.h less more

	0	/* The MIT License
	1
	2	Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	/*
	26	An example:
	27
	28	#include "khash.h"
	29	KHASH_MAP_INIT_INT(32, char)
	30	int main() {
	31	int ret, is_missing;
	32	khiter_t k;
	33	khash_t(32) *h = kh_init(32);
	34	k = kh_put(32, h, 5, &ret);
	35	kh_value(h, k) = 10;
	36	k = kh_get(32, h, 10);
	37	is_missing = (k == kh_end(h));
	38	k = kh_get(32, h, 5);
	39	kh_del(32, h, k);
	40	for (k = kh_begin(h); k != kh_end(h); ++k)
	41	if (kh_exist(h, k)) kh_value(h, k) = 1;
	42	kh_destroy(32, h);
	43	return 0;
	44	}
	45	*/
	46
	47	/*
	48	2013-05-02 (0.2.8):
	49
	50	* Use quadratic probing. When the capacity is power of 2, stepping function
	51	i*(i+1)/2 guarantees to traverse each bucket. It is better than double
	52	hashing on cache performance and is more robust than linear probing.
	53
	54	In theory, double hashing should be more robust than quadratic probing.
	55	However, my implementation is probably not for large hash tables, because
	56	the second hash function is closely tied to the first hash function,
	57	which reduces the effectiveness of double hashing.
	58
	59	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
	60
	61	2011-12-29 (0.2.7):
	62
	63	* Minor code clean up; no actual effect.
	64
	65	2011-09-16 (0.2.6):
	66
	67	* The capacity is a power of 2. This seems to dramatically improve the
	68	speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
	69
	70	- http://code.google.com/p/ulib/
	71	- http://nothings.org/computer/judy/
	72
	73	* Allow to optionally use linear probing which usually has better
	74	performance for random input. Double hashing is still the default as it
	75	is more robust to certain non-random input.
	76
	77	* Added Wang's integer hash function (not used by default). This hash
	78	function is more robust to certain non-random input.
	79
	80	2011-02-14 (0.2.5):
	81
	82	* Allow to declare global functions.
	83
	84	2009-09-26 (0.2.4):
	85
	86	* Improve portability
	87
	88	2008-09-19 (0.2.3):
	89
	90	* Corrected the example
	91	* Improved interfaces
	92
	93	2008-09-11 (0.2.2):
	94
	95	* Improved speed a little in kh_put()
	96
	97	2008-09-10 (0.2.1):
	98
	99	* Added kh_clear()
	100	* Fixed a compiling error
	101
	102	2008-09-02 (0.2.0):
	103
	104	* Changed to token concatenation which increases flexibility.
	105
	106	2008-08-31 (0.1.2):
	107
	108	* Fixed a bug in kh_get(), which has not been tested previously.
	109
	110	2008-08-31 (0.1.1):
	111
	112	* Added destructor
	113	*/
	114
	115
	116	#ifndef __AC_KHASH_H
	117	#define __AC_KHASH_H
	118
	119	/*!
	120	@header
	121
	122	Generic hash table library.
	123	*/
	124
	125	#define AC_VERSION_KHASH_H "0.2.8"
	126
	127	#include <stdlib.h>
	128	#include <string.h>
	129	#include <limits.h>
	130
	131	/* compiler specific configuration */
	132
	/* 32-bit unsigned type: whichever of unsigned int / unsigned long is 32 bits wide. */
	#if UINT_MAX == 0xffffffffu
	typedef unsigned int khint32_t;
	#elif ULONG_MAX == 0xffffffffu
	typedef unsigned long khint32_t;
	#endif

	/* 64-bit unsigned type: unsigned long when it is as wide as unsigned long long. */
	#if ULONG_MAX != ULLONG_MAX
	typedef unsigned long long khint64_t;
	#else
	typedef unsigned long khint64_t;
	#endif

	/* Spelling of the inline keyword; may be predefined by the including file. */
	#if !defined(kh_inline)
	#if defined(_MSC_VER)
	#define kh_inline __inline
	#else
	#define kh_inline inline
	#endif
	#endif /* kh_inline */

	/* Bucket index / iterator types used throughout the API. */
	typedef khint32_t khint_t;
	typedef khint_t khiter_t;
	155
	/*
	 * Per-bucket state lives in a packed array with two bits per bucket
	 * (16 buckets per 32-bit word): bit 1 = empty, bit 0 = deleted.
	 */
	#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
	#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
	#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
	#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
	#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
	#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
	#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))

	/* Number of 32-bit flag words needed for m buckets. */
	#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)

	/* Round a 32-bit value up to the next power of two, in place. */
	#ifndef kroundup32
	#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
	#endif

	/* Allocator hooks; define them before including this header to override. */
	#ifndef kcalloc
	#define kcalloc(N,Z) calloc(N,Z)
	#endif
	#ifndef kmalloc
	#define kmalloc(Z) malloc(Z)
	#endif
	#ifndef krealloc
	#define krealloc(P,Z) realloc(P,Z)
	#endif
	#ifndef kfree
	#define kfree(P) free(P)
	#endif

	/* Fraction of buckets that may be occupied before kh_put() resizes the table. */
	static const double __ac_HASH_UPPER = 0.77;
	184
	/*
	 * Declares kh_<name>_t: bucket count, number of live elements, number of
	 * occupied (live or deleted) buckets, the resize threshold, the packed
	 * flag words and the parallel key/value arrays.
	 */
	#define __KHASH_TYPE(name, khkey_t, khval_t) \
		typedef struct kh_##name##_s { \
			khint_t n_buckets, size, n_occupied, upper_bound; \
			khint32_t *flags; \
			khkey_t *keys; \
			khval_t *vals; \
		} kh_##name##_t;

	/* Prototypes matching the functions that __KHASH_IMPL defines. */
	#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
		extern kh_##name##_t *kh_init_##name(void); \
		extern void kh_destroy_##name(kh_##name##_t *h); \
		extern void kh_clear_##name(kh_##name##_t *h); \
		extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
		extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
		extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
		extern void kh_del_##name(kh_##name##_t *h, khint_t x);
	201
	/*
	 * Defines the hash-table functions for instance `name`, each with
	 * storage class SCOPE:
	 *   kh_init_    allocate an empty, zeroed table
	 *   kh_destroy_ free the table and its arrays; NULL is ignored
	 *   kh_clear_   mark every bucket empty without freeing memory
	 *   kh_get_     return the bucket of `key`, or n_buckets when absent
	 *   kh_resize_  rehash into new_n_buckets (rounded up to a power of two,
	 *               minimum 4); returns 0 on success, -1 on allocation failure
	 *   kh_put_     insert `key`; *ret is 0 if already present, 1 if stored
	 *               in an empty bucket, 2 if stored in a deleted bucket and
	 *               -1 when a needed resize failed
	 *   kh_del_     mark bucket x as deleted
	 * Collisions are resolved with the triangular probe sequence
	 * i, i+1, i+3, i+6, ... modulo the table size.
	 */
	#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
		SCOPE kh_##name##_t *kh_init_##name(void) { \
			return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
		} \
		SCOPE void kh_destroy_##name(kh_##name##_t *h) \
		{ \
			if (h) { \
				kfree((void *)h->keys); kfree(h->flags); \
				kfree((void *)h->vals); \
				kfree(h); \
			} \
		} \
		SCOPE void kh_clear_##name(kh_##name##_t *h) \
		{ \
			if (h && h->flags) { \
				memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
				h->size = h->n_occupied = 0; \
			} \
		} \
		SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
		{ \
			if (h->n_buckets) { \
				khint_t k, i, last, mask, step = 0; \
				mask = h->n_buckets - 1; \
				k = __hash_func(key); i = k & mask; \
				last = i; \
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					i = (i + (++step)) & mask; \
					if (i == last) return h->n_buckets; \
				} \
				return __ac_iseither(h->flags, i)? h->n_buckets : i; \
			} else return 0; \
		} \
		SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
		{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
			khint32_t *new_flags = 0; \
			khint_t j = 1; \
			{ \
				kroundup32(new_n_buckets); \
				if (new_n_buckets < 4) new_n_buckets = 4; \
				if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
				else { /* hash table size to be changed (shrink or expand); rehash */ \
					new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
					if (!new_flags) return -1; \
					memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
					if (h->n_buckets < new_n_buckets) { /* expand */ \
						khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
						if (!new_keys) { kfree(new_flags); return -1; } \
						h->keys = new_keys; \
						if (kh_is_map) { \
							khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
							if (!new_vals) { kfree(new_flags); return -1; } \
							h->vals = new_vals; \
						} \
					} /* otherwise shrink */ \
				} \
			} \
			if (j) { /* rehashing is needed */ \
				for (j = 0; j != h->n_buckets; ++j) { \
					if (__ac_iseither(h->flags, j) == 0) { \
						khkey_t key = h->keys[j]; \
						khval_t val; \
						khint_t new_mask; \
						new_mask = new_n_buckets - 1; \
						if (kh_is_map) val = h->vals[j]; \
						__ac_set_isdel_true(h->flags, j); \
						while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
							khint_t k, i, step = 0; \
							k = __hash_func(key); \
							i = k & new_mask; \
							while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
							__ac_set_isempty_false(new_flags, i); \
							if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
								{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
								if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
								__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
							} else { /* write the element and jump out of the loop */ \
								h->keys[i] = key; \
								if (kh_is_map) h->vals[i] = val; \
								break; \
							} \
						} \
					} \
				} \
				if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
					h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
				} \
				kfree(h->flags); /* free the working space */ \
				h->flags = new_flags; \
				h->n_buckets = new_n_buckets; \
				h->n_occupied = h->size; \
				h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
			} \
			return 0; \
		} \
		SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
		{ \
			khint_t x; \
			if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
				if (h->n_buckets > (h->size<<1)) { \
					if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
						*ret = -1; return h->n_buckets; \
					} \
				} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
					*ret = -1; return h->n_buckets; \
				} \
			} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
			{ \
				khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
				x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
				if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
				else { \
					last = i; \
					while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
						if (__ac_isdel(h->flags, i)) site = i; \
						i = (i + (++step)) & mask; \
						if (i == last) { x = site; break; } \
					} \
					if (x == h->n_buckets) { \
						if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
						else x = i; \
					} \
				} \
			} \
			if (__ac_isempty(h->flags, x)) { /* not present at all */ \
				h->keys[x] = key; \
				__ac_set_isboth_false(h->flags, x); \
				++h->size; ++h->n_occupied; \
				*ret = 1; \
			} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
				h->keys[x] = key; \
				__ac_set_isboth_false(h->flags, x); \
				++h->size; \
				*ret = 2; \
			} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
			return x; \
		} \
		SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
		{ \
			if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
				__ac_set_isdel_true(h->flags, x); \
				--h->size; \
			} \
		}
	347
	/* Declare the table type and the function prototypes for hash table `name`
	 * without emitting any function bodies. */
	348	#define KHASH_DECLARE(name, khkey_t, khval_t) \
	349	__KHASH_TYPE(name, khkey_t, khval_t) \
	350	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
	351
	/* Define the table type and the full implementation for hash table `name`;
	 * SCOPE is placed in front of every generated function. */
	352	#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	353	__KHASH_TYPE(name, khkey_t, khval_t) \
	354	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
	355
	/* Same as KHASH_INIT2 with the generated functions declared `static kh_inline`. */
	356	#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	357	KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
	358
	359	/* --- BEGIN OF HASH FUNCTIONS --- */
	360
	361	/*! @function
	362	@abstract Integer hash function
	363	@param key The integer [khint32_t]
	364	@return The hash value [khint_t]
	365	*/
	366	#define kh_int_hash_func(key) (khint32_t)(key)
	367	/*! @function
	368	@abstract Integer comparison function
	369	*/
	370	#define kh_int_hash_equal(a, b) ((a) == (b))
	371	/*! @function
	372	@abstract 64-bit integer hash function
	373	@param key The integer [khint64_t]
	374	@return The hash value [khint_t]
	375	*/
	376	#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
	377	/*! @function
	378	@abstract 64-bit integer comparison function
	379	*/
	380	#define kh_int64_hash_equal(a, b) ((a) == (b))
	381	/*! @function
	382	@abstract const char* hash function
	383	@param s Pointer to a null terminated string
	384	@return The hash value
	385	*/
	386	static kh_inline khint_t __ac_X31_hash_string(const char *s)
	387	{
	388	khint_t h = (khint_t)*s;
	389	if (h) for (++s ; s; ++s) h = (h << 5) - h + (khint_t)s;
	390	return h;
	391	}
	392	/*! @function
	393	@abstract Another interface to const char* hash function
	394	@param key Pointer to a null terminated string [const char*]
	395	@return The hash value [khint_t]
	396	*/
	397	#define kh_str_hash_func(key) __ac_X31_hash_string(key)
	398	/*! @function
	399	@abstract Const char* comparison function
	400	*/
	401	#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
	402
	403	static kh_inline khint_t __ac_Wang_hash(khint_t key)
	404	{
	405	key += ~(key << 15);
	406	key ^= (key >> 10);
	407	key += (key << 3);
	408	key ^= (key >> 6);
	409	key += ~(key << 11);
	410	key ^= (key >> 16);
	411	return key;
	412	}
	/* Integer hash that runs the key through __ac_Wang_hash(). The argument is
	 * the macro parameter itself, not an identifier named `key` that happens
	 * to be in scope at the call site. */
	#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k))
	414
	415	/* --- END OF HASH FUNCTIONS --- */
	416
	417	/* Other convenient macros... */
	418
	419	/*!
	420	@abstract Type of the hash table.
	421	@param name Name of the hash table [symbol]
	422	*/
	423	#define khash_t(name) kh_##name##_t
	424
	425	/*! @function
	426	@abstract Initiate a hash table.
	427	@param name Name of the hash table [symbol]
	428	@return Pointer to the hash table [khash_t(name)*]
	429	*/
	430	#define kh_init(name) kh_init_##name()
	431
	432	/*! @function
	433	@abstract Destroy a hash table.
	434	@param name Name of the hash table [symbol]
	435	@param h Pointer to the hash table [khash_t(name)*]
	436	*/
	437	#define kh_destroy(name, h) kh_destroy_##name(h)
	438
	439	/*! @function
	440	@abstract Reset a hash table without deallocating memory.
	441	@param name Name of the hash table [symbol]
	442	@param h Pointer to the hash table [khash_t(name)*]
	443	*/
	444	#define kh_clear(name, h) kh_clear_##name(h)
	445
	446	/*! @function
	447	@abstract Resize a hash table.
	448	@param name Name of the hash table [symbol]
	449	@param h Pointer to the hash table [khash_t(name)*]
	450	@param s New size [khint_t]
	451	*/
	452	#define kh_resize(name, h, s) kh_resize_##name(h, s)
	453
	454	/*! @function
	455	@abstract Insert a key to the hash table.
	456	@param name Name of the hash table [symbol]
	457	@param h Pointer to the hash table [khash_t(name)*]
	458	@param k Key [type of keys]
	459	@param r Extra return code: -1 if the operation failed;
	460	0 if the key is present in the hash table;
	461	1 if the bucket is empty (never used); 2 if the element in
	462	the bucket has been deleted [int*]
	463	@return Iterator to the inserted element [khint_t]
	464	*/
	465	#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
	466
	467	/*! @function
	468	@abstract Retrieve a key from the hash table.
	469	@param name Name of the hash table [symbol]
	470	@param h Pointer to the hash table [khash_t(name)*]
	471	@param k Key [type of keys]
	472	@return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
	473	*/
	474	#define kh_get(name, h, k) kh_get_##name(h, k)
	475
	476	/*! @function
	477	@abstract Remove a key from the hash table.
	478	@param name Name of the hash table [symbol]
	479	@param h Pointer to the hash table [khash_t(name)*]
	480	@param k Iterator to the element to be deleted [khint_t]
	481	*/
	482	#define kh_del(name, h, k) kh_del_##name(h, k)
	483
	484	/*! @function
	485	@abstract Test whether a bucket contains data.
	486	@param h Pointer to the hash table [khash_t(name)*]
	487	@param x Iterator to the bucket [khint_t]
	488	@return 1 if containing data; 0 otherwise [int]
	489	*/
	490	#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
	491
	492	/*! @function
	493	@abstract Get key given an iterator
	494	@param h Pointer to the hash table [khash_t(name)*]
	495	@param x Iterator to the bucket [khint_t]
	496	@return Key [type of keys]
	497	*/
	498	#define kh_key(h, x) ((h)->keys[x])
	499
	500	/*! @function
	501	@abstract Get value given an iterator
	502	@param h Pointer to the hash table [khash_t(name)*]
	503	@param x Iterator to the bucket [khint_t]
	504	@return Value [type of values]
	505	@discussion For hash sets, calling this results in segfault.
	506	*/
	507	#define kh_val(h, x) ((h)->vals[x])
	508
	509	/*! @function
	510	@abstract Alias of kh_val()
	511	*/
	512	#define kh_value(h, x) ((h)->vals[x])
	513
	514	/*! @function
	515	@abstract Get the start iterator
	516	@param h Pointer to the hash table [khash_t(name)*]
	517	@return The start iterator [khint_t]
	518	*/
	519	#define kh_begin(h) (khint_t)(0)
	520
	521	/*! @function
	522	@abstract Get the end iterator
	523	@param h Pointer to the hash table [khash_t(name)*]
	524	@return The end iterator [khint_t]
	525	*/
	526	#define kh_end(h) ((h)->n_buckets)
	527
	528	/*! @function
	529	@abstract Get the number of elements in the hash table
	530	@param h Pointer to the hash table [khash_t(name)*]
	531	@return Number of elements in the hash table [khint_t]
	532	*/
	533	#define kh_size(h) ((h)->size)
	534
	535	/*! @function
	536	@abstract Get the number of buckets in the hash table
	537	@param h Pointer to the hash table [khash_t(name)*]
	538	@return Number of buckets in the hash table [khint_t]
	539	*/
	540	#define kh_n_buckets(h) ((h)->n_buckets)
	541
	542	/*! @function
	543	@abstract Iterate over the entries in the hash table
	544	@param h Pointer to the hash table [khash_t(name)*]
	545	@param kvar Variable to which key will be assigned
	546	@param vvar Variable to which value will be assigned
	547	@param code Block of code to execute
	548	*/
	549	#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
	550	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
	551	if (!kh_exist(h,__i)) continue; \
	552	(kvar) = kh_key(h,__i); \
	553	(vvar) = kh_val(h,__i); \
	554	code; \
	555	} }
	556
	557	/*! @function
	558	@abstract Iterate over the values in the hash table
	559	@param h Pointer to the hash table [khash_t(name)*]
	560	@param vvar Variable to which value will be assigned
	561	@param code Block of code to execute
	562	*/
	563	#define kh_foreach_value(h, vvar, code) { khint_t __i; \
	564	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
	565	if (!kh_exist(h,__i)) continue; \
	566	(vvar) = kh_val(h,__i); \
	567	code; \
	568	} }
	569
	570	/* More convenient interfaces */
	571
	572	/*! @function
	573	@abstract Instantiate a hash set containing integer keys
	574	@param name Name of the hash table [symbol]
	575	*/
	576	#define KHASH_SET_INIT_INT(name) \
	577	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
	578
	579	/*! @function
	580	@abstract Instantiate a hash map containing integer keys
	581	@param name Name of the hash table [symbol]
	582	@param khval_t Type of values [type]
	583	*/
	584	#define KHASH_MAP_INIT_INT(name, khval_t) \
	585	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
	586
	587	/*! @function
	588	@abstract Instantiate a hash set containing 64-bit integer keys
	589	@param name Name of the hash table [symbol]
	590	*/
	591	#define KHASH_SET_INIT_INT64(name) \
	592	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
	593
	594	/*! @function
	595	@abstract Instantiate a hash map containing 64-bit integer keys
	596	@param name Name of the hash table [symbol]
	597	@param khval_t Type of values [type]
	598	*/
	599	#define KHASH_MAP_INIT_INT64(name, khval_t) \
	600	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
	601
	602	typedef const char *kh_cstr_t;
	603	/*! @function
	604	@abstract Instantiate a hash set containing const char* keys
	605	@param name Name of the hash table [symbol]
	606	*/
	607	#define KHASH_SET_INIT_STR(name) \
	608	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
	609
	610	/*! @function
	611	@abstract Instantiate a hash map containing const char* keys
	612	@param name Name of the hash table [symbol]
	613	@param khval_t Type of values [type]
	614	*/
	615	#define KHASH_MAP_INIT_STR(name, khval_t) \
	616	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
	617
	618	#endif /* __AC_KHASH_H */

+248

-0

third_party/minimap-0.2/kseq.h less more

	0	/* The MIT License
	1
	2	Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	/* Last Modified: 05MAR2012 */
	26
	27	#ifndef AC_KSEQ_H
	28	#define AC_KSEQ_H
	29
	30	#include <ctype.h>
	31	#include <string.h>
	32	#include <stdlib.h>
	33
	34	#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
	35	#define KS_SEP_TAB 1 // isspace() && !' '
	36	#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
	37	#define KS_SEP_MAX 2
	38
	/* Declare kstream_t, a buffered reader over a handle of type `type_t`.
	 * buf holds the bytes most recently read; begin..end is the part of buf
	 * not yet consumed; is_eof is set once a read returns fewer bytes than
	 * bufsize. */
	39	#define __KS_TYPE(type_t) \
	40	typedef struct __kstream_t { \
	41	int begin, end; \
	42	int is_eof:2, bufsize:30; \
	43	type_t f; \
	44	unsigned char *buf; \
	45	} kstream_t;
	46
	47	#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
	48	#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
	49
	/* Generate ks_init() and ks_destroy() for a stream over `type_t`. */
	#define __KS_BASIC(SCOPE, type_t, __bufsize) \
		/* Allocate a zeroed stream bound to `f` with a buffer of __bufsize bytes. \
		 * Returns 0 if either allocation fails. */ \
		SCOPE kstream_t *ks_init(type_t f) \
		{ \
			kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
			if (ks == 0) return 0; \
			ks->f = f; ks->bufsize = __bufsize; \
			ks->buf = (unsigned char*)malloc(__bufsize); \
			if (ks->buf == 0) { free(ks); return 0; } \
			return ks; \
		} \
		/* Release the buffer and the stream itself; a null `ks` is ignored. \
		 * The underlying handle ks->f is not closed here. */ \
		SCOPE void ks_destroy(kstream_t *ks) \
		{ \
			if (!ks) return; \
			free(ks->buf); \
			free(ks); \
		}
	64
	/* Generate the inline helpers ks_getc() and ks_getuntil(). */
	#define __KS_INLINED(__read) \
		/* Return the next byte of the stream, or -1 when no more data is available. */ \
		static inline int ks_getc(kstream_t *ks) \
		{ \
			if (ks->is_eof && ks->begin >= ks->end) return -1; \
			if (ks->begin >= ks->end) { /* buffer consumed: refill it */ \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end < ks->bufsize) ks->is_eof = 1; /* short read */ \
				/* <= 0 rather than == 0: a negative count from __read must not \
				 * fall through and hand back a stale byte from the buffer */ \
				if (ks->end <= 0) return -1; \
			} \
			return (int)ks->buf[ks->begin++]; \
		} \
		/* ks_getuntil2() with append = 0, i.e. `str` is overwritten. */ \
		static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
		{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
	79
	80	#ifndef KSTRING_T
	81	#define KSTRING_T kstring_t
	82	typedef struct __kstring_t {
	83	unsigned l, m;
	84	char *s;
	85	} kstring_t;
	86	#endif
	87
	#ifndef kroundup32
	/* Round the 32-bit unsigned lvalue `x` up to the next power of two, in
	 * place. A value that is already a power of two is left unchanged. */
	#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
	#endif
	91
	/* Generate ks_getuntil2(): read from `ks` into `str` up to, but not \
	 * including, the next delimiter. \
	 *   delimiter: KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal \
	 *              character code greater than KS_SEP_MAX. \
	 *   dret:      if non-null, receives the delimiter byte that ended the \
	 *              read, or 0 if none was seen. \
	 *   append:    non-zero keeps the current contents of `str`. \
	 * Returns the resulting length of `str`, or -1 if the stream was already \
	 * exhausted on entry. */ \
	#define __KS_GETUNTIL(SCOPE, __read) \
		SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
		{ \
			if (dret) *dret = 0; \
			str->l = append? str->l : 0; \
			if (ks->begin >= ks->end && ks->is_eof) return -1; \
			for (;;) { \
				int i; \
				if (ks->begin >= ks->end) { /* buffer consumed: refill or stop */ \
					if (!ks->is_eof) { \
						ks->begin = 0; \
						ks->end = __read(ks->f, ks->buf, ks->bufsize); \
						if (ks->end < ks->bufsize) ks->is_eof = 1; \
						if (ks->end == 0) break; \
					} else break; \
				} \
				/* scan the unread part of the buffer for the delimiter */ \
				if (delimiter == KS_SEP_LINE) { \
					for (i = ks->begin; i < ks->end; ++i) \
						if (ks->buf[i] == '\n') break; \
				} else if (delimiter > KS_SEP_MAX) { \
					for (i = ks->begin; i < ks->end; ++i) \
						if (ks->buf[i] == delimiter) break; \
				} else if (delimiter == KS_SEP_SPACE) { \
					for (i = ks->begin; i < ks->end; ++i) \
						if (isspace(ks->buf[i])) break; \
				} else if (delimiter == KS_SEP_TAB) { \
					for (i = ks->begin; i < ks->end; ++i) \
						if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
				} else i = 0; /* never come to here! */ \
				/* grow str so the new bytes plus a terminator fit */ \
				if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
					str->m = str->l + (i - ks->begin) + 1; \
					kroundup32(str->m); \
					str->s = (char*)realloc(str->s, str->m); \
				} \
				memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
				str->l = str->l + (i - ks->begin); \
				ks->begin = i + 1; \
				if (i < ks->end) { /* stopped on a delimiter, not on end of buffer */ \
					if (dret) *dret = ks->buf[i]; \
					break; \
				} \
			} \
			if (str->s == 0) { \
				str->m = 1; \
				str->s = (char*)calloc(1, 1); \
			} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
			str->s[str->l] = '\0'; \
			return str->l; \
		}
	141
	142	#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	143	__KS_TYPE(type_t) \
	144	__KS_BASIC(SCOPE, type_t, __bufsize) \
	145	__KS_GETUNTIL(SCOPE, __read) \
	146	__KS_INLINED(__read)
	147
	148	#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
	149
	/* Declare kstream_t and the prototypes of the out-of-line stream functions
	 * (for use when they are defined in another translation unit), and define
	 * the inline helpers locally. */
	#define KSTREAM_DECLARE(type_t, __read) \
		__KS_TYPE(type_t) \
		extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
		extern kstream_t *ks_init(type_t f); \
		extern void ks_destroy(kstream_t *ks); \
		__KS_INLINED(__read)
	156
	157	/******************
	158	* FASTA/Q parser *
	159	******************/
	160
	161	#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
	162
	/* Generate kseq_init() and kseq_destroy(). */
	#define __KSEQ_BASIC(SCOPE, type_t) \
		/* Allocate a zeroed record reader and attach a new stream over `fd`. */ \
		SCOPE kseq_t *kseq_init(type_t fd) \
		{ \
			kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
			s->f = ks_init(fd); \
			return s; \
		} \
		/* Free the four string buffers, the stream and the reader itself; \
		 * a null `ks` is ignored. */ \
		SCOPE void kseq_destroy(kseq_t *ks) \
		{ \
			if (!ks) return; \
			free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
			ks_destroy(ks->f); \
			free(ks); \
		}
	177
	178	/* Return value:
	179	>=0 length of the sequence (normal)
	180	-1 end-of-file
	181	-2 truncated quality string
	182	*/
	/* Generate kseq_read(): parse the next FASTA/FASTQ record into `seq`. \
	 * Returns the sequence length (>= 0), -1 at end of file, or -2 when the \
	 * quality string is missing or its length differs from the sequence. */ \
	#define __KSEQ_READ(SCOPE) \
		SCOPE int kseq_read(kseq_t *seq) \
		{ \
			int c; \
			kstream_t *ks = seq->f; \
			if (seq->last_char == 0) { /* then jump to the next header line */ \
				while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
				if (c == -1) return -1; /* end of file */ \
				seq->last_char = c; \
			} /* else: the first header char has been read in the previous call */ \
			seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
			if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
			if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
			if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
				seq->seq.m = 256; \
				seq->seq.s = (char*)malloc(seq->seq.m); \
			} \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
				if (c == '\n') continue; /* skip empty lines */ \
				seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
				ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
			} \
			if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
			if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
				seq->seq.m = seq->seq.l + 2; \
				kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
				seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
			} \
			seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
			if (c != '+') return seq->seq.l; /* FASTA */ \
			if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
				seq->qual.m = seq->seq.m; \
				seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
			} \
			while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
			if (c == -1) return -2; /* error: no quality string */ \
			while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
			seq->last_char = 0;	/* we have not come to the next header line */ \
			if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
			return seq->seq.l; \
		}
	224
	225	#define __KSEQ_TYPE(type_t) \
	226	typedef struct { \
	227	kstring_t name, comment, seq, qual; \
	228	int last_char; \
	229	kstream_t *f; \
	230	} kseq_t;
	231
	232	#define KSEQ_INIT2(SCOPE, type_t, __read) \
	233	KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
	234	__KSEQ_TYPE(type_t) \
	235	__KSEQ_BASIC(SCOPE, type_t) \
	236	__KSEQ_READ(SCOPE)
	237
	238	#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
	239
	240	#define KSEQ_DECLARE(type_t) \
	241	__KS_TYPE(type_t) \
	242	__KSEQ_TYPE(type_t) \
	243	extern kseq_t *kseq_init(type_t fd); \
	244	void kseq_destroy(kseq_t *ks); \
	245	int kseq_read(kseq_t *seq);
	246
	247	#endif

+159

-0

third_party/minimap-0.2/ksort.h less more

	0	/* The MIT License
	1
	2	Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	// This is a simplified version of ksort.h
	26
	27	#ifndef AC_KSORT_H
	28	#define AC_KSORT_H
	29
	30	#include <stdlib.h>
	31	#include <string.h>
	32
	/* Stack frame type carrying two untyped range pointers and a depth
	 * counter; no code visible in this header uses it. */
	typedef struct {
		void *left, *right;
		int depth;
	} ks_isort_stack_t;
	37
	38	#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
	39
	/* Instantiate ks_lis_<name>() and ks_ksmall_<name>() for element type \
	 * `type_t`, ordered by the strict "less than" predicate __sort_lt(a, b). \
	 * \
	 * ks_lis_<name>(n, a, b, _p): longest increasing subsequence of a[0..n). \
	 *   Writes the indices of the chosen elements to b and returns how many \
	 *   there are. _p is optional scratch space for n size_t values; when it \
	 *   is null the space is malloc'ed and freed internally. \
	 * \
	 * ks_ksmall_<name>(n, arr, kk): returns the element that would sit at \
	 *   index kk if arr[0..n) were sorted; arr is partially reordered in place. */ \
	#define KSORT_INIT(name, type_t, __sort_lt) \
		size_t ks_lis_##name(size_t n, const type_t *a, size_t *b, size_t *_p) \
		{ /* translated from: http://www.algorithmist.com/index.php/Longest_Increasing_Subsequence.cpp */ \
			size_t i, u, v, *top = b, *p; \
			if (n == 0) return 0; \
			p = _p? _p : (size_t*)malloc(n * sizeof(size_t)); \
			*top++ = 0; \
			for (i = 1; i < n; i++) { \
				if (__sort_lt(a[*(top-1)], a[i])) { /* a[i] extends the longest run */ \
					p[i] = *(top-1); \
					*top++ = i; \
					continue; \
				} \
				/* binary search in b for the first entry not less than a[i] */ \
				for (u = 0, v = top - b - 1; u < v;) { \
					size_t c = (u + v) >> 1; \
					if (__sort_lt(a[b[c]], a[i])) u = c + 1; \
					else v = c; \
				} \
				if (__sort_lt(a[i], a[b[u]])) { \
					if (u > 0) p[i] = b[u-1]; \
					b[u] = i; \
				} \
			} \
			/* follow the predecessor links backwards to fill b with the answer */ \
			for (u = top - b, v = *(top-1); u--; v = p[v]) b[u] = v; \
			if (!_p) free(p); \
			return top - b; \
		} \
		type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
		{ \
			type_t *low, *high, *k, *ll, *hh, *mid; \
			low = arr; high = arr + n - 1; k = arr + kk; \
			for (;;) { \
				if (high <= low) return *k; \
				if (high == low + 1) { \
					if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
					return *k; \
				} \
				/* order low, mid and high, then move the middle value to low+1 */ \
				mid = low + (high - low) / 2; \
				if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
				if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
				KSORT_SWAP(type_t, *mid, *(low+1)); \
				ll = low + 1; hh = high; \
				for (;;) { /* partition around the pivot stored in *low */ \
					do ++ll; while (__sort_lt(*ll, *low)); \
					do --hh; while (__sort_lt(*low, *hh)); \
					if (hh < ll) break; \
					KSORT_SWAP(type_t, *ll, *hh); \
				} \
				KSORT_SWAP(type_t, *low, *hh); \
				/* keep only the side that still contains position k */ \
				if (hh <= k) low = ll; \
				if (hh >= k) high = hh - 1; \
			} \
		}
	94
	95	#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
	96
	97	#define ks_lt_generic(a, b) ((a) < (b))
	98	#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
	99
	100	typedef const char *ksstr_t;
	101
	102	#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
	103	#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
	104
	105	#define RS_MIN_SIZE 64
	106
	/* Instantiate an in-place radix sort for elements of type `rstype_t`. \
	 * rskey(x) yields the integer sort key of element x and sizeof_key is the \
	 * size of that key in bytes. \
	 * \
	 * rs_insertsort_<name>(beg, end): insertion sort of [beg, end) by key. \
	 * rs_sort_<name>(beg, end, n_bits, s): distribute [beg, end) into \
	 *   2^n_bits buckets on the key bits starting at bit s, then sort each \
	 *   bucket on the next lower digit (recursing for buckets larger than \
	 *   RS_MIN_SIZE, insertion-sorting the rest). \
	 * radix_sort_<name>(beg, end): entry point; uses insertion sort for at \
	 *   most RS_MIN_SIZE elements, otherwise 8-bit digits from the top byte. */ \
	#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
		typedef struct { \
			rstype_t *b, *e; \
		} rsbucket_##name##_t; \
		void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
		{ \
			rstype_t *i; \
			for (i = beg + 1; i < end; ++i) \
				if (rskey(*i) < rskey(*(i - 1))) { \
					rstype_t *j, tmp = *i; \
					for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \
						*j = *(j - 1); \
					*j = tmp; \
				} \
		} \
		void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
		{ \
			rstype_t *i; \
			int size = 1<<n_bits, m = size - 1; \
			rsbucket_##name##_t *k, b[size], *be = b + size; \
			/* count the elements per bucket; e temporarily holds beg + count */ \
			for (k = b; k != be; ++k) k->b = k->e = beg; \
			for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \
			/* prefix sums turn the counts into [b, e) ranges */ \
			for (k = b + 1; k != be; ++k) \
				k->e += (k-1)->e - beg, k->b = (k-1)->e; \
			/* permute in place: move each element into the bucket it belongs to */ \
			for (k = b; k != be;) { \
				if (k->b != k->e) { \
					rsbucket_##name##_t *l; \
					if ((l = b + (rskey(*k->b)>>s&m)) != k) { \
						rstype_t tmp = *k->b, swap; \
						do { \
							swap = tmp; tmp = *l->b; *l->b++ = swap; \
							l = b + (rskey(tmp)>>s&m); \
						} while (l != k); \
						*k->b++ = tmp; \
					} else ++k->b; \
				} else ++k; \
			} \
			/* the b pointers were advanced above; reset them to the bucket starts */ \
			for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \
			if (s) { \
				s = s > n_bits? s - n_bits : 0; \
				for (k = b; k != be; ++k) \
					if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
					else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
			} \
		} \
		void radix_sort_##name(rstype_t *beg, rstype_t *end) \
		{ \
			if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
			else rs_sort_##name(beg, end, 8, sizeof_key * 8 - 8); \
		}
	157
	158	#endif

+146

-0

third_party/minimap-0.2/kthread.c less more

	0	#include <pthread.h>
	1	#include <stdlib.h>
	2	#include <limits.h>
	3
	4	/************
	5	* kt_for() *
	6	************/
	7
	8	struct kt_for_t;
	9
	10	typedef struct {
	11	struct kt_for_t *t;
	12	long i;
	13	} ktf_worker_t;
	14
	15	typedef struct kt_for_t {
	16	int n_threads;
	17	long n;
	18	ktf_worker_t *w;
	19	void (func)(void,long,int);
	20	void *data;
	21	} kt_for_t;
	22
	23	static inline long steal_work(kt_for_t *t)
	24	{
	25	int i, min_i = -1;
	26	long k, min = LONG_MAX;
	27	for (i = 0; i < t->n_threads; ++i)
	28	if (min > t->w[i].i) min = t->w[i].i, min_i = i;
	29	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
	30	return k >= t->n? -1 : k;
	31	}
	32
	33	static void ktf_worker(void data)
	34	{
	35	ktf_worker_t w = (ktf_worker_t)data;
	36	long i;
	37	for (;;) {
	38	i = __sync_fetch_and_add(&w->i, w->t->n_threads);
	39	if (i >= w->t->n) break;
	40	w->t->func(w->t->data, i, w - w->t->w);
	41	}
	42	while ((i = steal_work(w->t)) >= 0)
	43	w->t->func(w->t->data, i, w - w->t->w);
	44	pthread_exit(0);
	45	}
	46
	47	void kt_for(int n_threads, void (func)(void,long,int), void *data, long n)
	48	{
	49	int i;
	50	kt_for_t t;
	51	pthread_t *tid;
	52	t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
	53	t.w = (ktf_worker_t)alloca(n_threads sizeof(ktf_worker_t));
	54	tid = (pthread_t)alloca(n_threads sizeof(pthread_t));
	55	for (i = 0; i < n_threads; ++i)
	56	t.w[i].t = &t, t.w[i].i = i;
	57	for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
	58	for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
	59	}
	60
	61	/*****************
	62	* kt_pipeline() *
	63	*****************/
	64
	65	struct ktp_t;
	66
	67	typedef struct {
	68	struct ktp_t *pl;
	69	int64_t index;
	70	int step;
	71	void *data;
	72	} ktp_worker_t;
	73
	74	typedef struct ktp_t {
	75	void *shared;
	76	void (func)(void, int, void);
	77	int64_t index;
	78	int n_workers, n_steps;
	79	ktp_worker_t *workers;
	80	pthread_mutex_t mutex;
	81	pthread_cond_t cv;
	82	} ktp_t;
	83
	84	static void ktp_worker(void data)
	85	{
	86	ktp_worker_t w = (ktp_worker_t)data;
	87	ktp_t *p = w->pl;
	88	while (w->step < p->n_steps) {
	89	// test whether we can kick off the job with this worker
	90	pthread_mutex_lock(&p->mutex);
	91	for (;;) {
	92	int i;
	93	// test whether another worker is doing the same step
	94	for (i = 0; i < p->n_workers; ++i) {
	95	if (w == &p->workers[i]) continue; // ignore itself
	96	if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
	97	break;
	98	}
	99	if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
	100	pthread_cond_wait(&p->cv, &p->mutex);
	101	}
	102	pthread_mutex_unlock(&p->mutex);
	103
	104	// working on w->step
	105	w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
	106
	107	// update step and let other workers know
	108	pthread_mutex_lock(&p->mutex);
	109	w->step = w->step == p->n_steps - 1 \|\| w->data? (w->step + 1) % p->n_steps : p->n_steps;
	110	if (w->step == 0) w->index = p->index++;
	111	pthread_cond_broadcast(&p->cv);
	112	pthread_mutex_unlock(&p->mutex);
	113	}
	114	pthread_exit(0);
	115	}
	116
	117	void kt_pipeline(int n_threads, void (func)(void, int, void), void *shared_data, int n_steps)
	118	{
	119	ktp_t aux;
	120	pthread_t *tid;
	121	int i;
	122
	123	if (n_threads < 1) n_threads = 1;
	124	aux.n_workers = n_threads;
	125	aux.n_steps = n_steps;
	126	aux.func = func;
	127	aux.shared = shared_data;
	128	aux.index = 0;
	129	pthread_mutex_init(&aux.mutex, 0);
	130	pthread_cond_init(&aux.cv, 0);
	131
	132	aux.workers = (ktp_worker_t)alloca(n_threads sizeof(ktp_worker_t));
	133	for (i = 0; i < n_threads; ++i) {
	134	ktp_worker_t *w = &aux.workers[i];
	135	w->step = 0; w->pl = &aux; w->data = 0;
	136	w->index = aux.index++;
	137	}
	138
	139	tid = (pthread_t)alloca(n_threads sizeof(pthread_t));
	140	for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
	141	for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
	142
	143	pthread_mutex_destroy(&aux.mutex);
	144	pthread_cond_destroy(&aux.cv);
	145	}

+110

-0

third_party/minimap-0.2/kvec.h less more

	0	/* The MIT License
	1
	2	Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
	3
	4	Permission is hereby granted, free of charge, to any person obtaining
	5	a copy of this software and associated documentation files (the
	6	"Software"), to deal in the Software without restriction, including
	7	without limitation the rights to use, copy, modify, merge, publish,
	8	distribute, sublicense, and/or sell copies of the Software, and to
	9	permit persons to whom the Software is furnished to do so, subject to
	10	the following conditions:
	11
	12	The above copyright notice and this permission notice shall be
	13	included in all copies or substantial portions of the Software.
	14
	15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	22	SOFTWARE.
	23	*/
	24
	25	/*
	26	An example:
	27
	28	#include "kvec.h"
	29	int main() {
	30	kvec_t(int) array;
	31	kv_init(array);
	32	kv_push(int, array, 10); // append
	33	kv_a(int, array, 20) = 5; // dynamic
	34	kv_A(array, 20) = 4; // static
	35	kv_destroy(array);
	36	return 0;
	37	}
	38	*/
	39
	40	/*
	41	2008-09-22 (0.1.0):
	42
	43	* The initial version.
	44
	45	*/
	46
	47	#ifndef AC_KVEC_H
	48	#define AC_KVEC_H
	49
	50	#include <stdlib.h>
	51
	/* Round the 32-bit unsigned lvalue `x` up to the next power of two, in
	 * place. A value that is already a power of two is left unchanged. */
	#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
	53
	54	#define kvec_t(type) struct { size_t n, m; type *a; }
	55	#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
	56	#define kv_destroy(v) free((v).a)
	57	#define kv_A(v, i) ((v).a[(i)])
	58	#define kv_pop(v) ((v).a[--(v).n])
	59	#define kv_size(v) ((v).n)
	60	#define kv_max(v) ((v).m)
	61
	/* Ensure vector `v` has capacity for at least `s` elements of `type`. The
	 * capacity is rounded up with kv_roundup32(); the vector never shrinks. */
	#define kv_resize(type, v, s) do { \
			if ((v).m < (s)) { \
				(v).m = (s); \
				kv_roundup32((v).m); \
				(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
			} \
		} while (0)
	69
	/* Copy the elements of v0 into v1, growing v1 first if its capacity is
	 * too small. Uses memcpy(), so the including file must provide <string.h>. */
	#define kv_copy(type, v1, v0) do { \
			if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
			(v1).n = (v0).n; \
			memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
		} while (0)
	75
	/* Append `x` to vector `v`. When the vector is full its capacity is
	 * doubled, or set to 2 if it was empty. */
	#define kv_push(type, v, x) do { \
			if ((v).n == (v).m) { \
				(v).m = (v).m? (v).m<<1 : 2; \
				(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
			} \
			(v).a[(v).n++] = (x); \
		} while (0)
	83
	/* Append one uninitialised slot to vector `v` and store its address in
	 * *p (p has type `type **`). Grows the vector the same way kv_push() does. */
	#define kv_pushp(type, v, p) do { \
			if ((v).n == (v).m) { \
				(v).m = (v).m? (v).m<<1 : 2; \
				(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
			} \
			*(p) = &(v).a[(v).n++]; \
		} while (0)
	91
	92	#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \
	93	((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
	94	(v).a = (type)realloc((v).a, sizeof(type) (v).m), 0) \
	95	: (v).n <= (size_t)(i)? (v).n = (i) \
	96	: 0), (v).a[(i)]
	97
	98	#define kv_reverse(type, v, start) do { \
	99	if ((v).m > 0 && (v).n > (start)) { \
	100	size_t __i, __end = (v).n - (start); \
	101	type *__a = (v).a + (start); \
	102	for (__i = 0; __i < __end>>1; ++__i) { \
	103	type __t = __a[__end - 1 - __i]; \
	104	__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
	105	} \
	106	} \
	107	} while (0)
	108
	109	#endif

+145

-0

third_party/minimap-0.2/main.c less more

	0	#include <unistd.h>
	1	#include <stdlib.h>
	2	#include <stdio.h>
	3	#include <string.h>
	4	#include <sys/resource.h>
	5	#include <sys/time.h>
	6	#include "minimap.h"
	7
	8	#define MM_VERSION "0.2-r123"
	9
	10	void liftrlimit()
	11	{
	12	#ifdef __linux__
	13	struct rlimit r;
	14	getrlimit(RLIMIT_AS, &r);
	15	r.rlim_cur = r.rlim_max;
	16	setrlimit(RLIMIT_AS, &r);
	17	#endif
	18	}
	19
	20	int main(int argc, char *argv[])
	21	{
	22	mm_mapopt_t opt;
	23	int i, c, k = 15, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0;
	24	int tbatch_size = 100000000;
	25	uint64_t ibatch_size = 4000000000ULL;
	26	float f = 0.001;
	27	bseq_file_t *fp = 0;
	28	char *fnw = 0;
	29	FILE fpr = 0, fpw = 0;
	30
	31	liftrlimit();
	32	mm_realtime0 = realtime();
	33	mm_mapopt_init(&opt);
	34
	35	while ((c = getopt(argc, argv, "w:k:B:b:t:r:c:f:Vv:NOg:I:d:lRPST:m:L:Dx:")) >= 0) {
	36	if (c == 'w') w = atoi(optarg);
	37	else if (c == 'k') k = atoi(optarg);
	38	else if (c == 'b') b = atoi(optarg);
	39	else if (c == 'r') opt.radius = atoi(optarg);
	40	else if (c == 'c') opt.min_cnt = atoi(optarg);
	41	else if (c == 'm') opt.merge_frac = atof(optarg);
	42	else if (c == 'f') f = atof(optarg);
	43	else if (c == 't') n_threads = atoi(optarg);
	44	else if (c == 'v') mm_verbose = atoi(optarg);
	45	else if (c == 'g') opt.max_gap = atoi(optarg);
	46	else if (c == 'N') keep_name = 0;
	47	else if (c == 'd') fnw = optarg;
	48	else if (c == 'l') is_idx = 1;
	49	else if (c == 'R') opt.flag \|= MM_F_WITH_REP;
	50	else if (c == 'P') opt.flag &= ~MM_F_WITH_REP;
	51	else if (c == 'D') opt.flag \|= MM_F_NO_SELF;
	52	else if (c == 'O') opt.flag \|= MM_F_NO_ISO;
	53	else if (c == 'S') opt.flag \|= MM_F_AVA \| MM_F_NO_SELF;
	54	else if (c == 'T') opt.sdust_thres = atoi(optarg);
	55	else if (c == 'L') opt.min_match = atoi(optarg);
	56	else if (c == 'V') {
	57	puts(MM_VERSION);
	58	return 0;
	59	} else if (c == 'B' \|\| c == 'I') {
	60	double x;
	61	char *p;
	62	x = strtod(optarg, &p);
	63	if (p == 'G' \|\| p == 'g') x *= 1e9;
	64	else if (p == 'M' \|\| p == 'm') x *= 1e6;
	65	else if (p == 'K' \|\| p == 'k') x *= 1e3;
	66	if (c == 'B') tbatch_size = (uint64_t)(x + .499);
	67	else ibatch_size = (uint64_t)(x + .499);
	68	} else if (c == 'x') {
	69	if (strcmp(optarg, "ava10k") == 0) {
	70	opt.flag \|= MM_F_AVA \| MM_F_NO_SELF;
	71	opt.min_match = 100;
	72	opt.merge_frac = 0.0;
	73	w = 5;
	74	}
	75	}
	76	}
	77	if (w < 0) w = (int)(.6666667 * k + .499);
	78
	79	if (argc == optind) {
	80	fprintf(stderr, "Usage: minimap [options] <target.fa> [query.fa] [...]\n");
	81	fprintf(stderr, "Options:\n");
	82	fprintf(stderr, " Indexing:\n");
	83	fprintf(stderr, " -k INT k-mer size [%d]\n", k);
	84	fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n");
	85	fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n");
	86	fprintf(stderr, " -d FILE dump index to FILE []\n");
	87	fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n");
	88	// fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users would care about this
	89	fprintf(stderr, " Mapping:\n");
	90	fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", f);
	91	fprintf(stderr, " -r INT bandwidth [%d]\n", opt.radius);
	92	fprintf(stderr, " -m FLOAT merge two chains if FLOAT fraction of minimizers are shared [%.2f]\n", opt.merge_frac);
	93	fprintf(stderr, " -c INT retain a mapping if it consists of >=INT minimizers [%d]\n", opt.min_cnt);
	94	fprintf(stderr, " -L INT min matching length [%d]\n", opt.min_match);
	95	fprintf(stderr, " -g INT split a mapping if there is a gap longer than INT [%d]\n", opt.max_gap);
	96	fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres);
	97	// fprintf(stderr, " -D skip self mappings but keep dual mappings\n"); // too confusing to expose to end users
	98	fprintf(stderr, " -S skip self and dual mappings\n");
	99	fprintf(stderr, " -O drop isolated hits before chaining (EXPERIMENTAL)\n");
	100	fprintf(stderr, " -P filtering potential repeats after mapping (EXPERIMENTAL)\n");
	101	// fprintf(stderr, " -R skip post-mapping repeat filtering\n"); // deprecated option for backward compatibility
	102	fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
	103	fprintf(stderr, " ava10k: -Sw5 -L100 -m0 (PacBio/ONT all-vs-all read mapping)\n");
	104	fprintf(stderr, " Input/Output:\n");
	105	fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
	106	// fprintf(stderr, " -B NUM process ~NUM bp in each batch [100M]\n");
	107	// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
	108	// fprintf(stderr, " -N use integer as target names\n");
	109	fprintf(stderr, " -V show version number\n");
	110	fprintf(stderr, "\nSee minimap.1 for detailed description of the command-line options.\n");
	111	return 1;
	112	}
	113
	114	if (is_idx) fpr = fopen(argv[optind], "rb");
	115	else fp = bseq_open(argv[optind]);
	116	if (fnw) fpw = fopen(fnw, "wb");
	117	for (;;) {
	118	mm_idx_t *mi = 0;
	119	if (fpr) mi = mm_idx_load(fpr);
	120	else if (!bseq_eof(fp))
	121	mi = mm_idx_gen(fp, w, k, b, tbatch_size, n_threads, ibatch_size, keep_name);
	122	if (mi == 0) break;
	123	if (mm_verbose >= 3)
	124	fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
	125	__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n);
	126	mm_idx_set_max_occ(mi, f);
	127	if (mm_verbose >= 3)
	128	fprintf(stderr, "[M::%s] max occurrences of a minimizer to consider: %d\n", __func__, mi->max_occ);
	129	if (fpw) mm_idx_dump(fpw, mi);
	130	for (i = optind + 1; i < argc; ++i)
	131	mm_map_file(mi, argv[i], &opt, n_threads, tbatch_size);
	132	mm_idx_destroy(mi);
	133	}
	134	if (fpw) fclose(fpw);
	135	if (fpr) fclose(fpr);
	136	if (fp) bseq_close(fp);
	137
	138	fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION);
	139	fprintf(stderr, "[M::%s] CMD:", __func__);
	140	for (i = 0; i < argc; ++i)
	141	fprintf(stderr, " %s", argv[i]);
	142	fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - mm_realtime0, cputime());
	143	return 0;
	144	}

+374

-0

third_party/minimap-0.2/map.c less more

	0	#include <stdlib.h>
	1	#include <string.h>
	2	#include <stdio.h>
	3	#include "bseq.h"
	4	#include "kvec.h"
	5	#include "minimap.h"
	6	#include "sdust.h"
	7
	8	void mm_mapopt_init(mm_mapopt_t *opt)
	9	{
	10	opt->radius = 500;
	11	opt->max_gap = 10000;
	12	opt->min_cnt = 4;
	13	opt->min_match = 40;
	14	opt->sdust_thres = 0;
	15	opt->flag = MM_F_WITH_REP;
	16	opt->merge_frac = .5;
	17	}
	18
	19	/*****************************
	20	* Find approximate mappings *
	21	*****************************/
	22
	23	struct mm_tbuf_s { // per-thread buffer
	24	mm128_v mini; // query minimizers
	25	mm128_v coef; // Hough transform coefficient
	26	mm128_v intv; // intervals on sorted coef
	27	uint32_v reg2mini;
	28	uint32_v rep_aux;
	29	sdust_buf_t *sdb;
	30	// the following are for computing LIS
	31	uint32_t n, m;
	32	uint64_t *a;
	33	size_t b, p;
	34	// final output
	35	kvec_t(mm_reg1_t) reg;
	36	};
	37
	38	mm_tbuf_t *mm_tbuf_init()
	39	{
	40	mm_tbuf_t *b;
	41	b = (mm_tbuf_t*)calloc(1, sizeof(mm_tbuf_t));
	42	b->sdb = sdust_buf_init();
	43	return b;
	44	}
	45
	46	void mm_tbuf_destroy(mm_tbuf_t *b)
	47	{
	48	if (b == 0) return;
	49	free(b->mini.a); free(b->coef.a); free(b->intv.a); free(b->reg.a); free(b->reg2mini.a); free(b->rep_aux.a);
	50	free(b->a); free(b->b); free(b->p);
	51	sdust_buf_destroy(b->sdb);
	52	free(b);
	53	}
	54
	55	#include "ksort.h"
	56	#define sort_key_64(a) (a)
	57	KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
	58	#define lt_low32(a, b) ((uint32_t)(a) < (uint32_t)(b))
	59	KSORT_INIT(low32lt, uint64_t, lt_low32)
	60	#define gt_low32(a, b) ((uint32_t)(a) > (uint32_t)(b))
	61	KSORT_INIT(low32gt, uint64_t, gt_low32)
	62
	63	/* TODO: drop_rep() is not robust. For all-vs-all mapping but without the -S
	64	* flag, all minimizers have at least one hit. The _thres_ computed below will
	65	* be highly skewed. Some improvements need to be made. */
	66
	67	static void drop_rep(mm_tbuf_t *b, int min_cnt)
	68	{
	69	int i, j, n, m;
	70	uint32_t thres;
	71	b->rep_aux.n = 0;
	72	for (i = 0; i < b->mini.n; ++i)
	73	if (b->mini.a[i].y>>32)
	74	kv_push(uint32_t, b->rep_aux, b->mini.a[i].y>>32);
	75	if (b->rep_aux.n < 3) return;
	76	thres = (uint32_t)(ks_ksmall_uint32_t(b->rep_aux.n, b->rep_aux.a, b->rep_aux.n>>1) * MM_DEREP_Q50 + .499);
	77	for (i = n = m = 0; i < b->reg.n; ++i) {
	78	int cnt = 0, all_cnt = b->reg.a[i].cnt;
	79	for (j = 0; j < all_cnt; ++j)
	80	if (b->mini.a[b->reg2mini.a[m + j]].y>>32 <= thres)
	81	++cnt;
	82	if (cnt >= min_cnt)
	83	b->reg.a[n++] = b->reg.a[i];
	84	m += all_cnt;
	85	}
	86	// printf("%ld=>%d\t%d\n", b->reg.n, n, thres);
	87	b->reg.n = n;
	88	}
	89
	90	static void proc_intv(mm_tbuf_t *b, int which, int k, int min_cnt, int max_gap)
	91	{
	92	int i, j, l_lis, rid = -1, rev = 0, start = b->intv.a[which].y, end = start + b->intv.a[which].x;
	93
	94	// make room for arrays needed by LIS (longest increasing sequence)
	95	if (end - start > b->m) {
	96	b->m = end - start;
	97	kv_roundup32(b->m);
	98	b->a = (uint64_t)realloc(b->a, b->m 8);
	99	b->b = (size_t)realloc(b->b, b->m sizeof(size_t));
	100	b->p = (size_t)realloc(b->p, b->m sizeof(size_t));
	101	}
	102
	103	// prepare the input array _a_ for LIS
	104	b->n = 0;
	105	for (i = start; i < end; ++i)
	106	if (b->coef.a[i].x != UINT64_MAX)
	107	b->a[b->n++] = b->coef.a[i].y, rid = b->coef.a[i].x << 1 >> 33, rev = b->coef.a[i].x >> 63;
	108	if (b->n < min_cnt) return;
	109	radix_sort_64(b->a, b->a + b->n);
	110
	111	// find the longest increasing sequence
	112	l_lis = rev? ks_lis_low32gt(b->n, b->a, b->b, b->p) : ks_lis_low32lt(b->n, b->a, b->b, b->p); // LIS
	113	if (l_lis < min_cnt) return;
	114	for (i = 1, j = 1; i < l_lis; ++i) // squeeze out minimizaers reused in the LIS sequence
	115	if (b->a[b->b[i]]>>32 != b->a[b->b[i-1]]>>32)
	116	b->a[b->b[j++]] = b->a[b->b[i]];
	117	l_lis = j;
	118	if (l_lis < min_cnt) return;
	119
	120	// convert LISes to regions; possibly break an LIS at a long gaps
	121	for (i = 1, start = 0; i <= l_lis; ++i) {
	122	int32_t qgap = i == l_lis? 0 : ((uint32_t)b->mini.a[b->a[b->b[i]]>>32].y>>1) - ((uint32_t)b->mini.a[b->a[b->b[i-1]]>>32].y>>1);
	123	if (i == l_lis \|\| (qgap > max_gap && abs((int32_t)b->a[b->b[i]] - (int32_t)b->a[b->b[i-1]]) > max_gap)) {
	124	if (i - start >= min_cnt) {
	125	uint32_t lq = 0, lr = 0, eq = 0, er = 0, sq = 0, sr = 0;
	126	mm_reg1_t *r;
	127	kv_pushp(mm_reg1_t, b->reg, &r);
	128	r->rid = rid, r->rev = rev, r->cnt = i - start, r->rep = 0;
	129	r->qs = ((uint32_t)b->mini.a[b->a[b->b[start]]>>32].y>>1) - (k - 1);
	130	r->qe = ((uint32_t)b->mini.a[b->a[b->b[i-1]]>>32].y>>1) + 1;
	131	r->rs = rev? (uint32_t)b->a[b->b[i-1]] : (uint32_t)b->a[b->b[start]];
	132	r->re = rev? (uint32_t)b->a[b->b[start]] : (uint32_t)b->a[b->b[i-1]];
	133	r->rs -= k - 1;
	134	r->re += 1;
	135	for (j = start; j < i; ++j) { // count the number of times each minimizer is used
	136	int jj = b->a[b->b[j]]>>32;
	137	b->mini.a[jj].y += 1ULL<<32;
	138	kv_push(uint32_t, b->reg2mini, jj); // keep minimizer<=>reg mapping for derep
	139	}
	140	for (j = start; j < i; ++j) { // compute ->len
	141	uint32_t q = ((uint32_t)b->mini.a[b->a[b->b[j]]>>32].y>>1) - (k - 1);
	142	uint32_t r = (uint32_t)b->a[b->b[j]];
	143	r = !rev? r - (k - 1) : (0x80000000U - r);
	144	if (r > er) lr += er - sr, sr = r, er = sr + k;
	145	else er = r + k;
	146	if (q > eq) lq += eq - sq, sq = q, eq = sq + k;
	147	else eq = q + k;
	148	}
	149	lr += er - sr, lq += eq - sq;
	150	r->len = lr < lq? lr : lq;
	151	}
	152	start = i;
	153	}
	154	}
	155	}
	156
	157	// merge or add a Hough interval; only used by get_reg()
	158	static inline void push_intv(mm128_v *intv, int start, int end, float merge_frac)
	159	{
	160	mm128_t *p;
	161	if (intv->n > 0) { // test overlap
	162	int last_start, last_end, min;
	163	p = &intv->a[intv->n-1];
	164	last_start = p->y, last_end = p->x + last_start;
	165	min = end - start < last_end - last_start? end - start : last_end - last_start;
	166	if (last_end > start && last_end - start > min * merge_frac) { // large overlap; then merge
	167	p->x = end - last_start;
	168	return;
	169	}
	170	}
	171	kv_pushp(mm128_t, *intv, &p); // a new interval
	172	p->x = end - start, p->y = start;
	173	}
	174
	175	// find mapping regions from a list of minimizer hits
	176	static void get_reg(mm_tbuf_t *b, int radius, int k, int min_cnt, int max_gap, float merge_frac, int flag)
	177	{
	178	const uint64_t v_kept = ~(1ULL<<31), v_dropped = 1ULL<<31;
	179	mm128_v *c = &b->coef;
	180	int i, j, start = 0, iso_dist = radius * 2;
	181
	182	if (c->n < min_cnt) return;
	183
	184	// drop isolated minimizer hits
	185	if (flag&MM_F_NO_ISO) {
	186	for (i = 0; i < c->n; ++i) c->a[i].y \|= v_dropped;
	187	for (i = 1; i < c->n; ++i) {
	188	uint64_t x = c->a[i].x;
	189	int32_t rpos = (uint32_t)c->a[i].y;
	190	for (j = i - 1; j >= 0 && x - c->a[j].x < radius; --j) {
	191	int32_t y = c->a[j].y;
	192	if (abs(y - rpos) < iso_dist) {
	193	c->a[i].y &= v_kept, c->a[j].y &= v_kept;
	194	break;
	195	}
	196	}
	197	}
	198	for (i = j = 0; i < c->n; ++i) // squeeze out hits still marked as v_dropped
	199	if ((c->a[i].y&v_dropped) == 0)
	200	c->a[j++] = c->a[i];
	201	c->n = j;
	202	}
	203
	204	// identify (possibly overlapping) intervals within _radius_; an interval is a cluster of hits
	205	b->intv.n = 0;
	206	for (i = 1; i < c->n; ++i) {
	207	if (c->a[i].x - c->a[start].x > radius) {
	208	if (i - start >= min_cnt) push_intv(&b->intv, start, i, merge_frac);
	209	for (++start; start < i && c->a[i].x - c->a[start].x > radius; ++start);
	210	}
	211	}
	212	if (i - start >= min_cnt) push_intv(&b->intv, start, i, merge_frac);
	213
	214	// sort by the size of the interval
	215	radix_sort_128x(b->intv.a, b->intv.a + b->intv.n);
	216
	217	// generate hits, starting from the largest interval
	218	b->reg2mini.n = 0;
	219	for (i = b->intv.n - 1; i >= 0; --i) proc_intv(b, i, k, min_cnt, max_gap);
	220
	221	// post repeat removal
	222	if (!(flag&MM_F_WITH_REP)) drop_rep(b, min_cnt);
	223	}
	224
	225	const mm_reg1_t mm_map(const mm_idx_t mi, int l_seq, const char seq, int n_regs, mm_tbuf_t b, const mm_mapopt_t opt, const char *name)
	226	{
	227	int j, n_dreg = 0, u = 0;
	228	const uint64_t *dreg = 0;
	229
	230	b->mini.n = b->coef.n = 0;
	231	mm_sketch(seq, l_seq, mi->w, mi->k, 0, &b->mini);
	232	if (opt->sdust_thres > 0)
	233	dreg = sdust_core((const uint8_t*)seq, l_seq, opt->sdust_thres, 64, &n_dreg, b->sdb);
	234	for (j = 0; j < b->mini.n; ++j) {
	235	int k, n;
	236	const uint64_t *r;
	237	int32_t qpos = (uint32_t)b->mini.a[j].y>>1, strand = b->mini.a[j].y&1;
	238	b->mini.a[j].y = b->mini.a[j].y<<32>>32; // clear the rid field
	239	if (dreg && n_dreg) { // test complexity
	240	int s = qpos - (mi->k - 1), e = s + mi->k;
	241	while (u < n_dreg && (uint32_t)dreg[u] <= s) ++u;
	242	if (u < n_dreg && dreg[u]>>32 < e) {
	243	int v, l = 0;
	244	for (v = u; v < n_dreg && dreg[v]>>32 < e; ++v) { // iterate over LCRs overlapping this minimizer
	245	int ss = s > dreg[v]>>32? s : dreg[v]>>32;
	246	int ee = e < (uint32_t)dreg[v]? e : (uint32_t)dreg[v];
	247	l += ee - ss;
	248	}
	249	if (l > mi->k>>1) continue;
	250	}
	251	}
	252	r = mm_idx_get(mi, b->mini.a[j].x, &n);
	253	if (n > mi->max_occ) continue;
	254	for (k = 0; k < n; ++k) {
	255	int32_t rpos = (uint32_t)r[k] >> 1;
	256	mm128_t *p;
	257	if (name && (opt->flag&MM_F_NO_SELF) && mi->name && strcmp(name, mi->name[r[k]>>32]) == 0 && rpos == qpos)
	258	continue;
	259	if (name && (opt->flag&MM_F_AVA) && mi->name && strcmp(name, mi->name[r[k]>>32]) > 0)
	260	continue;
	261	kv_pushp(mm128_t, b->coef, &p);
	262	if ((r[k]&1) == strand) { // forward strand
	263	p->x = (uint64_t)r[k] >> 32 << 32 \| (0x80000000U + rpos - qpos);
	264	p->y = (uint64_t)j << 32 \| rpos;
	265	} else { // reverse strand
	266	p->x = (uint64_t)r[k] >> 32 << 32 \| (rpos + qpos) \| 1ULL<<63;
	267	p->y = (uint64_t)j << 32 \| rpos;
	268	}
	269	}
	270	}
	271	radix_sort_128x(b->coef.a, b->coef.a + b->coef.n);
	272	b->reg.n = 0;
	273	get_reg(b, opt->radius, mi->k, opt->min_cnt, opt->max_gap, opt->merge_frac, opt->flag);
	274	*n_regs = b->reg.n;
	275	return b->reg.a;
	276	}
	277
	278	/**************************
	279	* Multi-threaded mapping *
	280	**************************/
	281
	282	void kt_for(int n_threads, void (func)(void,long,int), void *data, long n);
	283	void kt_pipeline(int n_threads, void (func)(void, int, void), void *shared_data, int n_steps);
	284
	285	typedef struct {
	286	int batch_size, n_processed, n_threads;
	287	const mm_mapopt_t *opt;
	288	bseq_file_t *fp;
	289	const mm_idx_t *mi;
	290	} pipeline_t;
	291
	292	typedef struct {
	293	const pipeline_t *p;
	294	int n_seq;
	295	bseq1_t *seq;
	296	int *n_reg;
	297	mm_reg1_t **reg;
	298	mm_tbuf_t **buf;
	299	} step_t;
	300
	301	static void worker_for(void *_data, long i, int tid) // kt_for() callback
	302	{
	303	step_t step = (step_t)_data;
	304	const mm_reg1_t *regs;
	305	int n_regs;
	306
	307	regs = mm_map(step->p->mi, step->seq[i].l_seq, step->seq[i].seq, &n_regs, step->buf[tid], step->p->opt, step->seq[i].name);
	308	step->n_reg[i] = n_regs;
	309	if (n_regs > 0) {
	310	step->reg[i] = (mm_reg1_t)malloc(n_regs sizeof(mm_reg1_t));
	311	memcpy(step->reg[i], regs, n_regs * sizeof(mm_reg1_t));
	312	}
	313	}
	314
	315	static void worker_pipeline(void shared, int step, void *in)
	316	{
	317	int i, j;
	318	pipeline_t p = (pipeline_t)shared;
	319	if (step == 0) { // step 0: read sequences
	320	step_t *s;
	321	s = (step_t*)calloc(1, sizeof(step_t));
	322	s->seq = bseq_read(p->fp, p->batch_size, &s->n_seq);
	323	if (s->seq) {
	324	s->p = p;
	325	for (i = 0; i < s->n_seq; ++i)
	326	s->seq[i].rid = p->n_processed++;
	327	s->buf = (mm_tbuf_t*)calloc(p->n_threads, sizeof(mm_tbuf_t));
	328	for (i = 0; i < p->n_threads; ++i)
	329	s->buf[i] = mm_tbuf_init();
	330	s->n_reg = (int*)calloc(s->n_seq, sizeof(int));
	331	s->reg = (mm_reg1_t)calloc(s->n_seq, sizeof(mm_reg1_t));
	332	return s;
	333	} else free(s);
	334	} else if (step == 1) { // step 1: map
	335	kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq);
	336	return in;
	337	} else if (step == 2) { // step 2: output
	338	step_t s = (step_t)in;
	339	const mm_idx_t *mi = p->mi;
	340	for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
	341	free(s->buf);
	342	for (i = 0; i < s->n_seq; ++i) {
	343	bseq1_t *t = &s->seq[i];
	344	for (j = 0; j < s->n_reg[i]; ++j) {
	345	mm_reg1_t *r = &s->reg[i][j];
	346	if (r->len < p->opt->min_match) continue;
	347	printf("%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
	348	if (mi->name) fputs(mi->name[r->rid], stdout);
	349	else printf("%d", r->rid + 1);
	350	printf("\t%d\t%d\t%d\t%d\t%d\t255\tcm:i:%d\n", mi->len[r->rid], r->rs, r->re, r->len,
	351	r->re - r->rs > r->qe - r->qs? r->re - r->rs : r->qe - r->qs, r->cnt);
	352	}
	353	free(s->reg[i]);
	354	free(s->seq[i].seq); free(s->seq[i].name);
	355	}
	356	free(s->reg); free(s->n_reg); free(s->seq);
	357	free(s);
	358	}
	359	return 0;
	360	}
	361
	362	int mm_map_file(const mm_idx_t idx, const char fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size)
	363	{
	364	pipeline_t pl;
	365	memset(&pl, 0, sizeof(pipeline_t));
	366	pl.fp = bseq_open(fn);
	367	if (pl.fp == 0) return -1;
	368	pl.opt = opt, pl.mi = idx;
	369	pl.n_threads = n_threads, pl.batch_size = tbatch_size;
	370	kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
	371	bseq_close(pl.fp);
	372	return 0;
	373	}

+222

-0

third_party/minimap-0.2/minimap.1 less more

	0	.TH minimap 1 "06 December 2015" "minimap-0.2" "Bioinformatics tools"
	1
	2	.SH NAME
	3	.PP
	4	minimap - fast mapping between long DNA sequences
	5
	6	.SH SYNOPSIS
	7	.PP
	8	minimap
	9	.RB [ -lSOV ]
	10	.RB [ -k
	11	.IR kmer ]
	12	.RB [ -w
	13	.IR winSize ]
	14	.RB [ -I
	15	.IR batchSize ]
	16	.RB [ -d
	17	.IR dumpFile ]
	18	.RB [ -f
	19	.IR occThres ]
	20	.RB [ -r
	21	.IR bandWidth ]
	22	.RB [ -m
	23	.IR minShared ]
	24	.RB [ -c
	25	.IR minCount ]
	26	.RB [ -L
	27	.IR minMatch ]
	28	.RB [ -g
	29	.IR maxGap ]
	30	.RB [ -T
	31	.IR dustThres ]
	32	.RB [ -t
	33	.IR nThreads ]
	34	.RB [ -x
	35	.IR preset ]
	36	.I target.fa
	37	.I query.fa
	38	>
	39	.I output.paf
	40
	41	.SH DESCRIPTION
	42	.PP
	43	Minimap is a tool to efficiently find multiple approximate mapping positions
	44	between two sets of long sequences, such as between reads and reference
	45	genomes, between genomes and between long noisy reads. Minimap has an indexing
	46	and a mapping phase. In the indexing phase, it collects all minimizers of a
	47	large batch of target sequences in a hash table; in the mapping phase, it
	48	identifies good clusters of colinear minimizer hits. Minimap does not generate
	49	detailed alignments between the target and the query sequences. It only outputs
	50	the approximate start and the end coordinates of these clusters.
	51
	52	.SH OPTIONS
	53
	54	.SS Indexing options
	55
	56	.TP 10
	57	.BI -k \ INT
	58	Minimizer k-mer length [15]
	59
	60	.TP
	61	.BI -w \ INT
	62	Minimizer window size [2/3 of k-mer length]. A minimizer is the smallest k-mer
	63	in a window of w consecutive k-mers.
	64
	65	.TP
	66	.BI -I \ NUM
	67	Load at most
	68	.I NUM
	69	target bases into RAM for indexing [4G]. If there are more than
	70	.I NUM
	71	bases in
	72	.IR target.fa ,
	73	minimap needs to read
	74	.I query.fa
	75	multiple times to map it against each batch of target sequences.
	76	.I NUM
	77	may end with k/K/m/M/g/G.
	78
	79	.TP
	80	.BI -d \ FILE
	81	Dump minimizer index to
	82	.I FILE
	83	[no dump]
	84
	85	.TP
	86	.B -l
	87	Indicate that
	88	.I target.fa
	89	is in fact a minimizer index generated by option
	90	.BR -d ,
	91	not a FASTA or FASTQ file.
	92
	93	.SS Mapping options
	94
	95	.TP 10
	96	.BI -f \ FLOAT
	97	Ignore top
	98	.I FLOAT
	99	fraction of most occurring minimizers [0.001]
	100
	101	.TP
	102	.BI -r \ INT
	103	Approximate bandwidth for initial minimizer hits clustering [500]. A
	104	.I minimizer hit
	105	is a minimizer present in both the target and query sequences. A
	106	.I minimizer hit cluster
	107	is a group of potentially colinear minimizer hits between a target and a query
	108	sequence.
	109
	110	.TP
	111	.BI -m \ FLOAT
	112	Merge initial minimizer hit clusters if
	113	.I FLOAT
	114	or higher fraction of minimizers are shared between the clusters [0.5]
	115
	116	.TP
	117	.BI -c \ INT
	118	Retain a minimizer hit cluster if it contains
	119	.I INT
	120	or more minimizer hits [4]
	121
	122	.TP
	123	.BI -L \ INT
	124	Discard a minimizer hit cluster if after colinearization, the number of matching bases is below
	125	.I INT
	126	[40]. This option mainly reduces the size of output. It has little effect on
	127	the speed and peak memory.
	128
	129	.TP
	130	.BI -g \ INT
	131	Split a minimizer hit cluster at a gap
	132	.IR INT -bp
	133	or longer that does not contain any minimizer hits [10000]
	134
	135	.TP
	136	.BI -T \ INT
	137	Mask regions on query sequences with SDUST score threshold
	138	.IR INT ;
	139	0 to disable [0]. SDUST is an algorithm
	140	to identify low-complexity subsequences. It is not enabled by default. If SDUST
	141	is preferred, a value between 20 and 25 is recommended. A higher threshold masks
	142	fewer sequences.
	143
	144	.TP
	145	.B -S
	146	Perform all-vs-all mapping. In this mode, if the query sequence name is
	147	lexicographically larger than the target sequence name, the hits between them
	148	will be suppressed; if the query sequence name is the same as the target name,
	149	diagonal minimizer hits will also be suppressed.
	150
	151	.TP
	152	.B -O
	153	Drop a minimizer hit if it is far away from other hits (EXPERIMENTAL). This
	154	option is useful for mapping long chromosomes from two diverged species.
	155
	156	.TP
	157	.BI -x \ STR
	158	Changing multiple settings based on
	159	.I STR
	160	[not set]. It is recommended to apply this option before other options, such
	161	that the following options may override the multiple settings modified by this
	162	option.
	163
	164	.RS
	165	.TP 8
	166	.B ava10k
	167	for PacBio or Oxford Nanopore all-vs-all read mapping (-Sw5 -L100 -m0).
	168	.RE
	169
	170	.SS Input/output options
	171
	172	.TP 10
	173	.BI -t \ INT
	174	Number of threads [3]. Minimap uses at most three threads when collecting
	175	minimizers on target sequences, and uses up to
	176	.IR INT +1
	177	threads when mapping (the extra thread is for I/O, which is frequently idle and
	178	takes little CPU time).
	179
	180	.TP
	181	.B -V
	182	Print version number to stdout
	183
	184	.SH OUTPUT FORMAT
	185
	186	.PP
	187	Minimap outputs mapping positions in the Pairwise mApping Format (PAF). PAF is
	188	a TAB-delimited text format with each line consisting of at least 12 fields as
	189	are described in the following table:
	190
	191	.TS
	192	center box;
	193	cb \| cb \| cb
	194	r \| c \| l .
	195	Col Type Description
	196	_
	197	1 string Query sequence name
	198	2 int Query sequence length
	199	3 int Query start coordinate (0-based)
	200	4 int Query end coordinate (0-based)
	201	5 char `+' if query and target on the same strand; `-' if opposite
	202	6 string Target sequence name
	203	7 int Target sequence length
	204	8 int Target start coordinate on the original strand
	205	9 int Target end coordinate on the original strand
	206	10 int Number of matching bases in the mapping
	207	11 int Number of bases, including gaps, in the mapping
	208	12 int Mapping quality (0-255 with 255 for missing)
	209	.TE
	210
	211	.PP
	212	When the alignment is available, column 11 gives the total number of sequence
	213	matches, mismatches and gaps in the alignment; column 10 divided by column 11
	214	gives the alignment identity. As minimap does not generate detailed alignment,
	215	these two columns are approximate. PAF may optionally have additional fields in
	216	the SAM-like typed key-value format. Minimap writes the number of minimizer
	217	hits in a cluster to the cm tag.
	218
	219	.SH SEE ALSO
	220	.PP
	221	miniasm(1)

+104

-0

third_party/minimap-0.2/minimap.h less more

	0	#ifndef MINIMAP_H
	1	#define MINIMAP_H
	2
	3	#include <stdint.h>
	4	#include <stdio.h>
	5	#include <sys/types.h>
	6	#include "bseq.h"
	7
	8	#define MM_IDX_DEF_B 14
	9	#define MM_DEREP_Q50 5.0
	10
	11	#define MM_F_WITH_REP 0x1
	12	#define MM_F_NO_SELF 0x2
	13	#define MM_F_NO_ISO 0x4
	14	#define MM_F_AVA 0x8
	15
	16	typedef struct {
	17	uint64_t x, y;
	18	} mm128_t;
	19
	20	typedef struct { size_t n, m; mm128_t *a; } mm128_v;
	21	typedef struct { size_t n, m; uint64_t *a; } uint64_v;
	22	typedef struct { size_t n, m; uint32_t *a; } uint32_v;
	23
	24	typedef struct {
	25	mm128_v a; // (minimizer, position) array
	26	int32_t n; // size of the _p_ array
	27	uint64_t *p; // position array for minimizers appearing >1 times
	28	void *h; // hash table indexing _p_ and minimizers appearing once
	29	} mm_idx_bucket_t;
	30
	31	typedef struct {
	32	int b, w, k;
	33	uint32_t n; // number of reference sequences
	34	mm_idx_bucket_t *B;
	35	uint32_t max_occ;
	36	float freq_thres;
	37	int32_t *len; // length of each reference sequence
	38	char **name; // TODO: if this uses too much RAM, switch one concatenated string
	39	} mm_idx_t;
	40
	41	typedef struct {
	42	uint32_t cnt:31, rev:1;
	43	uint32_t rid:31, rep:1;
	44	uint32_t len;
	45	int32_t qs, qe, rs, re;
	46	} mm_reg1_t;
	47
	48	typedef struct {
	49	int radius; // bandwidth to cluster hits
	50	int max_gap; // break a chain if there are no minimizers in a max_gap window
	51	int min_cnt; // minimum number of minimizers to start a chain
	52	int min_match;
	53	int sdust_thres; // score threshold for SDUST; 0 to disable
	54	int flag; // see MM_F_* macros
	55	float merge_frac; // merge two chains if merge_frac fraction of minimzers are shared between the chains
	56	} mm_mapopt_t;
	57
	58	extern int mm_verbose;
	59	extern double mm_realtime0;
	60
	61	struct mm_tbuf_s;
	62	typedef struct mm_tbuf_s mm_tbuf_t;
	63
	64	#ifdef __cplusplus
	65	extern "C" {
	66	#endif
	67
	68	// compute minimizers
	69	void mm_sketch(const char str, int len, int w, int k, uint32_t rid, mm128_v p);
	70
	71	// minimizer indexing
	72	mm_idx_t *mm_idx_init(int w, int k, int b);
	73	void mm_idx_destroy(mm_idx_t *mi);
	74	mm_idx_t mm_idx_gen(bseq_file_t fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name);
	75	void mm_idx_set_max_occ(mm_idx_t *mi, float f);
	76	const uint64_t mm_idx_get(const mm_idx_t mi, uint64_t minier, int *n);
	77
	78	mm_idx_t mm_idx_build(const char fn, int w, int k, int n_threads);
	79
	80	// minimizer index I/O
	81	void mm_idx_dump(FILE fp, const mm_idx_t mi);
	82	mm_idx_t mm_idx_load(FILE fp);
	83
	84	// mapping
	85	void mm_mapopt_init(mm_mapopt_t *opt);
	86	mm_tbuf_t *mm_tbuf_init(void);
	87	void mm_tbuf_destroy(mm_tbuf_t *b);
	88	const mm_reg1_t mm_map(const mm_idx_t mi, int l_seq, const char seq, int n_regs, mm_tbuf_t b, const mm_mapopt_t opt, const char *name);
	89
	90	int mm_map_file(const mm_idx_t idx, const char fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size);
	91
	92	// private functions (may be moved to a "mmpriv.h" in future)
	93	double cputime(void);
	94	double realtime(void);
	95	void radix_sort_128x(mm128_t beg, mm128_t end);
	96	void radix_sort_64(uint64_t beg, uint64_t end);
	97	uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
	98
	99	#ifdef __cplusplus
	100	}
	101	#endif
	102
	103	#endif

+26

-0

third_party/minimap-0.2/misc.c less more

	0	#include <sys/resource.h>
	1	#include <sys/time.h>
	2	#include "minimap.h"
	3
	4	int mm_verbose = 3;
	5	double mm_realtime0;
	6
	7	double cputime()
	8	{
	9	struct rusage r;
	10	getrusage(RUSAGE_SELF, &r);
	11	return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);
	12	}
	13
	14	double realtime()
	15	{
	16	struct timeval tp;
	17	struct timezone tzp;
	18	gettimeofday(&tp, &tzp);
	19	return tp.tv_sec + tp.tv_usec * 1e-6;
	20	}
	21
	22	#include "ksort.h"
	23	#define sort_key_128x(a) ((a).x)
	24	KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
	25	KSORT_INIT_GENERIC(uint32_t)

+209

-0

third_party/minimap-0.2/sdust.c less more

	0	#include <string.h>
	1	#include <stdint.h>
	2	#include <stdio.h>
	3	#include "kdq.h"
	4	#include "kvec.h"
	5	#include "sdust.h"
	6
	7	#define SD_WLEN 3
	8	#define SD_WTOT (1<<(SD_WLEN<<1))
	9	#define SD_WMSK (SD_WTOT - 1)
	10
	11	typedef struct {
	12	int start, finish;
	13	int r, l;
	14	} perf_intv_t;
	15
	16	typedef kvec_t(perf_intv_t) perf_intv_v;
	17	typedef kvec_t(uint64_t) uint64_v;
	18
	19	KDQ_INIT(int)
	20
	21	#if defined(_NO_NT4_TBL) \|\| defined(_SDUST_MAIN)
	22	unsigned char seq_nt4_table[256] = {
	23	0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	24	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	25	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	26	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	27	4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
	28	4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	29	4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
	30	4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	31	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	32	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	33	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	34	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	35	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	36	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	37	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	38	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
	39	};
	40	#else
	41	extern unsigned char seq_nt4_table[256];
	42	#endif
	43
	44	struct sdust_buf_s {
	45	kdq_t(int) *w;
	46	perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish
	47	uint64_v res; // the result
	48	};
	49
	50	sdust_buf_t *sdust_buf_init(void)
	51	{
	52	sdust_buf_t *buf;
	53	buf = (sdust_buf_t*)calloc(1, sizeof(sdust_buf_t));
	54	buf->w = kdq_init(int);
	55	return buf;
	56	}
	57
	58	void sdust_buf_destroy(sdust_buf_t *buf)
	59	{
	60	if (buf == 0) return;
	61	kdq_destroy(int, buf->w);
	62	free(buf->P.a); free(buf->res.a);
	63	free(buf);
	64	}
	65
	66	static inline void shift_window(int t, kdq_t(int) w, int T, int W, int L, int rw, int rv, int cw, int cv)
	67	{
	68	int s;
	69	if (kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3?
	70	s = *kdq_shift(int, w);
	71	*rw -= --cw[s];
	72	if (*L > kdq_size(w))
	73	--L, rv -= --cv[s];
	74	}
	75	kdq_push(int, w, t);
	76	++*L;
	77	*rw += cw[t]++;
	78	*rv += cv[t]++;
	79	if (cv[t] * 10 > T<<1) {
	80	do {
	81	s = kdq_at(w, kdq_size(w) - *L);
	82	*rv -= --cv[s];
	83	--*L;
	84	} while (s != t);
	85	}
	86	}
	87
	88	static inline void save_masked_regions(uint64_v res, perf_intv_v P, int start)
	89	{
	90	int i, saved = 0;
	91	perf_intv_t *p;
	92	if (P->n == 0 \|\| P->a[P->n - 1].start >= start) return;
	93	p = &P->a[P->n - 1];
	94	if (res->n) {
	95	int s = res->a[res->n - 1]>>32, f = (uint32_t)res->a[res->n - 1];
	96	if (p->start <= f) // if overlapping with or adjacent to the previous interval
	97	saved = 1, res->a[res->n - 1] = (uint64_t)s<<32 \| (f > p->finish? f : p->finish);
	98	}
	99	if (!saved) kv_push(uint64_t, *res, (uint64_t)p->start<<32\|p->finish);
	100	for (i = P->n - 1; i >= 0 && P->a[i].start < start; --i); // remove perfect intervals that have falled out of the window
	101	P->n = i + 1;
	102	}
	103
	104	static void find_perfect(perf_intv_v P, const kdq_t(int) w, int T, int start, int L, int rv, const int *cv)
	105	{
	106	int c[SD_WTOT], r = rv, i, max_r = 0, max_l = 0;
	107	memcpy(c, cv, SD_WTOT * sizeof(int));
	108	for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) {
	109	int j, t = kdq_at(w, i), new_r, new_l;
	110	r += c[t]++;
	111	new_r = r, new_l = kdq_size(w) - i - 1;
	112	if (new_r * 10 > T * new_l) {
	113	for (j = 0; j < P->n && P->a[j].start >= i + start; ++j) { // find insertion position
	114	perf_intv_t *p = &P->a[j];
	115	if (max_r == 0 \|\| p->r * max_l > max_r * p->l)
	116	max_r = p->r, max_l = p->l;
	117	}
	118	if (max_r == 0 \|\| new_r * max_l >= max_r * new_l) { // then insert
	119	max_r = new_r, max_l = new_l;
	120	if (P->n == P->m) kv_resize(perf_intv_t, *P, P->n + 1);
	121	memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room
	122	++P->n;
	123	P->a[j].start = i + start, P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start;
	124	P->a[j].r = new_r, P->a[j].l = new_l;
	125	}
	126	}
	127	}
	128	}
	129
	130	const uint64_t sdust_core(const uint8_t seq, int l_seq, int T, int W, int n, sdust_buf_t buf)
	131	{
	132	int rv = 0, rw = 0, L = 0, cv[SD_WTOT], cw[SD_WTOT];
	133	int i, start, l; // _start_: start of the current window; _l_: length of a contiguous A/C/G/T (sub)sequence
	134	unsigned t; // current word
	135
	136	buf->P.n = buf->res.n = 0;
	137	buf->w->front = buf->w->count = 0;
	138	memset(cv, 0, SD_WTOT * sizeof(int));
	139	memset(cw, 0, SD_WTOT * sizeof(int));
	140	if (l_seq < 0) l_seq = strlen((const char*)seq);
	141	for (i = l = t = 0; i <= l_seq; ++i) {
	142	int b = i < l_seq? seq_nt4_table[seq[i]] : 4;
	143	if (b < 4) { // an A/C/G/T base
	144	++l, t = (t<<2 \| b) & SD_WMSK;
	145	if (l >= SD_WLEN) { // we have seen a word
	146	start = (l - W > 0? l - W : 0) + (i + 1 - l); // set the start of the current window
	147	save_masked_regions(&buf->res, &buf->P, start); // save intervals falling out of the current window?
	148	shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv);
	149	if (rw * 10 > L * T)
	150	find_perfect(&buf->P, buf->w, T, start, L, rv, cv);
	151	}
	152	} else { // N or the end of sequence; N effectively breaks input into pieces of independent sequences
	153	start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l);
	154	while (buf->P.n) save_masked_regions(&buf->res, &buf->P, start++); // clear up unsaved perfect intervals
	155	l = t = 0;
	156	}
	157	}
	158	*n = buf->res.n;
	159	return buf->res.a;
	160	}
	161
	162	uint64_t sdust(const uint8_t seq, int l_seq, int T, int W, int *n)
	163	{
	164	uint64_t *ret;
	165	sdust_buf_t *buf;
	166	buf = sdust_buf_init();
	167	ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf);
	168	buf->res.a = 0;
	169	sdust_buf_destroy(buf);
	170	return ret;
	171	}
	172
	173	#ifdef _SDUST_MAIN
	174	#include <zlib.h>
	175	#include <stdio.h>
	176	#include <unistd.h>
	177	#include "kseq.h"
	178	KSEQ_INIT(gzFile, gzread)
	179
	180	int main(int argc, char *argv[])
	181	{
	182	gzFile fp;
	183	kseq_t *ks;
	184	int W = 64, T = 20, c;
	185
	186	while ((c = getopt(argc, argv, "w:t:")) >= 0) {
	187	if (c == 'w') W = atoi(optarg);
	188	else if (c == 't') T = atoi(optarg);
	189	}
	190	if (optind == argc) {
	191	fprintf(stderr, "Usage: sdust [-w %d] [-t %d] <in.fa>\n", W, T);
	192	return 1;
	193	}
	194	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
	195	ks = kseq_init(fp);
	196	while (kseq_read(ks) >= 0) {
	197	uint64_t *r;
	198	int i, n;
	199	r = sdust((uint8_t*)ks->seq.s, -1, T, W, &n);
	200	for (i = 0; i < n; ++i)
	201	printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]);
	202	free(r);
	203	}
	204	kseq_destroy(ks);
	205	gzclose(fp);
	206	return 0;
	207	}
	208	#endif

+23

-0

third_party/minimap-0.2/sdust.h less more

	0	#ifndef SDUST_H
	1	#define SDUST_H
	2
	3	struct sdust_buf_s;
	4	typedef struct sdust_buf_s sdust_buf_t;
	5
	6	#ifdef __cplusplus
	7	extern "C" {
	8	#endif
	9
	10	// the simple interface
	11	uint64_t sdust(const uint8_t seq, int l_seq, int T, int W, int *n);
	12
	13	// the following interface dramatically reduce heap allocations when sdust is frequently called.
	14	sdust_buf_t *sdust_buf_init(void);
	15	void sdust_buf_destroy(sdust_buf_t *buf);
	16	const uint64_t sdust_core(const uint8_t seq, int l_seq, int T, int W, int n, sdust_buf_t buf);
	17
	18	#ifdef __cplusplus
	19	}
	20	#endif
	21
	22	#endif

+102

-0

third_party/minimap-0.2/sketch.c less more

	0	#include <stdio.h>
	1	#include <stdlib.h>
	2	#include <assert.h>
	3	#include <string.h>
	4	#include "kvec.h"
	5	#include "minimap.h"
	6
	7	unsigned char seq_nt4_table[256] = {
	8	0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	9	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	10	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	11	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	12	4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
	13	4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	14	4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
	15	4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	16	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	17	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	18	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	19	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	20	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	21	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	22	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	23	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
	24	};
	25
	26	static inline uint64_t hash64(uint64_t key, uint64_t mask)
	27	{
	28	key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
	29	key = key ^ key >> 24;
	30	key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
	31	key = key ^ key >> 14;
	32	key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
	33	key = key ^ key >> 28;
	34	key = (key + (key << 31)) & mask;
	35	return key;
	36	}
	37
	38	/**
	39	* Find symmetric (w,k)-minimizers on a DNA sequence
	40	*
	41	* @param str DNA sequence
	42	* @param len length of $str
	43	* @param w find a minimizer for every $w consecutive k-mers
	44	* @param k k-mer size
	45	* @param rid reference ID; will be copied to the output $p array
	46	* @param p minimizers; p->a[i].x is the 2k-bit hash value;
	47	* p->a[i].y = rid<<32 \| lastPos<<1 \| strand
	48	* where lastPos is the position of the last base of the i-th minimizer,
	49	* and strand indicates whether the minimizer comes from the top or the bottom strand.
	50	* Callers may want to set "p->n = 0"; otherwise results are appended to p
	51	*/
	52	void mm_sketch(const char str, int len, int w, int k, uint32_t rid, mm128_v p)
	53	{
	54	uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
	55	int i, j, l, buf_pos, min_pos;
	56	mm128_t *buf, min = { UINT64_MAX, UINT64_MAX };
	57
	58	assert(len > 0 && w > 0 && k > 0);
	59	buf = (mm128_t)alloca(w 16);
	60	memset(buf, 0xff, w * 16);
	61
	62	for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
	63	int c = seq_nt4_table[(uint8_t)str[i]];
	64	mm128_t info = { UINT64_MAX, UINT64_MAX };
	65	if (c < 4) { // not an ambiguous base
	66	int z;
	67	kmer[0] = (kmer[0] << 2 \| c) & mask; // forward k-mer
	68	kmer[1] = (kmer[1] >> 2) \| (3ULL^c) << shift1; // reverse k-mer
	69	if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
	70	z = kmer[0] < kmer[1]? 0 : 1; // strand
	71	if (++l >= k)
	72	info.x = hash64(kmer[z], mask), info.y = (uint64_t)rid<<32 \| (uint32_t)i<<1 \| z;
	73	} else l = 0;
	74	buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
	75	if (l == w + k - 1) { // special case for the first window - because identical k-mers are not stored yet
	76	for (j = buf_pos + 1; j < w; ++j)
	77	if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]);
	78	for (j = 0; j < buf_pos; ++j)
	79	if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]);
	80	}
	81	if (info.x <= min.x) { // a new minimum; then write the old min
	82	if (l >= w + k) kv_push(mm128_t, *p, min);
	83	min = info, min_pos = buf_pos;
	84	} else if (buf_pos == min_pos) { // old min has moved outside the window
	85	if (l >= w + k - 1) kv_push(mm128_t, *p, min);
	86	for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
	87	if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
	88	for (j = 0; j <= buf_pos; ++j)
	89	if (min.x >= buf[j].x) min = buf[j], min_pos = j;
	90	if (l >= w + k - 1) { // write identical k-mers
	91	for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
	92	if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]);
	93	for (j = 0; j <= buf_pos; ++j)
	94	if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]);
	95	}
	96	}
	97	if (++buf_pos == w) buf_pos = 0;
	98	}
	99	if (min.x != UINT64_MAX)
	100	kv_push(mm128_t, *p, min);
	101	}