Commit bd7747be2d9f51a600d548e18498731326440b44 - abyss

+1

-1

.circleci/config.yml less more

1	1	jobs:
2	2	build:
3	3	docker:
4		- image: ubuntu:xenial
	4	- image: ubuntu:bionic
5	5	steps:
6	6	- run: \|
7	7	apt-get update -qq

+22

-0

.github/ISSUE_TEMPLATE.rb less more

	0	# Please report
	1
	2	- [ ] version of ABySS with `abyss-pe version`
	3	- [ ] distribution of Linux with `lsb_release -d`
	4
	5	# Assembly error
	6
	7	- [ ] complete `abyss-pe` command line
	8	- [ ] last 20 lines of the output of `abyss-pe`
	9	- [ ] number of sequenced bases
	10	- [ ] estimated genome size and ploidy
	11	- [ ] estimated sequencing depth of coverage
	12
	13	# Build error
	14
	15	Consider installing ABySS using [Linuxbrew](https://linuxbrew.sh) on Linux or [Homebrew](https://brew.sh) on macOS with `brew install abyss`, or using [Bioconda](https://bioconda.github.io) with `conda install abyss`.
	16
	17	- [ ] Have you tried installing ABySS using Brew or Bioconda?
	18	- [ ] version of GCC or compiler with `gcc --version`
	19	- [ ] complete `./configure` command line
	20	- [ ] last 20 lines of the output of `./configure`
	21	- [ ] last 20 lines of the output of `make`

+9

-0

ABYSS/abyss.cc less more

71	71
72	72	AssemblyAlgorithms::setCoverageParameters(
73	73	AssemblyAlgorithms::coverageHistogram(g));
	74
	75	if (opt::kc > 0) {
	76	cout << "Minimum k-mer multiplicity kc is " << opt::kc << endl;
	77	cout << "Removing low-multiplicity k-mers" << endl;
	78	size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(g, opt::kc);
	79	cout << "Removed " << removed
	80	<< " low-multiplicity k-mers, " << g.size()
	81	<< " k-mers remaining" << std::endl;
	82	}
74	83
75	84	cout << "Generating adjacency" << endl;
76	85	AssemblyAlgorithms::generateAdjacency(&g);

+2

-1

Assembly/BranchGroup.h less more

1	1	#define BRANCHGROUP_H 1
2	2
3	3	#include "Common/Algorithms.h"
	4	#include "Common/Exception.h"
4	5	#include <algorithm> // for swap
5	6	#include <map>
6	7	#include <utility>

210	211
211	212	namespace std {
212	213	template <>
213		inline void swap(BranchGroup&, BranchGroup&) { assert(false); }
	214	inline void swap(BranchGroup&, BranchGroup&) NOEXCEPT { assert(false); }
214	215	}
215	216
216	217	#endif

+3

-1

Assembly/BranchRecordBase.h less more

0	0	#ifndef ASSEMBLY_BRANCHRECORDBASE_H
1	1	#define ASSEMBLY_BRANCHRECORDBASE_H 1
	2
	3	#include "Common/Exception.h"
2	4
3	5	#include <algorithm>
4	6	#include <cassert>

171	173
172	174	namespace std {
173	175	template <>
174		inline void swap(BranchRecord& a, BranchRecord& b)
	176	inline void swap(BranchRecord& a, BranchRecord& b) NOEXCEPT
175	177	{
176	178	a.swap(b);
177	179	}

+16

-0

Assembly/CoverageAlgorithm.h less more

111	111	}
112	112	}
113	113
	114	/** Remove all k-mers with multiplicity lower than the given threshold */
	115	static inline
	116	size_t applyKmerCoverageThreshold(SequenceCollectionHash& c, unsigned kc)
	117	{
	118	if (kc == 0)
	119	return 0;
	120
	121	for (SequenceCollectionHash::iterator it = c.begin();
	122	it != c.end(); ++it) {
	123	if (it->second.getMultiplicity() < kc)
	124	it->second.setFlag(SF_DELETE);
	125	}
	126
	127	return c.cleanup();
	128	}
	129
114	130	} // namespace AssemblyAlgorithms
115	131
116	132	#endif

+9

-1

Assembly/Options.cc less more

53	53	" -t, --trim-length=N maximum length of blunt contigs to trim [k]\n"
54	54	" -c, --coverage=FLOAT remove contigs with mean k-mer coverage\n"
55	55	" less than this threshold\n"
	56	" --kc=N remove all k-mers with multiplicity < N [0]\n"
56	57	" -b, --bubbles=N pop bubbles shorter than N bp [3*k]\n"
57	58	" -b0, --no-bubbles do not pop bubbles\n"
58	59	" -e, --erode=N erode bases at the ends of blunt contigs with coverage\n"

106	107	/** Coverage cutoff. */
107	108	float coverage = -1;
108	109
	110	/** Minimum k-mer multiplicity cutoff. */
	111	unsigned kc = 0;
	112
109	113	/** Pop bubbles shorter than N bp. */
110	114	int bubbleLen = -1;
111	115

146	150
147	151	static const char shortopts[] = "b:c:e:E:g:k:K:mo:Q:q:s:t:v";
148	152
149		enum { OPT_HELP = 1, OPT_VERSION, COVERAGE_HIST, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
	153	enum { OPT_HELP = 1, OPT_VERSION, COVERAGE_HIST, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES, OPT_KC };
150	154
151	155	static const struct option longopts[] = {
152	156	{ "out", required_argument, NULL, 'o' },

163	167	{ "SS", no_argument, &opt::ss, 1 },
164	168	{ "no-SS", no_argument, &opt::ss, 0 },
165	169	{ "coverage", required_argument, NULL, 'c' },
	170	{ "kc", required_argument, NULL, OPT_KC },
166	171	{ "coverage-hist", required_argument, NULL, COVERAGE_HIST },
167	172	{ "bubble-length", required_argument, NULL, 'b' },
168	173	{ "no-bubbles", no_argument, &opt::bubbleLen, 0 },

301	306	case OPT_SPECIES:
302	307	arg >> opt::metaVars[2];
303	308	break;
	309	case OPT_KC:
	310	arg >> opt::kc;
	311	break;
304	312	}
305	313	if (optarg != NULL && !arg.eof()) {
306	314	cerr << PROGRAM ": invalid option: `-"

+1

-0

Assembly/Options.h less more

13	13	extern unsigned erodeStrand;
14	14	extern unsigned trimLen;
15	15	extern float coverage;
	16	extern unsigned kc;
16	17	extern unsigned bubbleLen;
17	18	extern unsigned ss;
18	19	extern bool maskCov;

+8

-0

Bloom/BloomFilter.h less more

12	12	#include <vector>
13	13	#include <iostream>
14	14	#include <boost/dynamic_bitset.hpp>
	15
	16	/*
	17	* Put `BloomFilter` class in `Konnector` namespace to avoid collision with BTL
	18	* `BloomFilter` class of the same name.
	19	*/
	20	namespace Konnector {
15	21
16	22	/** A Bloom filter. */
17	23	class BloomFilter

164	170	char* m_array;
165	171	};
166	172
	173	} // end Konnector namespace
	174
167	175	#endif

+9

-9

Bloom/BloomFilterWindow.h less more

13	13	* A bloom filter that represents a window
14	14	* within a larger bloom filter.
15	15	*/
16		class BloomFilterWindow : public BloomFilter
	16	class BloomFilterWindow : public Konnector::BloomFilter
17	17	{
18	18	public:
19	19
20	20	/** Constructor. */
21		BloomFilterWindow() : BloomFilter() { };
	21	BloomFilterWindow() : Konnector::BloomFilter() { };
22	22
23	23	/** Constructor.
24	24	*

28	28	*/
29	29	BloomFilterWindow(size_t fullBloomSize, size_t startBitPos,
30	30	size_t endBitPos, size_t hashSeed=0) :
31		BloomFilter(endBitPos - startBitPos + 1, hashSeed),
	31	Konnector::BloomFilter(endBitPos - startBitPos + 1, hashSeed),
32	32	m_fullBloomSize(fullBloomSize),
33	33	m_startBitPos(startBitPos),
34	34	m_endBitPos(endBitPos)

62	62	/** Return the size of the bit array. */
63	63	size_t size() const
64	64	{
65		return BloomFilter::size();
	65	return Konnector::BloomFilter::size();
66	66	}
67	67
68	68	/** Return the number of elements with count >= max_count. */
69	69	size_t popcount() const
70	70	{
71		return BloomFilter::popcount();
	71	return Konnector::BloomFilter::popcount();
72	72	}
73	73
74	74	/** Return the estimated false positive rate */
75	75	double FPR() const
76	76	{
77		return BloomFilter::FPR();
	77	return Konnector::BloomFilter::FPR();
78	78	}
79	79
80	80	/** Return whether the specified bit is set. */
81	81	bool operator[](size_t i) const
82	82	{
83	83	if (i >= m_startBitPos && i <= m_endBitPos)
84		return BloomFilter::operator[](i - m_startBitPos);
	84	return Konnector::BloomFilter::operator[](i - m_startBitPos);
85	85	return false;
86	86	}
87	87

95	95	void insert(size_t i)
96	96	{
97	97	if (i >= m_startBitPos && i <= m_endBitPos)
98		BloomFilter::insert(i - m_startBitPos);
	98	Konnector::BloomFilter::insert(i - m_startBitPos);
99	99	}
100	100
101	101	/** Add the object to this set. */

142	142
143	143	if (m_size != bits) {
144	144	if (readOp == BITWISE_OVERWRITE) {
145		BloomFilter::resize(bits);
	145	Konnector::BloomFilter::resize(bits);
146	146	} else {
147	147	std::cerr << "error: can't union/intersect bloom filters with "
148	148	<< "different sizes\n";

+4

-4

Bloom/CascadingBloomFilter.h less more

21	21	{
22	22	m_data.reserve(max_count);
23	23	for (unsigned i = 0; i < max_count; i++)
24		m_data.push_back(new BloomFilter(n, hashSeed));
	24	m_data.push_back(new Konnector::BloomFilter(n, hashSeed));
25	25	}
26	26
27	27	/** Destructor */
28	28	~CascadingBloomFilter()
29	29	{
30		typedef std::vector<BloomFilter*>::iterator Iterator;
	30	typedef std::vector<Konnector::BloomFilter*>::iterator Iterator;
31	31	for (Iterator i = m_data.begin(); i != m_data.end(); i++) {
32	32	assert(*i != NULL);
33	33	delete *i;

90	90	}
91	91
92	92	/** Get the Bloom filter for a given level */
93		BloomFilter& getBloomFilter(unsigned level)
	93	Konnector::BloomFilter& getBloomFilter(unsigned level)
94	94	{
95	95	assert(m_data.at(level) != NULL);
96	96	return *m_data.at(level);

111	111
112	112	private:
113	113	size_t m_hashSeed;
114		std::vector<BloomFilter*> m_data;
	114	std::vector<Konnector::BloomFilter*> m_data;
115	115
116	116	};
117	117

+12

-11

Bloom/bloom.cc less more

204	204	{ NULL, 0, NULL, 0 }
205	205	};
206	206
207		void dieWithUsageError()
	207	__attribute__((noreturn))
	208	static void dieWithUsageError()
208	209	{
209	210	cerr << "Try `" << PROGRAM
210	211	<< " --help' for more information.\n";

417	418	if (opt::windows == 0) {
418	419
419	420	if (opt::levels == 1) {
420		BloomFilter bloom(bits, opt::hashSeed);
	421	Konnector::BloomFilter bloom(bits, opt::hashSeed);
421	422	#ifdef _OPENMP
422		ConcurrentBloomFilter<BloomFilter>
	423	ConcurrentBloomFilter<Konnector::BloomFilter>
423	424	cbf(bloom, opt::numLocks, opt::hashSeed);
424	425	loadFilters(cbf, argc, argv);
425	426	#else

639	640	string outputPath(argv[optind]);
640	641	optind++;
641	642
642		BloomFilter bloom;
	643	Konnector::BloomFilter bloom;
643	644
644	645	for (int i = optind; i < argc; i++) {
645	646	string path(argv[i]);

694	695	dieWithUsageError();
695	696	}
696	697
697		BloomFilter bloom;
	698	Konnector::BloomFilter bloom;
698	699	string path = argv[optind];
699	700
700	701	if (opt::verbose)

743	744	std::cerr << "Computing distance for 2"
744	745	<< " samples...\n";
745	746	// Get both paths and open istreams
746		BloomFilter bloomA;
	747	Konnector::BloomFilter bloomA;
747	748	string pathA(argv[optind]);
748		BloomFilter bloomB;
	749	Konnector::BloomFilter bloomB;
749	750	string pathB(argv[optind+1]);
750	751	if (opt::verbose)
751	752	std::cerr << "Loading bloom filters from "

844	845
845	846	int memberOf(int argc, char ** argv){
846	847	// Initialise bloom and get globals
847		BloomFilter bloom;
	848	Konnector::BloomFilter bloom;
848	849	parseGlobalOpts(argc, argv);
849	850	// Arg parser to get `m' option in case set
850	851	for (int c; (c = getopt_long(argc, argv,

928	929	/**
929	930	* Calculate number of bases to trim from left end of sequence.
930	931	*/
931		int calcLeftTrim(const Sequence& seq, unsigned k, const BloomFilter& bloom,
	932	int calcLeftTrim(const Sequence& seq, unsigned k, const Konnector::BloomFilter& bloom,
932	933	size_t minBranchLen)
933	934	{
934	935	// Boost graph interface for Bloom filter
935		DBGBloom<BloomFilter> g(bloom);
	936	DBGBloom<Konnector::BloomFilter> g(bloom);
936	937
937	938	// if this is the first k-mer we have found in
938	939	// Bloom filter, starting from the left end

999	1000	cerr << "Loading bloom filter from `"
1000	1001	<< bloomPath << "'...\n";
1001	1002
1002		BloomFilter bloom;
	1003	Konnector::BloomFilter bloom;
1003	1004	istream *in = openInputStream(bloomPath);
1004	1005	assert_good(*in, bloomPath);
1005	1006	bloom.read(*in);

+7

-7

BloomDBG/HashAgnosticCascadingBloom.h less more

41	41	{
42	42	m_data.reserve(levels);
43	43	for (unsigned i = 0; i < levels; i++)
44		m_data.push_back(new BTL::BloomFilter(size, hashes, k));
	44	m_data.push_back(new BloomFilter(size, hashes, k));
45	45	}
46	46
47	47	/**
48		* Constructor to load a single-level BTL::BloomFilter from
49		* files. This is used to make BTL::BloomFilter support the
	48	* Constructor to load a single-level BloomFilter from
	49	* files. This is used to make BloomFilter support the
50	50	* same interface as HashAgnosticCascadingBloom.
51	51	*/
52	52	HashAgnosticCascadingBloom(const string& bloomPath)

137	137	}
138	138
139	139	/** Get the Bloom filter for a given level */
140		BTL::BloomFilter& getBloomFilter(unsigned level)
	140	BloomFilter& getBloomFilter(unsigned level)
141	141	{
142	142	assert(m_data.at(level) != NULL);
143	143	return *m_data.at(level);

158	158	void loadFilter(const string& bloomPath)
159	159	{
160	160	clear();
161		BTL::BloomFilter* bloom = new BTL::BloomFilter(bloomPath);
	161	BloomFilter* bloom = new BloomFilter(bloomPath);
162	162	m_k = bloom->getKmerSize();
163	163	m_hashes = bloom->getHashNum();
164	164	m_data.push_back(bloom);

171	171	{
172	172	m_k = 0;
173	173	m_hashes = 0;
174		typedef std::vector<BTL::BloomFilter*>::iterator Iterator;
	174	typedef std::vector<BloomFilter*>::iterator Iterator;
175	175	for (Iterator i = m_data.begin(); i != m_data.end(); i++) {
176	176	assert(*i != NULL);
177	177	delete *i;

184	184	/** number of hash functions */
185	185	unsigned m_hashes;
186	186	/** the array of Bloom filters */
187		std::vector<BTL::BloomFilter*> m_data;
	187	std::vector<BloomFilter*> m_data;
188	188	};
189	189
190	190	#endif

+2

-0

BloomDBG/LightweightKmer.h less more

0	0	#ifndef LIGHTWEIGHT_KMER_H
1	1	#define LIGHTWEIGHT_KMER_H 1
	2
	3	#include "BloomDBG/MaskedKmer.h"
2	4
3	5	#include <algorithm>
4	6	#include <cstring>

+1

-1

BloomDBG/Makefile.am less more

25	25	RollingHashIterator.h \
26	26	SpacedSeed.h \
27	27	$(top_srcdir)/lib/bloomfilter/BloomFilter.hpp \
28		$(top_srcdir)/lib/rolling-hash/rolling.h
	28	$(top_srcdir)/lib/nthash/nthash.hpp

+4

-6

BloomDBG/RollingBloomDBG.h less more

44	44	: m_kmer(kmer), m_rollingHash(rollingHash) {}
45	45
46	46	const LightweightKmer& kmer() const { return m_kmer; };
	47	LightweightKmer& kmer() { return m_kmer; };
	48
47	49	const RollingHash& rollingHash() const { return m_rollingHash; }
48	50
49	51	RollingBloomDBGVertex clone() const {

62	64
63	65	void setLastBase(extDirection dir, char base)
64	66	{
65		const unsigned k = Kmer::length();
66		if (dir == SENSE) {
67		m_rollingHash.setBase(m_kmer.c_str(), k-1, base);
68		} else {
69		m_rollingHash.setBase(m_kmer.c_str(), 0, base);
70		}
	67	m_rollingHash.setLastBase(kmer().c_str(), dir, base);
	68	kmer().setLastBase(dir, base);
71	69	}
72	70
73	71	/**

+61

-143

BloomDBG/RollingHash.h less more

1	1	#define ABYSS_ROLLING_HASH_H 1
2	2
3	3	#include "config.h"
4		#include "lib/rolling-hash/rolling.h"
	4
	5	#include "BloomDBG/LightweightKmer.h"
5	6	#include "BloomDBG/MaskedKmer.h"
	7	#include "Common/Sense.h"
	8	#include "lib/nthash/nthash.hpp"
	9
	10	#include <algorithm>
6	11	#include <string>
7	12	#include <vector>
8	13	#include <cassert>

60	65	*/
61	66	void reset(const std::string& kmer)
62	67	{
	68	/* compute initial hash values for forward and reverse-complement k-mer */
	69	NTC64(kmer.c_str(), m_k, m_hash1, m_rcHash1);
	70
	71	/* get canonical hash value from forward/reverse hash values */
	72	m_hash = canonicalHash(m_hash1, m_rcHash1);
	73
63	74	if (!MaskedKmer::mask().empty())
64		resetMasked(kmer.c_str());
65		else
66		resetUnmasked(kmer);
67		}
68
69		/**
70		* Initialize hash values from current k-mer. When computing the hash
71		* value, mask out "don't care" positions as per the active
72		* k-mer mask.
73		*/
74		void resetMasked(const char* kmer)
75		{
76		const std::string& spacedSeed = MaskedKmer::mask();
77		assert(spacedSeed.length() == m_k);
78
79		/* compute first hash function for k-mer */
80		uint64_t hash1 = getFhval(m_hash1, spacedSeed.c_str(), kmer, m_k);
81
82		/* compute first hash function for reverse complement of k-mer */
83		uint64_t rcHash1 = getRhval(m_rcHash1, spacedSeed.c_str(), kmer, m_k);
84
85		m_hash = canonicalHash(hash1, rcHash1);
86		}
87
88		/**
89		* Initialize hash values from sequence.
90		* @param kmer k-mer used to initialize hash state
91		*/
92		void resetUnmasked(const std::string& kmer)
93		{
94		/* compute first hash function for k-mer */
95		m_hash1 = getFhval(kmer.c_str(), m_k);
96
97		/* compute first hash function for reverse complement
98		* of k-mer */
99		m_rcHash1 = getRhval(kmer.c_str(), m_k);
100
101		m_hash = canonicalHash(m_hash1, m_rcHash1);
	75	m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
	76	kmer.c_str(), m_k);
102	77	}
103	78
104	79	/**

109	84	*/
110	85	void rollRight(const char* kmer, char charIn)
111	86	{
112		if (!MaskedKmer::mask().empty())
113		rollRightMasked(kmer, charIn);
114		else
115		rollRightUnmasked(kmer, charIn);
116		}
117
118		/**
119		* Compute hash values for next k-mer to the right and
120		* update internal state. When computing the new hash, mask
121		* out "don't care" positions according to the active
122		* k-mer mask.
123		* @param kmer current k-mer
124		* @param nextKmer k-mer we are rolling into
125		*/
126		void rollRightMasked(const char* kmer, char charIn)
127		{
128		const std::string& spacedSeed = MaskedKmer::mask();
129		m_hash = rollHashesRight(m_hash1, m_rcHash1, spacedSeed.c_str(),
130		kmer, charIn, m_k);
131		}
132
133		/**
134		* Compute hash values for next k-mer to the right and
135		* update internal state.
136		* @param kmer current k-mer
137		* @param nextKmer k-mer we are rolling into
138		*/
139		void rollRightUnmasked(const char* kmer, char charIn)
140		{
141		/* update first hash function */
142		rollHashesRight(m_hash1, m_rcHash1, kmer[0], charIn, m_k);
143		m_hash = canonicalHash(m_hash1, m_rcHash1);
	87	NTC64(kmer[0], charIn, m_k, m_hash1, m_rcHash1);
	88	m_hash = canonicalHash(m_hash1, m_rcHash1);
	89
	90	if (!MaskedKmer::mask().empty()) {
	91	// TODO: copying the k-mer and shifting is very inefficient;
	92	// we need a specialized nthash function that rolls and masks
	93	// simultaneously
	94	LightweightKmer next(kmer);
	95	next.shift(SENSE, charIn);
	96	m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
	97	next.c_str(), m_k);
	98	}
144	99	}
145	100
146	101	/**

151	106	*/
152	107	void rollLeft(char charIn, const char* kmer)
153	108	{
154		if (!MaskedKmer::mask().empty())
155		rollLeftMasked(charIn, kmer);
156		else
157		rollLeftUnmasked(charIn, kmer);
158		}
159
160		/**
161		* Compute hash values for next k-mer to the left and
162		* update internal state. When computing the new hash, mask
163		* out "don't care" positions according to the active
164		* k-mer mask.
165		* @param prevKmer k-mer we are rolling into
166		* @param kmer current k-mer
167		*/
168		void rollLeftMasked(char charIn, const char* kmer)
169		{
170		const std::string& spacedSeed = MaskedKmer::mask();
171		m_hash = rollHashesLeft(m_hash1, m_rcHash1, spacedSeed.c_str(),
172		kmer, charIn, m_k);
173		}
174
175		/**
176		* Compute hash values for next k-mer to the left and
177		* update internal state.
178		* @param prevKmer k-mer we are rolling into
179		* @param kmer current k-mer
180		*/
181		void rollLeftUnmasked(char charIn, const char* kmer)
182		{
183		/* update first hash function */
184		rollHashesLeft(m_hash1, m_rcHash1, charIn, kmer[m_k-1], m_k);
185		m_hash = canonicalHash(m_hash1, m_rcHash1);
	109	NTC64L(kmer[m_k-1], charIn, m_k, m_hash1, m_rcHash1);
	110	m_hash = canonicalHash(m_hash1, m_rcHash1);
	111
	112	if (!MaskedKmer::mask().empty()) {
	113	// TODO: copying the k-mer and shifting is very inefficient;
	114	// we need a specialized nthash function that rolls and masks
	115	// simultaneously
	116	LightweightKmer next(kmer);
	117	next.shift(ANTISENSE, charIn);
	118	m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
	119	next.c_str(), m_k);
	120	}
186	121	}
187	122
188	123	/**

202	137	*/
203	138	void getHashes(size_t hashes[]) const
204	139	{
205		uint64_t tmpHashes[MAX_HASHES];
206		multiHash(tmpHashes, m_hash, m_numHashes, m_k);
207		for (unsigned i = 0; i < m_numHashes; ++i) {
208		hashes[i] = (size_t)tmpHashes[i];
209		}
	140	for (unsigned i = 0; i < m_numHashes; ++i)
	141	hashes[i] = NTE64(m_hash, m_k, i);
210	142	}
211	143
212	144	/** Equality operator */

229	161	}
230	162
231	163	/**
232		* Set the base at a given position in the k-mer and update the hash
233		* value accordingly.
	164	* Change the hash value to reflect a change in the first/last base of
	165	* the k-mer.
234	166	* @param kmer point to the k-mer char array
235		* @param pos position of the base to be changed
	167	* @param dir if SENSE, change last base; if ANTISENSE,
	168	* change first base
236	169	* @param base new value for the base
237	170	*/
238		void setBase(char* kmer, unsigned pos, char base)
239		{
	171	void setLastBase(char* kmer, extDirection dir, char base)
	172	{
	173	if (dir == SENSE) {
	174	/* roll left to remove old last char */
	175	NTC64L(kmer[m_k-1], 'A', m_k, m_hash1, m_rcHash1);
	176	/* roll right to add new last char */
	177	NTC64('A', base, m_k, m_hash1, m_rcHash1);
	178	} else {
	179	/* roll right to remove old first char */
	180	NTC64(kmer[0], 'A', m_k, m_hash1, m_rcHash1);
	181	/* roll left to add new first char */
	182	NTC64L('A', base, m_k, m_hash1, m_rcHash1);
	183	}
	184	m_hash = canonicalHash(m_hash1, m_rcHash1);
	185
240	186	if (!MaskedKmer::mask().empty())
241		setBaseMasked(kmer, pos, base);
242		else
243		setBaseUnmasked(kmer, pos, base);
244		}
245
246		/**
247		* Set the base at a given position in the k-mer and update the hash
248		* value accordingly.
249		* @param kmer point to the k-mer char array
250		* @param pos position of the base to be changed
251		* @param base new value for the base
252		*/
253		void setBaseMasked(char* kmer, unsigned pos, char base)
254		{
255		const std::string& spacedSeed = MaskedKmer::mask();
256		assert(spacedSeed.length() == m_k);
257		m_hash = ::setBase(m_hash1, m_rcHash1, spacedSeed.c_str(), kmer,
258		pos, base, m_k);
259		}
260
261		/**
262		* Set the base at a given position in the k-mer and update the hash
263		* value accordingly.
264		* @param kmer point to the k-mer char array
265		* @param pos position of the base to be changed
266		* @param base new value for the base
267		*/
268		void setBaseUnmasked(char* kmer, unsigned pos, char base)
269		{
270		m_hash = ::setBase(m_hash1, m_rcHash1, kmer, pos, base, m_k);
	187	m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
	188	kmer, m_k);
271	189	}
272	190
273	191	private:

+1

-1

BloomDBG/bloom-dbg.cc less more

227	227	HashAgnosticCascadingBloom solidKmerSet;
228	228
229	229	/* empty visited k-mers Bloom filter */
230		BTL::BloomFilter visitedKmerSet;
	230	BloomFilter visitedKmerSet;
231	231
232	232	/* counters for progress messages */
233	233	BloomDBG::AssemblyCounters counters;

+3

-3

BloomDBG/bloom-dbg.h less more

573	573	if (!assembledKmerSet.contains(*fwd))
574	574	break;
575	575	}
576		if (fwd.pos() > 0)
	576	if (fwd != RollingHashIterator::end() && fwd.pos() > 0)
577	577	seq.erase(0, fwd.pos());
578	578
579	579	/* trim previously assembled k-mers from end of sequence */

583	583	if (!assembledKmerSet.contains(*rev))
584	584	break;
585	585	}
586		if (rev.pos() > 0)
	586	if (rev != RollingHashIterator::end() && rev.pos() > 0)
587	587	rcSeq.erase(0, rev.pos());
588	588
589	589	/* flip seq back to original orientation */

833	833	std::ostream& out)
834	834	{
835	835	/* k-mers in previously assembled contigs */
836		BTL::BloomFilter visitedKmerSet(solidKmerSet.size(),
	836	BloomFilter visitedKmerSet(solidKmerSet.size(),
837	837	solidKmerSet.getHashNum(), solidKmerSet.getKmerSize());
838	838
839	839	/* counters for progress messages */

+31

-0

ChangeLog less more

	0	2018-09-11 Ben Vandervalk <benv@bcgsc.ca>
	1
	2	* Release version 2.1.1
	3
	4	abyss-bloom-dbg:
	5	* upgrade to most recent version of ntHash to reduce
	6	some assembly/hashing artifacts. On a human assembly, this
	7	reduced QUAST major misassemblies by 5% and increased
	8	scaffold contiguity by 10%
	9	* `kc` parameter now also applies to MPI assemblies (see below)
	10
	11	abyss-fac:
	12	* change N20 and N80 to N25 and N75, respectively
	13
	14	ABYSS-P:
	15	* add `--kc` option, which implements a hard minimum k-mer
	16	multiplicity cutoff
	17
	18	abyss-pe:
	19	* fix `zsh: no such option: pipefail` error with
	20	old versions of `zsh` (fallback to `bash` instead)
	21	* adding `time=1` now times all assembly commands
	22
	23	abyss-sealer:
	24	* parallelize gap sealing with OpenMP (thanks to
	25	@schutzekatze!)
	26	* add `--gap-file` option (thanks to @schutzekatze!)
	27
	28	DistanceEst:
	29	* add support for GFA output
	30
0	31	2018-04-13 Ben Vandervalk <benv@bcgsc.ca>
1	32
2	33	* Release version 2.1.0

+2

-1

Common/Estimate.h less more

1	1	#define ESTIMATE_H 1
2	2
3	3	#include "Common/ContigProperties.h" // for Distance
	4	#include "Common/Exception.h"
4	5	#include "ContigID.h"
5	6	#include "ContigNode.h"
6	7	#include "Graph/Options.h" // for opt::k

197	198
198	199	namespace std {
199	200	template<>
200		inline void swap(EstimateRecord&, EstimateRecord&)
	201	inline void swap(EstimateRecord&, EstimateRecord&) NOEXCEPT
201	202	{
202	203	assert(false);
203	204	}

+11

-0

Common/Exception.h less more

	0	#ifndef _EXCEPTION_H_
	1	#define _EXCEPTION_H_ 1
	2
	3	/* `noexcept` is only recognized in C++11 or later. See https://stackoverflow.com/questions/24567173/backwards-compatible-noexceptfalse-for-destructors */
	4	#if __cplusplus >= 201103L
	5	#define NOEXCEPT noexcept
	6	#else
	7	#define NOEXCEPT
	8	#endif
	9
	10	#endif

+8

-7

Common/Histogram.h less more

0	0	#ifndef HISTOGRAM_H
1	1	#define HISTOGRAM_H 1
2	2
	3	#include "Common/Exception.h"
3	4	#include "StringUtil.h" // for toEng
4	5	#include "VectorUtil.h" // for make_vector
5	6	#include <cassert>

311	312
312	313	namespace std {
313	314	template<>
314		inline void swap(Histogram&, Histogram&) { assert(false); }
	315	inline void swap(Histogram&, Histogram&) NOEXCEPT { assert(false); }
315	316	}
316	317
317	318	/** Print assembly contiguity statistics header. */

328	329	out << "LG50" << sep
329	330	<< "NG50" << sep;
330	331	return out << "min" << sep
331		<< "N80" << sep
	332	<< "N75" << sep
332	333	<< "N50" << sep
333		<< "N20" << sep
	334	<< "N25" << sep
334	335	<< "E-size" << sep
335	336	<< "max" << sep
336	337	<< "sum" << sep

363	364	}
364	365	return out
365	366	<< toEng(h.minimum()) << sep
366		<< toEng(h.weightedPercentile(1 - 0.8)) << sep
	367	<< toEng(h.weightedPercentile(1 - 0.75)) << sep
367	368	<< toEng(n50) << sep
368		<< toEng(h.weightedPercentile(1 - 0.2)) << sep
	369	<< toEng(h.weightedPercentile(1 - 0.25)) << sep
369	370	<< toEng((unsigned)h.expectedValue()) << sep
370	371	<< toEng(h.maximum()) << sep
371	372	<< toEng(sum);

385	386	<< h.size()
386	387	<< h.count(n50, INT_MAX)
387	388	<< h.minimum()
388		<< h.weightedPercentile(1 - 0.8)
	389	<< h.weightedPercentile(1 - 0.75)
389	390	<< n50
390		<< h.weightedPercentile(1 - 0.2)
	391	<< h.weightedPercentile(1 - 0.25)
391	392	<< (unsigned)h.expectedValue()
392	393	<< h.maximum()
393	394	<< sum;

+1

-0

Common/Makefile.am less more

12	12	ContigProperties.h \
13	13	Dictionary.h \
14	14	Estimate.h \
	15	Exception.h \
15	16	Fcontrol.cpp Fcontrol.h \
16	17	Functional.h \
17	18	Hash.h \

+2

-1

Common/PMF.h less more

0	0	#ifndef PMF_H
1	1	#define PMF_H 1
2	2
	3	#include "Common/Exception.h"
3	4	#include "Histogram.h"
4	5	#include <cassert>
5	6	#include <cmath>

62	63
63	64	namespace std {
64	65	template<>
65		inline void swap(PMF&, PMF&) { assert(false); }
	66	inline void swap(PMF&, PMF&) NOEXCEPT { assert(false); }
66	67	}
67	68
68	69	#endif

+4

-1

Common/SAM.h less more

171	171	case 'I': case 'X': case '=':
172	172	qlen += len;
173	173	clip1 += len;
	174	// fallthrough
174	175	case 'D': case 'N': case 'P':
175	176	if (a.align_length == 0) {
176	177	// Ignore a malformatted CIGAR string whose first

187	188	clip1 = 0;
188	189	break;
189	190	}
	191	// fallthrough
190	192	case 'H': case 'S':
191	193	qlen += len;
192	194	clip1 += len;

321	323	#if SAM_SEQ_QUAL
322	324	out << '\t' << o.seq
323	325	<< '\t' << o.qual;
	326	// note: leading tab ('\t') is already embedded in o.tags
324	327	if (!o.tags.empty())
325		out << '\t' << o.tags;
	328	out << o.tags;
326	329	#else
327	330	out << "\t\t";
328	331	#endif

+1

-1

DataLayer/FastaReader.h less more

130	130	/** The sequence */
131	131	Sequence seq;
132	132
133		FastaRecord() { }
	133	FastaRecord() : anchor(0) { }
134	134	FastaRecord(const std::string& id, const std::string& comment,
135	135	const Sequence& seq)
136	136	: id(id), comment(comment), anchor(0), seq(seq) { }

+4

-4

DataLayer/fac.cc less more

112	112	cout << "n:NG50" << sep
113	113	<< "NG50" << sep;
114	114	cout << "min" << sep
115		<< "N80" << sep
	115	<< "N75" << sep
116	116	<< "N50" << sep
117		<< "N20" << sep
	117	<< "N25" << sep
118	118	<< "E-size" << sep
119	119	<< "max" << sep
120	120	<< "sum" << sep

129	129	cout << "n:NG50" << sep
130	130	<< "NG50" << sep;
131	131	cout << "min" << sep
132		<< "N80" << sep
	132	<< "N75" << sep
133	133	<< "N50" << sep
134		<< "N20" << sep
	134	<< "N25" << sep
135	135	<< "E-size" << sep
136	136	<< "max" << sep
137	137	<< "sum" << sep

+22

-3

DistanceEst/DistanceEst.cpp less more

60	60	" (maximum likelihood estimator)\n"
61	61	" --mean use the difference of the population mean\n"
62	62	" and the sample mean\n"
63		" --dist output graph in dist format [default]\n"
64		" --dot output graph in dot format\n"
	63	" --dist output the graph in dist format [default]\n"
	64	" --dot output the graph in GraphViz format\n"
	65	" --gv output the graph in GraphViz format\n"
	66	" --gfa output the graph in GFA2 format\n"
	67	" --gfa2 output the graph in GFA2 format\n"
65	68	" -j, --threads=N use N parallel threads [1]\n"
66	69	" -v, --verbose display verbose output\n"
67	70	" --help display this help and exit\n"

118	121	static const struct option longopts[] = {
119	122	{ "dist", no_argument, &opt::format, DIST, },
120	123	{ "dot", no_argument, &opt::format, DOT, },
	124	{ "gv", no_argument, &opt::format, DOT, },
	125	{ "gfa", no_argument, &opt::format, GFA2, },
	126	{ "gfa2", no_argument, &opt::format, GFA2, },
121	127	{ "fr", no_argument, &opt::rf, false },
122	128	{ "rf", no_argument, &opt::rf, true },
123	129	{ "min-align", required_argument, NULL, 'l' },

274	280	if (opt::format == DOT) {
275	281	#pragma omp critical(out)
276	282	out << get(g_contigNames, e) << " [" << est << "]\n";
	283	} else if (opt::format == GFA2) {
	284	// Output only one of the two complementary edges.
	285	if (len1 < opt::seedLen \|\| e.first < e.second \|\| e.first == e.second)
	286	#pragma omp critical(out)
	287	out << "G\t*"
	288	<< '\t' << get(g_contigNames, e.first)
	289	<< '\t' << get(g_contigNames, e.second)
	290	<< '\t' << est.distance
	291	<< '\t' << (int)ceilf(est.stdDev)
	292	<< "\tFC:i:" << est.numPairs
	293	<< '\n';
277	294	} else
278	295	out << ' ' << get(g_contigNames, id1) << ',' << est;
279	296	} else if (opt::verbose > 1) {

316	333	const PairsMap& x = dataMap[sense0 ^ opt::rf];
317	334	for (PairsMap::const_iterator it = x.begin();
318	335	it != x.end(); ++it)
319		writeEstimate(opt::format == DOT ? out : ss,
	336	writeEstimate(opt::format == DIST ? ss : out,
320	337	ContigNode(id0, sense0), it->first,
321	338	len0, lengthVec[it->first.id()],
322	339	it->second, pmf);

501	518	"k=" << opt::k << " "
502	519	"s=" << opt::seedLen << " "
503	520	"n=" << opt::npairs << "]\n";
	521	else if (opt::format == GFA2)
	522	out << "H\tVN:Z:2.0\n";
504	523
505	524	vector<int> vals = make_vector<int>()
506	525	<< opt::k

+12

-3

Graph/GfaIO.h less more

216	216	}
217	217
218	218	/** Read a graph in GFA format. */
219		template <typename Graph>
220		std::istream& read_gfa(std::istream& in, Graph& g)
	219	template <typename Graph, typename BetterEP>
	220	std::istream& read_gfa(std::istream& in, Graph& g, BetterEP betterEP)
221	221	{
222	222	assert(in);
223	223
224	224	typedef typename graph_traits<Graph>::vertex_descriptor V;
225	225	typedef typename vertex_property<Graph>::type VP;
	226	typedef typename graph_traits<Graph>::edge_descriptor E;
226	227	typedef typename edge_property<Graph>::type EP;
227	228
228	229	// Add vertices if this graph is empty.

350	351	assert(in);
351	352	V u = find_vertex(uname, g);
352	353	V v = find_vertex(vname, g);
353		add_edge(u, v, ep, g);
	354	E e;
	355	bool found;
	356	boost::tie(e, found) = edge(u, v, g);
	357	if (found) {
	358	// Parallel edge
	359	EP& ref = g[e];
	360	ref = betterEP(ref, ep);
	361	} else
	362	add_edge(u, v, ep, g);
354	363	break;
355	364	}
356	365

+1

-1

Graph/GraphIO.h less more

66	66	case 'T': // HT: ASQG format
67	67	return read_asqg(in, g);
68	68	case '\t': // H: GFA format
69		return read_gfa(in, g);
	69	return read_gfa(in, g, betterEP);
70	70	default:
71	71	std::cerr << "Unknown file format: `H" << c << "'\n";
72	72	exit(EXIT_FAILURE);

+1

-0

Konnector/konnector.cc less more

32	32	#endif
33	33
34	34	using namespace std;
	35	using Konnector::BloomFilter;
35	36
36	37	#define PROGRAM "konnector"
37	38

+1

-0

Konnector/konnector.h less more

814	814	result = ES_EXTENDED_TO_CYCLE;
815	815	else
816	816	result = ES_INTERNAL_CYCLE;
	817	break;
817	818	case DEAD_END:
818	819	result = ES_DEAD_END;
819	820	break;

+1

-1

Makefile.am less more

56	56	Sealer \
57	57	AdjList \
58	58	lib/bloomfilter \
59		lib/rolling-hash \
	59	lib/nthash \
60	60	$(GTest) \
61	61	$(UnitTest)
62	62

+0

-22

MergePaths/MergeContigs.cpp less more

640	640	printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH)
641	641	<< '\t' << opt::out << '\n';
642	642	}
643		#if 0
644		// assembly contiguity statistics
645		vector<int> vals = passContiguityStatsVal(lengthHistogram,200);
646		vector<string> keys = make_vector<string>()
647		<< "n"
648		<< "n200"
649		<< "nN50"
650		<< "min"
651		<< "N80"
652		<< "N50"
653		<< "N20"
654		<< "Esize"
655		<< "max"
656		<< "sum"
657		<< "nNG50"
658		<< "NG50";
659
660		if (!opt::db.empty()) {
661		for (unsigned a=0; a<vals.size(); a++)
662		addToDb(db, keys[a], vals[a]);
663		}
664		#endif
665	643	return 0;
666	644	}

+1

-1

Misc/samtobreak.hs less more

278	278	where
279	279	help = putStr (usageInfo usage options) >> exitSuccess
280	280	tryHelp = "Try 'abyss-samtobreak --help' for more information."
281		version = "abyss-samtobreak (ABySS) 2.1.0\n"
	281	version = "abyss-samtobreak (ABySS) 2.1.1\n"
282	282	usage = "Usage: samtobreak [OPTION]... [FILE]...\n\
283	283	\Calculate contig and scaffold contiguity and correctness metrics.\n"
284	284

+48

-2

Parallel/NetworkSequenceCollection.cpp less more

97	97	= AssemblyAlgorithms::coverageHistogram(m_data);
98	98	Histogram h(m_comm.reduce(myh.toVector()));
99	99	AssemblyAlgorithms::setCoverageParameters(h);
	100
	101	/*
	102	* If we have loaded the k-mer hash table from disk, then
	103	* the adjacency data for each k-mer has already been computed
	104	* in the hash table.
	105	*
	106	* `applyKmerCoverageThreshold` would not work correctly
	107	* in this case because it removes k-mer records from
	108	* the hash table without attempting to update the adjacency
	109	* info of neighbouring k-mers.
	110	*/
	111	if (!m_data.isAdjacencyLoaded() && opt::kc > 0) {
	112	m_comm.barrier();
	113	size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(m_data, opt::kc);
	114	logger(1) << "Removed " << removed
	115	<< " low multiplicity k-mers" << std::endl;
	116	m_comm.reduce(removed);
	117	m_comm.reduce(m_data.size());
	118	}
	119
100	120	EndState();
101	121	SetState(NAS_WAITING);
102	122	break;
103	123	}
104	124	case NAS_GEN_ADJ:
105	125	m_comm.barrier();
	126
106	127	m_numBasesAdjSet = 0;
107	128	AssemblyAlgorithms::generateAdjacency(this);
108	129	EndState();

473	494	= AssemblyAlgorithms::coverageHistogram(m_data);
474	495	Histogram h(m_comm.reduce(myh.toVector()));
475	496	AssemblyAlgorithms::setCoverageParameters(h);
476		EndState();
477
	497
	498	/*
	499	* If we have loaded the k-mer hash table from disk, then
	500	* the adjacency data for each k-mer has already been computed
	501	* in the hash table.
	502	*
	503	* `applyKmerCoverageThreshold` would not work correctly
	504	* in this case because it removes k-mer records from
	505	* the hash table without attempting to update the adjacency
	506	* info of neighbouring k-mers.
	507	*/
	508	if (!m_data.isAdjacencyLoaded() && opt::kc > 0) {
	509	cout << "Minimum k-mer multiplicity kc is "
	510	<< opt::kc << '\n';
	511	cout << "Removing low multiplicity k-mers..." << std::endl;
	512	m_comm.barrier();
	513	size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(m_data, opt::kc);
	514	logger(1) << "Removed " << removed
	515	<< " low multiplicity k-mers" << std::endl;
	516	size_t sumRemoved = m_comm.reduce(removed);
	517	size_t remaining = m_comm.reduce(m_data.size());
	518	cout << "Removed " << sumRemoved
	519	<< " low-multiplicity k-mers, " << remaining
	520	<< " k-mers remaining" << std::endl;
	521	}
	522
	523	EndState();
478	524	SetState(m_data.isAdjacencyLoaded()
479	525	? NAS_ERODE : NAS_GEN_ADJ);
480	526	break;

+11

-4

README.md less more

41	41
42	42	Install [Linuxbrew](http://linuxbrew.sh/), and run the command
43	43
44		brew install brewsci/bio/abyss
	44	brew install abyss
45	45
46	46	## Install ABySS on macOS
47	47
48	48	Install [Homebrew](https://brew.sh/), and run the command
49	49
50		brew install brewsci/bio/abyss
	50	brew install abyss
51	51
52	52	## Install ABySS on Windows
53	53
54	54	Install [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/) and [Linuxbrew](http://linuxbrew.sh/), and run the command
55	55
56		brew install brewsci/bio/abyss
	56	brew install abyss
57	57
58	58	## Install ABySS on Debian or Ubuntu
59	59

96	96	- [ARCS](https://github.com/bcgsc/arcs) to scaffold
97	97	- [Tigmint](https://github.com/bcgsc/tigmint) to correct assembly errors
98	98
99		brew install brewsci/bio/arcs brewsci/bio/links
	99	brew install brewsci/bio/arcs brewsci/bio/links-scaffolder
100	100
101	101	## Optional dependencies
102	102

187	187	installed ABySS in `/opt/abyss`, add `/opt/abyss/bin` to your `PATH`:
188	188
189	189	PATH=/opt/abyss/bin:$PATH
	190
	191	Before starting an assembly
	192	===========================
	193
	194	ABySS stores temporary files in `TMPDIR`, which is `/tmp` by default on most systems. If your default temporary disk volume is too small, set `TMPDIR` to a larger volume, such as `/var/tmp` or your home directory.
	195
	196	export TMPDIR=/var/tmp
190	197
191	198	Assembling a paired-end library
192	199	===============================

+2

-2

Scaffold/scaffold.cc less more

653	653	<< "n200"
654	654	<< "nN50"
655	655	<< "min"
656		<< "N80"
	656	<< "N75"
657	657	<< "N50"
658		<< "N20"
	658	<< "N25"
659	659	<< "Esize"
660	660	<< "max"
661	661	<< "sum"

+56

-17

Sealer/sealer.cc less more

45	45	#endif
46	46
47	47	using namespace std;
	48	using Konnector::BloomFilter;
48	49	#if USESEQAN
49	50	using namespace seqan;
50	51	#endif

109	110	" -s, --search-mem=N mem limit for graph searches; multiply by the\n"
110	111	" number of threads (-j) to get the total mem used\n"
111	112	" for graph traversal [500M]\n"
	113	" -g, --gap-file=FILE write sealed gaps to FILE\n"
112	114	" -t, --trace-file=FILE write graph search stats to FILE\n"
113	115	" -v, --verbose display verbose output\n"
114	116	" --help display this help and exit\n"

203	205	/** Output file for graph search stats */
204	206	static string tracefilePath;
205	207
	208	/** Output file for sealed gaps */
	209	static string gapfilePath;
	210
206	211	/** Mask bases not in flanks */
207	212	static int mask = 0;
208	213

235	240	size_t skipped;
236	241	};
237	242
238		static const char shortopts[] = "S:L:b:B:d:ef:F:G:i:Ij:k:lm:M:no:P:q:r:s:t:v";
	243	static const char shortopts[] = "S:L:b:B:d:ef:F:G:g:i:Ij:k:lm:M:no:P:q:r:s:t:v";
239	244
240	245	enum { OPT_HELP = 1, OPT_VERSION };
241	246

272	277	{ "read-name", required_argument, NULL, 'r' },
273	278	{ "search-mem", required_argument, NULL, 's' },
274	279	{ "trace-file", required_argument, NULL, 't' },
	280	{ "gap-file", required_argument, NULL, 'g' },
275	281	{ "verbose", no_argument, NULL, 'v' },
276	282	{ "help", no_argument, NULL, OPT_HELP },
277	283	{ "version", no_argument, NULL, OPT_VERSION },

414	420	string merge(const Graph& g,
415	421	unsigned k,
416	422	const Gap& gap,
417		FastaRecord &read1,
418		FastaRecord &read2,
	423	const FastaRecord &read1,
	424	const FastaRecord &read2,
419	425	const ConnectPairsParams& params,
420	426	Counters& g_count,
421	427	ofstream& traceStream)

571	577	map<FastaRecord, map<FastaRecord, Gap> > &flanks,
572	578	unsigned &gapsclosed,
573	579	ofstream &logStream,
574		ofstream &traceStream)
	580	ofstream &traceStream,
	581	ofstream &gapStream)
575	582	{
576	583	map<FastaRecord, map<FastaRecord, Gap> >::iterator read1_it;
577	584	map<FastaRecord, Gap>::iterator read2_it;
578	585	unsigned uniqueGapsClosed = 0;
579		bool success;
580	586
581	587	Counters g_count;
582	588	g_count.noStartOrGoalKmer = 0;

596	602
597	603	printLog(logStream, "Flanks inserted into k run = " + IntToString(flanks.size()) + "\n");
598	604
599		for (read1_it = flanks.begin(); read1_it != flanks.end();) {
600		success = false;
	605	int counter = 0;
	606	vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator> flanks_closed;
	607	#pragma omp parallel private(read1_it, read2_it) firstprivate(counter)
	608	for (read1_it = flanks.begin(); read1_it != flanks.end(); ++read1_it) {
601	609	FastaRecord read1 = read1_it->first;
602		for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); read2_it++) {
	610	bool success = false;
	611	for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); ++read2_it, ++counter) {
	612	#if _OPENMP
	613	if (counter % omp_get_num_threads() != omp_get_thread_num())
	614	continue;
	615	#endif
603	616	FastaRecord read2 = read2_it->first;
604	617
605	618	int startposition = read2_it->second.gapStart();
606	619	string tempSeq = merge(g, k, read2_it->second, read1, read2, params, g_count, traceStream);
607	620	if (!tempSeq.empty()) {
608	621	success = true;
	622	#pragma omp critical (allmerged)
609	623	allmerged[read1.id.substr(0,read1.id.length()-2)][startposition]
610	624	= ClosedGap(read2_it->second, tempSeq);
611		//#pragma omp atomic
612		gapsclosed++;
613		//#pragma omp atomic
614		uniqueGapsClosed++;
615		if (gapsclosed % 100 == 0)
	625	#pragma omp atomic
	626	++uniqueGapsClosed;
	627	#pragma omp critical (gapsclosed)
	628	if (++gapsclosed % 100 == 0)
616	629	printLog(logStream, IntToString(gapsclosed) + " gaps closed so far\n");
	630
	631	if (!opt::gapfilePath.empty())
	632	#pragma omp critical (gapStream)
	633	gapStream << ">" << read1.id.substr(0,read1.id.length()-2)
	634	<< "_" << read2_it->second.gapStart() << "-" << read2_it->second.gapEnd()
	635	<< " LN:i:" << tempSeq.length() << '\n'
	636	<< tempSeq << '\n';
617	637	}
618	638	}
619	639	if (success) {
620		flanks.erase(read1_it++);
	640	#pragma omp critical (flanks_closed)
	641	flanks_closed.push_back(read1_it);
621	642	}
622		else
623		read1_it++;
	643	}
	644
	645	for (vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator>::iterator it = flanks_closed.begin();
	646	it != flanks_closed.end(); ++it) {
	647	if (flanks.count((*it)->first) > 0)
	648	flanks.erase(*it);
624	649	}
625	650
626	651	printLog(logStream, IntToString(uniqueGapsClosed) + " unique gaps closed for k" + IntToString(k) + "\n");

770	795	opt::searchMem = SIToBytes(arg); break;
771	796	case 't':
772	797	arg >> opt::tracefilePath; break;
	798	case 'g':
	799	arg >> opt::gapfilePath; break;
773	800	case 'v':
774	801	opt::verbose++; break;
775	802	case OPT_HELP:

871	898	assert(traceStream.is_open());
872	899	ConnectPairsResult::printHeaders(traceStream);
873	900	assert_good(traceStream, opt::tracefilePath);
	901	}
	902
	903	ofstream gapStream;
	904	if (!opt::gapfilePath.empty()) {
	905	gapStream.open(opt::gapfilePath.c_str());
	906	assert(gapStream.is_open());
	907	assert_good(gapStream, opt::gapfilePath);
874	908	}
875	909
876	910	string logOutputPath(opt::outputPrefix);

1019	1053	temp = "Starting K run with k = " + IntToString(opt::k) + "\n";
1020	1054	printLog(logStream, temp);
1021	1055
1022		kRun(params, opt::k, g, allmerged, flanks, gapsclosed, logStream, traceStream);
	1056	kRun(params, opt::k, g, allmerged, flanks, gapsclosed, logStream, traceStream, gapStream);
1023	1057
1024	1058	temp = "k" + IntToString(opt::k) + " run complete\n"
1025	1059	+ "Total gaps closed so far = " + IntToString(gapsclosed) + "\n\n";

1072	1106	traceStream.close();
1073	1107	}
1074	1108
	1109	if (!opt::gapfilePath.empty()) {
	1110	assert_good(gapStream, opt::gapfilePath);
	1111	gapStream.close();
	1112	}
	1113
1075	1114	return 0;
1076	1115	}

+4

-4

Unittest/BloomDBG/BloomDBGTest.cpp less more

8	8	#include <iostream>
9	9
10	10	using namespace std;
11		typedef RollingBloomDBG<BTL::BloomFilter> Graph;
	11	typedef RollingBloomDBG<BloomFilter> Graph;
12	12	typedef graph_traits<Graph> GraphTraits;
13	13
14	14	/* each vertex is represented by

59	59	* GACTCGG
60	60	*/
61	61
62		BTL::BloomFilter bloom1(bloomSize, numHashes, k);
	62	BloomFilter bloom1(bloomSize, numHashes, k);
63	63
64	64	RollingHash("GACTC", numHashes, k).getHashes(hashes);
65	65	bloom1.insert(hashes);

94	94	* GACTCGG
95	95	*/
96	96
97		BTL::BloomFilter bloom2(bloomSize, numHashes, k);
	97	BloomFilter bloom2(bloomSize, numHashes, k);
98	98
99	99	RollingHash("GACTC", numHashes, k).getHashes(hashes);
100	100	bloom2.insert(hashes);

130	130	* ACTCG
131	131	*/
132	132
133		BTL::BloomFilter bloom3(bloomSize, numHashes, k);
	133	BloomFilter bloom3(bloomSize, numHashes, k);
134	134
135	135	RollingHash("TACTC", numHashes, k).getHashes(hashes);
136	136	bloom2.insert(hashes);

+4

-4

Unittest/BloomDBG/RollingBloomDBGTest.cpp less more

7	7	using namespace std;
8	8	using namespace boost;
9	9
10		typedef RollingBloomDBG<BTL::BloomFilter> Graph;
	10	typedef RollingBloomDBG<BloomFilter> Graph;
11	11	typedef graph_traits<Graph> GraphTraits;
12	12	typedef graph_traits<Graph>::vertex_descriptor V;
13	13

19	19	const unsigned m_k;
20	20	const unsigned m_bloomSize;
21	21	const unsigned m_numHashes;
22		BTL::BloomFilter m_bloom;
	22	BloomFilter m_bloom;
23	23	Graph m_graph;
24	24
25	25	RollingBloomDBGTest() : m_k(5), m_bloomSize(100000), m_numHashes(2),

148	148	* CGACT-GACTC-ACTCG
149	149	*/
150	150
151		BTL::BloomFilter bloom(m_bloomSize, m_numHashes, m_k);
	151	BloomFilter bloom(m_bloomSize, m_numHashes, m_k);
152	152	Graph graph(bloom);
153	153
154	154	const V CGACT("CGACT", RollingHash("CGACT", m_numHashes, m_k));

196	196	const unsigned m_k;
197	197	const unsigned m_bloomSize;
198	198	const unsigned m_numHashes;
199		BTL::BloomFilter m_bloom;
	199	BloomFilter m_bloom;
200	200	Graph m_graph;
201	201	const std::string m_spacedSeed;
202	202

+10

-29

Unittest/BloomDBG/RollingHashTest.cpp less more

152	152	ASSERT_EQ(rightKmerHash, middleKmerHash);
153	153	}
154	154
155		TEST_F(RollingHashTest, setBase)
	155	TEST_F(RollingHashTest, setLastBase)
156	156	{
157	157	MaskedKmer::mask().clear();
158	158
159	159	char kmer1[] = "ACGT";
160		char kmer2[] = "ACCT";
	160	char kmer2[] = "ACGA";
	161	char kmer3[] = "GCGT";
161	162
162	163	RollingHash hash1(kmer1, m_numHashes, m_k);
163	164	RollingHash hash2(kmer2, m_numHashes, m_k);
	165	RollingHash hash3(kmer3, m_numHashes, m_k);
164	166
165	167	ASSERT_NE(hash2, hash1);
166		hash1.setBase(kmer1, 2, 'C');
167		ASSERT_EQ(0, strcmp(kmer1, kmer2));
	168	hash1.setLastBase(kmer1, SENSE, 'A');
168	169	ASSERT_EQ(hash2, hash1);
	170
	171	hash1.reset(kmer1);
	172	ASSERT_NE(hash3, hash1);
	173	hash1.setLastBase(kmer1, ANTISENSE, 'G');
	174	ASSERT_EQ(hash3, hash1);
169	175	}
170
171		TEST_F(RollingHashTest, setBaseMasked)
172		{
173		MaskedKmer::setMask("1101");
174
175		char kmer1[] = "ACGT";
176		char kmer2[] = "ACCT";
177
178		RollingHash hash1(kmer1, m_numHashes, m_k);
179		RollingHash hash2(kmer2, m_numHashes, m_k);
180
181		/* hashes should agree since mismatch is in masked position */
182		ASSERT_EQ(hash2, hash1);
183		ASSERT_NE(0, strcmp(kmer1, kmer2));
184
185		/* fix mismatch in masked position (hash values shouldn't change) */
186		hash1.setBase(kmer1, 2, 'C');
187		ASSERT_EQ(hash2, hash1);
188		ASSERT_EQ(0, strcmp(kmer1, kmer2));
189
190		/* create mismatch in unmasked position (hash value should now differ) */
191		hash1.setBase(kmer1, 1, 'G');
192		ASSERT_NE(hash2, hash1);
193		ASSERT_NE(0, strcmp(kmer1, kmer2));
194		}

+1

-0

Unittest/Konnector/BloomFilter.cc less more

8	8	#include <string>
9	9
10	10	using namespace std;
	11	using Konnector::BloomFilter;
11	12
12	13	TEST(BloomFilter, base)
13	14	{

+1

-0

Unittest/Konnector/DBGBloomAlgorithmsTest.cpp less more

5	5	#include <string>
6	6
7	7	using namespace std;
	8	using Konnector::BloomFilter;
8	9
9	10	/*
10	11	* Tests for getStartKmerPos() function, which does

+2

-0

Unittest/Konnector/DBGBloomTest.cpp less more

3	3
4	4	#include <gtest/gtest.h>
5	5	#include <string>
	6
	7	using Konnector::BloomFilter;
6	8
7	9	TEST(DBGBloom, BloomFilterPolymorphism)
8	10	{

+1

-0

Unittest/Konnector/konnectorTest.cpp less more

3	3	#include <gtest/gtest.h>
4	4
5	5	using namespace std;
	6	using Konnector::BloomFilter;
6	7
7	8	// workaround: opt::k must be defined because
8	9	// it is used by write_dot(..)

+4

-0

Unittest/Makefile.am less more

212	212	BloomDBG/HashAgnosticCascadingBloomTest.cpp
213	213	BloomDBG_HashAgnosticCascadingBloom_CXXFLAGS = $(AM_CXXFLAGS) \
214	214	$(OPENMP_CXXFLAGS)
	215	BloomDBG_HashAgnosticCascadingBloom_LDADD = \
	216	$(top_builddir)/DataLayer/libdatalayer.a \
	217	$(top_builddir)/Common/libcommon.a \
	218	$(LDADD)
215	219
216	220	check_PROGRAMS += BloomDBG_RollingBloomDBG
217	221	BloomDBG_RollingBloomDBG_SOURCES = BloomDBG/RollingBloomDBGTest.cpp

+83

-81

bin/abyss-pe less more

3	3	# Anthony Raymond <traymond@bcgsc.ca>.
4	4
5	5	SHELL=bash -e -o pipefail
6		ifneq ($(shell command -v zsh),)
	6	ifeq ($(shell zsh -e -o pipefail -c 'true' 2>/dev/null; echo $$?), 0)
7	7	# Set pipefail to ensure that all commands of a pipe succeed.
8	8	SHELL=zsh -e -o pipefail
9	9	# Report run time and memory usage with zsh.

109	109	endif
110	110	endif
111	111
	112	ifndef preserve_path
112	113	# Determine the path to the ABySS executables
113	114	path?=$(shell dirname `command -v $(MAKEFILE_LIST)`)
114	115	ifdef path
115	116	PATH:=$(path):$(PATH)
	117	endif
116	118	endif
117	119
118	120	ifdef db

195	197	ifdef c
196	198	abyssopt += -c$c
197	199	endif
	200	ifdef kc
	201	abyssopt += --kc=$(kc)
	202	endif
198	203	ifdef b
199	204	abyssopt += -b$b
200	205	pbopt += -b$b

215	220	endif
216	221	ifdef j
217	222	abyssopt += -j$j
218		endif
219		ifdef kc
220		abyssopt += --kc=$(kc)
221	223	endif
222	224	ifdef x
223	225	abyssopt += -s$x

392	394	@echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users@bcgsc.ca.'
393	395
394	396	version:
395		@echo "abyss-pe (ABySS) 2.1.0"
	397	@echo "abyss-pe (ABySS) 2.1.1"
396	398	@echo "Written by Shaun Jackman and Anthony Raymond."
397	399	@echo
398	400	@echo "Copyright 2012 Canada's Michael Smith Genome Science Centre"

529	531
530	532	ifdef B
531	533	%-1.fa:
532		abyss-bloom-dbg $(abyssopt) $(ABYSS_OPTIONS) $(in) $(se) > $@
	534	$(gtime) abyss-bloom-dbg $(abyssopt) $(ABYSS_OPTIONS) $(in) $(se) > $@
533	535	else ifdef K
534	536
535	537	ifdef np
536	538	%-1.fa:
537		$(mpirun) -np $(np) abyss-paired-dbg-mpi $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa $(in) $(se)
	539	$(gtime) $(mpirun) -np $(np) abyss-paired-dbg-mpi $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa $(in) $(se)
538	540	else
539	541	%-1.fa %-1.$g:
540		abyss-paired-dbg $(abyssopt) $(ABYSS_OPTIONS) -o $-1.fa -g $-1.$g $(in) $(se)
	542	$(gtime) abyss-paired-dbg $(abyssopt) $(ABYSS_OPTIONS) -o $-1.fa -g $-1.$g $(in) $(se)
541	543	endif
542	544
543	545	else ifdef np
544	546	%-1.fa:
545		$(mpirun) -np $(np) ABYSS-P $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
	547	$(gtime) $(mpirun) -np $(np) ABYSS-P $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
546	548	else
547	549	%-1.fa:
548		ABYSS $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
	550	$(gtime) ABYSS $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
549	551	endif
550	552
551	553	# Find overlapping contigs
552	554
553	555	%-1.$g: %-1.fa
554		AdjList $(alopt) --$g $< >$@
	556	$(gtime) AdjList $(alopt) --$g $< >$@
555	557
556	558	# Remove shim contigs
557	559
558	560	%-2.$g1 %-1.path: %-1.$g %-1.fa
559		abyss-filtergraph $v --$g $(fgopt) $(FILTERGRAPH_OPTIONS) -k$k -g $-2.$g1 $^ >$-1.path
	561	$(gtime) abyss-filtergraph $v --$g $(fgopt) $(FILTERGRAPH_OPTIONS) -k$k -g $-2.$g1 $^ >$-1.path
560	562
561	563	%-2.fa %-2.$g: %-1.fa %-2.$g1 %-1.path
562		MergeContigs --$g $(mcopt) -g $-2.$g -o $-2.fa $^
	564	$(gtime) MergeContigs --$g $(mcopt) -g $-2.$g -o $-2.fa $^
563	565
564	566	# Pop bubbles
565	567
566	568	%-2.path %-3.$g: %-2.fa %-2.$g
567		PopBubbles $v --$g -j$j -k$k $(SS) $(pbopt) $(POPBUBBLES_OPTIONS) -g $-3.$g $^ >$-2.path
	569	$(gtime) PopBubbles $v --$g -j$j -k$k $(SS) $(pbopt) $(POPBUBBLES_OPTIONS) -g $-3.$g $^ >$-2.path
568	570
569	571	%-3.fa: %-2.fa %-2.$g %-2.path
570		MergeContigs $(mcopt) -o $@ $^
	572	$(gtime) MergeContigs $(mcopt) -o $@ $^
571	573	awk '!/^>/ {x[">" $$1]=1; next} {getline s} $$1 in x {print $$0 "\n" s}' \
572	574	$-2.path $-1.fa >$*-indel.fa
573	575

580	582	# Estimate distances between unitigs
581	583
582	584	%-3.sam.gz %-3.hist: $(name)-3.fa
583		$(align) $(mapopt) $(strip $($*)) $< \
	585	$(gtime) $(align) $(mapopt) $(strip $($*)) $< \
584	586	\|$(fixmate) $(fmopt) -h $*-3.hist \
585	587	\|sort -snk3 -k4 \
586	588	\|$(gzip) >$*-3.sam.gz
587	589
588	590	%-3.bam %-3.hist: $(name)-3.fa
589		$(align) $(mapopt) $(strip $($*)) $< \
	591	$(gtime) $(align) $(mapopt) $(strip $($*)) $< \
590	592	\|$(fixmate) $(fmopt) -h $*-3.hist \
591	593	\|sort -snk3 -k4 \
592	594	\|samtools view -Sb - -o $*-3.bam
593	595
594	596	%-3.dist: %-3.sam.gz %-3.hist
595	597	gunzip -c $< \
	598	\|$(gtime) $(DistanceEst) $(deopt) -o $@ $*-3.hist
	599
	600	%-3.dist: %-3.bam %-3.hist
	601	$(gtime) samtools view -h $< \
596	602	\|$(DistanceEst) $(deopt) -o $@ $*-3.hist
597	603
598		%-3.dist: %-3.bam %-3.hist
599		samtools view -h $< \
600		\|$(DistanceEst) $(deopt) -o $@ $*-3.hist
601
602	604	%-3.dist: $(name)-3.fa
603		$(align) $(mapopt) $(strip $($*)) $< \
	605	$(gtime) $(align) $(mapopt) $(strip $($*)) $< \
604	606	\|$(fixmate) $(fmopt) -h $*-3.hist \
605	607	\|sort -snk3 -k4 \
606	608	\|$(DistanceEst) $(deopt) -o $@ $*-3.hist

609	611
610	612	ifneq ($(name)-3.dist, $(dist))
611	613	$(name)-3.dist: $(name)-3.fa $(dist)
612		abyss-todot $v --dist -e $^ >$@
	614	$(gtime) abyss-todot $v --dist -e $^ >$@
613	615
614	616	$(name)-3.bam: $(addsuffix -3.bam, $(pe))
615		samtools merge -r $@ $^
	617	$(gtime) samtools merge -r $@ $^
616	618	endif
617	619
618	620	# Find overlaps between contigs
619	621
620	622	%-4.fa %-4.$g: %-3.fa %-3.$g %-3.dist
621		Overlap $v --$g $(SS) $(OVERLAP_OPTIONS) -k$k -g $-4.$g -o $-4.fa $^
	623	$(gtime) Overlap $v --$g $(SS) $(OVERLAP_OPTIONS) -k$k -g $-4.$g -o $-4.fa $^
622	624
623	625	# Assemble contigs
624	626
625	627	%-4.path1: %-4.$g %-3.dist
626		SimpleGraph $v $(sgopt) $(SIMPLEGRAPH_OPTIONS) -j$j -k$k -o $@ $^
	628	$(gtime) SimpleGraph $v $(sgopt) $(SIMPLEGRAPH_OPTIONS) -j$j -k$k -o $@ $^
627	629
628	630	%-4.path2: %-4.path1 %-3.fa.fai %-4.fa.fai
629	631	cat $-3.fa.fai $-4.fa.fai \
630		\|MergePaths $(mpopt) $(MERGEPATHS_OPTIONS) -o $@ - $<
	632	\|$(gtime) MergePaths $(mpopt) $(MERGEPATHS_OPTIONS) -o $@ - $<
631	633
632	634	%-4.path3: %-4.$g %-4.path2
633	635	PathOverlap --assemble $(poopt) $(SS) $^ >$@

636	638
637	639	%-5.path %-5.fa %-5.$g: %-3.fa %-4.fa %-4.$g %-4.path3
638	640	cat $(wordlist 1, 2, $^) \
639		\|PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -o $-5.path -s $-5.fa -g $*-5.$g - $(wordlist 3, 4, $^)
	641	\|$(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -o $-5.path -s $-5.fa -g $*-5.$g - $(wordlist 3, 4, $^)
640	642
641	643	%-6.fa: %-3.fa %-4.fa %-5.fa %-5.$g %-5.path
642		cat $(wordlist 1, 3, $^) \|MergeContigs $(mcopt) -o $@ - $(wordlist 4, 5, $^)
	644	cat $(wordlist 1, 3, $^) \|$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 4, 5, $^)
643	645
644	646	else
645	647

648	650	ln -sf $-4.path3 $-5.path
649	651
650	652	%-cs.fa: %-3.fa %-4.fa %-4.$g %-4.path3
651		cat $(wordlist 1, 2, $^) \|MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
	653	cat $(wordlist 1, 2, $^) \|$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
652	654
653	655	# Convert colour-space sequence to nucleotides
654	656
655	657	%-6.fa: %-cs.fa
656		KAligner $v --seq -m -j$j -l$l $(in) $(se) $< \
	658	$(gtime) KAligner $v --seq -m -j$j -l$l $(in) $(se) $< \
657	659	\|Consensus $v -o $@ $<
658	660
659	661	endif
660	662
661	663	%-6.$g: %-5.$g %-5.path
662		PathOverlap --overlap $(poopt) --$g $^ >$@
	664	$(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@
663	665
664	666	%-contigs.fa: %-6.fa
665	667	ln -sf $< $@

670	672	# Estimate distances between contigs
671	673
672	674	%-6.sam.gz %-6.hist: $(name)-6.fa
673		$(align) $(mapopt) $(strip $($*)) $< \
	675	$(gtime) $(align) $(mapopt) $(strip $($*)) $< \
674	676	\|$(fixmate) $(fmopt) -h $*-6.hist \
675	677	\|sort -snk3 -k4 \
676	678	\|$(gzip) >$*-6.sam.gz
677	679
678	680	%-6.bam %-6.hist: $(name)-6.fa
679		$(align) $(mapopt) $(strip $($*)) $< \
	681	$(gtime) $(align) $(mapopt) $(strip $($*)) $< \
680	682	\|$(fixmate) $(fmopt) -h $*-6.hist \
681	683	\|sort -snk3 -k4 \
682	684	\|samtools view -Sb - -o $*-6.bam
683	685
684	686	%-6.dist.dot: %-6.sam.gz %-6.hist
685	687	gunzip -c $< \
686		\|$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
	688	\|$(gtime) $(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
687	689
688	690	%-6.dist.dot: %-6.bam %-6.hist
689	691	samtools view -h $< \
690		\|$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
	692	\|$(gtime) $(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
691	693
692	694	%-6.dist.dot: $(name)-6.fa
693		$(align) $(mapopt) $(strip $($*)) $< \
	695	$(gtime) $(align) $(mapopt) $(strip $($*)) $< \
694	696	\|$(fixmate) $(fmopt) -h $*-6.hist \
695	697	\|sort -snk3 -k4 \
696	698	\|$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist

698	700	# Scaffold
699	701
700	702	%-6.path: $(name)-6.$g $(addsuffix -6.dist.dot, $(mp))
701		abyss-scaffold $(scopt) -s$S -n$N -g $@.dot $(SCAFFOLD_OPTIONS) $^ >$@
	703	$(gtime) abyss-scaffold $(scopt) -s$S -n$N -g $@.dot $(SCAFFOLD_OPTIONS) $^ >$@
702	704
703	705	%-7.path %-7.$g %-7.fa: %-6.fa %-6.$g %-6.path
704		PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $-7.fa -g $-7.$g -o $*-7.path $^
	706	$(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $-7.fa -g $-7.$g -o $*-7.path $^
705	707
706	708	%-8.fa: %-6.fa %-7.fa %-7.$g %-7.path
707	709	cat $(wordlist 1, 2, $^) \
708		\|MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
	710	\|$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
709	711
710	712	%-8.$g: %-7.$g %-7.path
711		PathOverlap --overlap $(poopt) --$g $^ >$@
	713	$(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@
712	714
713	715	# Scaffold using linked reads
714	716	ifdef lr

716	718	# Tigmint
717	719
718	720	# Options for mapping the reads to the draft assembly.
719		lr_l=$l
	721	lr_l?=$l
720	722	override lrmapopt=$v -j$j -l$(lr_l) $(LR_MAP_OPTIONS)
721	723
722	724	# Options for abyss-scaffold
723		lr_s=1000-100000
724		lr_n=5-20
	725	lr_s?=1000-100000
	726	lr_n?=5-20
725	727
726	728	# Minimum AS/Read length ratio
727		tigmint_as=0.65
	729	tigmint_as?=0.65
728	730
729	731	# Maximum number of mismatches
730		tigmint_nm=5
	732	tigmint_nm?=5
731	733
732	734	# Minimum mapping quality threshold
733		tigmint_mapq=0
	735	tigmint_mapq?=0
734	736
735	737	# Maximum distance between reads to be considered the same molecule
736		tigmint_d=50000
	738	tigmint_d?=50000
737	739
738	740	# Minimum number of spanning molecules
739		tigmint_n=10
	741	tigmint_n?=10
740	742
741	743	# Size of the window that must be spanned by molecules
742		tigmint_w=1000
	744	tigmint_w?=1000
743	745
744	746	# Align paired-end reads to the draft genome, sort by BX tag,
745	747	# and create molecule extents BED.
746		%.lr.bed: %.fa
747		$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
	748	%.lr.bed: %.fa.fai
	749	$(gtime) $(align) $(lrmapopt) $(lr_reads) $*.fa \
748	750	\| samtools sort -@$j -tBX -l0 -T$$(mktemp -u -t $@.XXXXXX) \
749	751	\| tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) - \
750	752	\| sort -k1,1 -k2,2n -k3,3n >$@
751	753
752	754	# Align paired-end reads to the draft genome and sort by BX tag.
753		%.lr.sortbx.bam: %.fa
754		$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
	755	%.lr.sortbx.bam: %.fa.fai
	756	$(gtime) $(align) $(lrmapopt) $(lr_reads) $*.fa \
755	757	\| samtools sort -@$j -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@
756	758
757	759	# Filter the BAM file, create molecule extents BED.

764	766	$(gtime) tigmint-cut -p$j -n$(tigmint_n) -w$(tigmint_w) -o $@ $*.fa $<
765	767
766	768	# ARCS
767		arcs_c=2
768		arcs_d=0
769		arcs_e=30000
770		arcs_l=0
771		arcs_m=4-20000
772		arcs_r=0.05
773		arcs_s=98
774		arcs_z=500
	769	arcs_c?=2
	770	arcs_d?=0
	771	arcs_e?=30000
	772	arcs_l?=0
	773	arcs_m?=4-20000
	774	arcs_r?=0.05
	775	arcs_s?=98
	776	arcs_z?=500
775	777
776	778	# Align reads and create a graph of linked contigs using ARCS.
777	779	%.arcs.dist.gv: %.fa

789	791	# Create a graph of linked contigs using ARCS.
790	792	%.arcs.dist.gv: %.lr.sortn.sam.gz
791	793	gunzip -c $< \
792		\| arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
	794	\|$(gtime) arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
793	795	-g $.arcs.dist.gv --tsv=$.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin
794	796
795	797	# Scaffold using ARCS and abyss-scaffold.
796	798	%.arcs.path: %.arcs.dist.gv
797		abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@
	799	$(gtime) abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@
798	800
799	801	# Create the FASTA file of ARCS scaffolds.
800	802	%.arcs.fa: %.fa %.arcs.path
801		MergeContigs $(mcopt) -o $@ $^
	803	$(gtime) MergeContigs $(mcopt) -o $@ $^
802	804
803	805	%-scaffolds.fa: %-8.tigmint.arcs.fa
804	806	ln -sf $< $@

817	819	sealer_ks?=-k90 -k80 -k70 -k60 -k50 -k40 -k30
818	820
819	821	%-8_scaffold.fa: %-8.fa
820		abyss-sealer -v -j$j --print-flanks -o$*-8 -S$< $(sealer_ks) $(SEALER_OPTIONS) $(in) $(se)
	822	$(gtime) abyss-sealer -v -j$j --print-flanks -o$*-8 -S$< $(sealer_ks) $(SEALER_OPTIONS) $(in) $(se)
821	823
822	824	%-scaffolds-sealed.fa: %-8_scaffold.fa
823	825	ln -s $< $@

833	835	# Transcriptome assisted scaffolding
834	836
835	837	%.fa.bwt: %.fa
836		bwa index $<
	838	$(gtime) bwa index $<
837	839
838	840	%-8.sam.gz: $(name)-8.fa.bwt
839		bwa mem -a -t$j -S -P -k$l $(name)-8.fa $(strip $($*)) \
	841	$(gtime) bwa mem -a -t$j -S -P -k$l $(name)-8.fa $(strip $($*)) \
840	842	\|$(gzip) >$@
841	843
842	844	%-8.dist.dot: %-8.sam.gz
843		abyss-longseqdist -k$k $(LONGSEQDIST_OPTIONS) $< \
	845	$(gtime) abyss-longseqdist -k$k $(LONGSEQDIST_OPTIONS) $< \
844	846	\|grep -v "l=" >$@
845	847
846	848	%-8.path: $(name)-8.$g $(addsuffix -8.dist.dot, $(long))
847		abyss-scaffold $(scopt) -s$S -n1 -g $@.$g $(SCAFFOLD_OPTIONS) $^ >$@
	849	$(gtime) abyss-scaffold $(scopt) -s$S -n1 -g $@.$g $(SCAFFOLD_OPTIONS) $^ >$@
848	850
849	851	%-9.path %-9.$g %-9.fa: %-8.fa %-8.$g %-8.path
850		PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $-9.fa -g $-9.$g -o $*-9.path $^
	852	$(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $-9.fa -g $-9.$g -o $*-9.path $^
851	853
852	854	%-10.fa: %-8.fa %-9.fa %-9.$g %-9.path
853	855	cat $(wordlist 1, 2, $^) \
854		\|MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
	856	\|$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
855	857
856	858	%-10.$g: %-9.$g %-9.path
857		PathOverlap --overlap $(poopt) --$g $^ >$@
	859	$(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@
858	860
859	861	%-long-scaffs.fa: %-10.fa
860	862	ln -sf $< $@

880	882	endif
881	883
882	884	$(name)-unitigs.bam: %.bam: %.fa
883		$(align) $v -j$j -l$l $(ALIGNER_OPTIONS) $(se) $< \
	885	$(gtime) $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) $(se) $< \
884	886	\|samtools view -Su - \|samtools sort -o - - >$@
885	887
886	888	$(name)-contigs.bam $(name)-scaffolds.bam: %.bam: %.fa
887		$(align) $v -j$j -l$l $(ALIGNER_OPTIONS) \
	889	$(gtime) $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) \
888	890	$(call map, deref, $(sort $(lib) $(pe) $(mp))) $< \
889	891	\|$(fixmate) $v $(FIXMATE_OPTIONS) \
890	892	\|sort -snk3 -k4 \

893	895	# Align the variants to the assembly
894	896
895	897	%-variants.bam: %.fa.bwt
896		bwa bwasw -t$j $*.fa <(cat $(name)-bubbles.fa $(name)-indel.fa) \
	898	$(gtime) bwa bwasw -t$j $*.fa <(cat $(name)-bubbles.fa $(name)-indel.fa) \
897	899	\|samtools view -Su - \|samtools sort -o - - >$@
898	900
899	901	%-variants.vcf.gz: %.fa %-variants.bam
900		samtools mpileup -Buf $^ \|bcftools view -vp1 - \|bgzip >$@
	902	$(gtime) samtools mpileup -Buf $^ \|bcftools view -vp1 - \|bgzip >$@
901	903
902	904	%.gz.tbi: %.gz
903		tabix -pvcf $<
	905	$(gtime) tabix -pvcf $<
904	906
905	907	# Calculate assembly contiguity statistics
906	908

937	939	# Create an AGP file and FASTA file of scaftigs from scaffolds
938	940
939	941	%.agp %-agp.fa: %.fa
940		abyss-fatoagp $(FATOAGP_OPTIONS) -f $*-agp.fa $< >$*.agp
	942	$(gtime) abyss-fatoagp $(FATOAGP_OPTIONS) -f $*-agp.fa $< >$*.agp
941	943
942	944	# Align the contigs to the reference
943	945
944	946	%-$(ref).sam.gz: %.fa
945		bwa bwasw $(bwaswopt) $(BWASW_OPTIONS) $($(ref)) $< \|$(gzip) >$@
	947	$(gtime) bwa bwasw $(bwaswopt) $(BWASW_OPTIONS) $($(ref)) $< \|$(gzip) >$@
946	948
947	949	# Find breakpoints in the alignments
948	950
949	951	%.break: %.sam.gz
950		abyss-samtobreak $(SAMTOBREAK_OPTIONS) $< >$@
	952	$(gtime) abyss-samtobreak $(SAMTOBREAK_OPTIONS) $< >$@
951	953
952	954	# Report ABySS configuration variable(s) and value(s) currently set.
953	955

+2

-2

configure.ac less more

0	0	AC_PREREQ(2.62)
1		AC_INIT(ABySS, 2.1.0, abyss-users@bcgsc.ca, abyss,
	1	AC_INIT(ABySS, 2.1.1, abyss-users@bcgsc.ca, abyss,
2	2	http://www.bcgsc.ca/platform/bioinfo/software/abyss)
3	3	m4_include(m4/m4_ax_pthread.m4)
4	4	AM_INIT_AUTOMAKE(1.9.6 foreign subdir-objects)

309	309	BloomDBG/Makefile
310	310	DataBase/Makefile
311	311	lib/bloomfilter/Makefile
312		lib/rolling-hash/Makefile
	312	lib/nthash/Makefile
313	313	])
314	314
315	315	if test "$with_sparsehash" != "no" -a "$ac_cv_header_google_sparse_hash_map" != "yes"; then

+1

-1

doc/ABYSS.1 less more

0		.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.0" "User Commands"
	0	.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.1" "User Commands"
1	1	.SH NAME
2	2	ABYSS \- assemble short reads into contigs
3	3	.SH SYNOPSIS

+1

-1

doc/abyss-pe.1 less more

0		.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.0" "User Commands"
	0	.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.1" "User Commands"
1	1	.SH NAME
2	2	abyss-pe - assemble reads into contigs
3	3	.SH SYNOPSIS

+1

-1

doc/abyss-tofastq.1 less more

0		.TH abyss-tofastq "1" "2015-May" "ABySS 2.1.0" "User Commands"
	0	.TH abyss-tofastq "1" "2015-May" "ABySS 2.1.1" "User Commands"
1	1	.SH NAME
2	2	abyss-tofastq \- convert various file formats to FASTQ format
3	3	.br

+39

-94

lib/bloomfilter/BloomFilter.hpp less more

19	19	#include <cstdlib>
20	20	#include <stdio.h>
21	21	#include <cstring>
22		#include "lib/rolling-hash/rolling.h"
23	22
24	23	using namespace std;
25	24

32	31	>> (((0x4332322132212110 >> ((x & 0xF) << 2)) & 0xF) << 2))
33	32	>> ((0x4332322132212110 >> (((x & 0xF0) >> 2)) & 0xF) << 2)) & 0xf;
34	33	}
35
36		/* To avoid name collision with konnector `BloomFilter` class */
37
38		namespace BTL {
39	34
40	35	class BloomFilter {
41	36	public:

98	93	loadFilter(filterFilePath);
99	94	}
100	95
101		void loadHeader(FILE *file) {
102
103		FileHeader header;
104		if (fread(&header, sizeof(struct FileHeader), 1, file) != 1) {
105		cerr << "Failed to read Bloom filter file header" << endl;
106		}
107		char magic[9];
108		strncpy(magic, header.magic, 8);
109		magic[8] = '\0';
110
111		// cerr << "Loading header... magic: " <<
112		// magic << " hlen: " <<
113		// header.hlen << " size: " <<
114		// header.size << " nhash: " <<
115		// header.nhash << " kmer: " <<
116		// header.kmer << " dFPR: " <<
117		// header.dFPR << " aFPR: " <<
118		// header.aFPR << " rFPR: " <<
119		// header.rFPR << " nEntry: " <<
120		// header.nEntry << " tEntry: " <<
121		// header.tEntry << endl;
122
123		m_size = header.size;
124		initSize(m_size);
125		m_hashNum = header.nhash;
126		m_kmerSize = header.kmer;
127		}
128
129	96	void loadFilter(const string &filterFilePath)
130	97	{
131	98	FILE *file = fopen(filterFilePath.c_str(), "rb");
132	99	if (file == NULL) {
133	100	cerr << "file \"" << filterFilePath << "\" could not be read."
134		<< endl;
	101	<< endl;
135	102	exit(1);
136	103	}
137	104

143	110	fseek(file, lCurPos, 0);
144	111	if (fileSize != m_sizeInBytes) {
145	112	cerr << "Error: " << filterFilePath
146		<< " does not match size given by its information file. Size: "
147		<< fileSize << " vs " << m_sizeInBytes << " bytes." << endl;
	113	<< " does not match size given by its header. Size: "
	114	<< fileSize << " vs " << m_sizeInBytes << " bytes." << endl;
148	115	exit(1);
149	116	}
150	117
151	118	size_t countRead = fread(m_filter, fileSize, 1, file);
152	119	if (countRead != 1 && fclose(file) != 0) {
153	120	cerr << "file \"" << filterFilePath << "\" could not be read."
154		<< endl;
	121	<< endl;
155	122	exit(1);
156	123	}
	124	}
	125
	126	void loadHeader(FILE *file) {
	127
	128	FileHeader header;
	129	if (fread(&header, sizeof(struct FileHeader), 1, file) != 1) {
	130	cerr << "Failed to header" << endl;
	131	}
	132	char magic[9];
	133	strncpy(magic, header.magic, 8);
	134	magic[8] = '\0';
	135
	136	m_size = header.size;
	137	initSize(m_size);
	138	m_hashNum = header.nhash;
	139	m_kmerSize = header.kmer;
157	140	}
158	141
159	142	/*

182	165	}
183	166	}
184	167
185		void insert(const char* kmer) {
186		uint64_t hVal = getChval(kmer, m_kmerSize);
187		for (unsigned i = 0; i < m_hashNum; i++) {
188		size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
189		__sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar],
190		bitMask[normalizedValue % bitsPerChar]);
191		}
192		}
193
194		/*
	168	/*
	169	* Accepts a list of precomputed hash values. Faster than rehashing each time.
195	170	* Returns if already inserted
196	171	*/
197		bool insertAndCheck(const char* kmer) {
198		uint64_t hVal = getChval(kmer, m_kmerSize);
	172	bool insertAndCheck(const size_t precomputed[]) {
	173	//iterates through hashed values adding it to the filter
199	174	bool found = true;
200		for (unsigned i = 0; i < m_hashNum; i++) {
201		size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
202		found &= __sync_or_and_fetch(
	175	for (size_t i = 0; i < m_hashNum; ++i) {
	176	size_t normalizedValue = precomputed[i] % m_size;
	177	found &= __sync_fetch_and_or(
203	178	&m_filter[normalizedValue / bitsPerChar],
204		bitMask[normalizedValue % bitsPerChar]);
	179	bitMask[normalizedValue % bitsPerChar])
	180	>> (normalizedValue % bitsPerChar) & 1;
205	181	}
206	182	return found;
207	183	}

215	191	bool found = true;
216	192	for (size_t i = 0; i < m_hashNum; ++i) {
217	193	size_t normalizedValue = precomputed.at(i) % m_size;
218		found &= __sync_or_and_fetch(
	194	found &= __sync_fetch_and_or(
219	195	&m_filter[normalizedValue / bitsPerChar],
220		bitMask[normalizedValue % bitsPerChar]);
	196	bitMask[normalizedValue % bitsPerChar])
	197	>> (normalizedValue % bitsPerChar) & 1;
221	198	}
222	199	return found;
223	200	}

250	227	return true;
251	228	}
252	229
253		/*
254		* Single pass filtering, computes hash values on the fly
255		*/
256		bool contains(const char* kmer) const {
257		uint64_t hVal = getChval(kmer, m_kmerSize);
258		for (unsigned i = 0; i < m_hashNum; i++) {
259		size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
260		unsigned char bit = bitMask[normalizedValue % bitsPerChar];
261		if ((m_filter[normalizedValue / bitsPerChar] & bit) == 0)
262		return false;
263		}
264		return true;
265		}
266
267		void writeHeader(ostream &out) const {
	230	void writeHeader(std::ostream& out) const {
268	231	FileHeader header;
269	232	strncpy(header.magic, "BlOOMFXX", 8);
270	233	char magic[9];

279	242	header.nEntry = m_nEntry;
280	243	header.tEntry = m_tEntry;
281	244
282		// cerr << "Writing header... magic: "
283		// << magic << " hlen: "
284		// << header.hlen << " size: "
285		// << header.size << " nhash: "
286		// << header.nhash << " kmer: "
287		// << header.kmer << " dFPR: "
288		// << header.dFPR << " aFPR: "
289		// << header.aFPR << " rFPR: "
290		// << header.rFPR << " nEntry: "
291		// << header.nEntry << " tEntry: "
292		// << header.tEntry << endl;
293
294	245	out.write(reinterpret_cast<char*>(&header), sizeof(struct FileHeader));
295	246	assert(out);
296	247	}
297	248
298	249	/** Serialize the Bloom filter to a stream */
299		// void storeFilter(std::ostream& out) const
300	250	friend std::ostream& operator<<(std::ostream& out, const BloomFilter& o)
301	251	{
302	252	assert(out);

318	268	void storeFilter(string const &filterFilePath) const {
319	269	ofstream myFile(filterFilePath.c_str(), ios::out \| ios::binary);
320	270
321		cerr << "Storing filter. Filter is " << m_sizeInBytes << "bytes."
	271	cerr << "Storing filter. Filter is " << m_sizeInBytes << " bytes."
322	272	<< endl;
323	273
324	274	myFile << *this;
325	275	myFile.close();
	276	assert(myFile);
326	277	}
327	278
328	279	size_t getPop() const {
329	280	size_t i, popBF = 0;
330		#pragma omp parallel for reduction(+:popBF)
	281	//#pragma omp parallel for reduction(+:popBF)
331	282	for (i = 0; i < (m_size + 7) / 8; i++)
332	283	popBF = popBF + popCnt(m_filter[i]);
333	284	return popBF;

340	291	unsigned getKmerSize() const {
341	292	return m_kmerSize;
342	293	}
343
344		// void setdFPR(double value) {
345		// m_dFPR = value;
346		// }
347	294
348	295	/*
349	296	* Calculates that False positive rate that a redundant entry is actually

362	309	* Return FPR based on popcount
363	310	*/
364	311	double getFPR() const {
365		return pow(double(getPop())/double(m_size), m_hashNum);
	312	return pow(double(getPop())/double(m_size), double(m_hashNum));
366	313	}
367	314
368	315	/*

396	343	~BloomFilter() {
397	344	delete[] m_filter;
398	345	}
399		private:
	346	protected:
400	347	BloomFilter(const BloomFilter& that); //to prevent copy construction
401	348
402	349	/*

450	397	* Calculates the optimal FPR to use based on hash functions
451	398	*/
452	399	double calcFPR_hashNum(unsigned hashFunctNum) const {
453		return pow(2, -hashFunctNum);
	400	return pow(2, -double(hashFunctNum));
454	401	}
455	402
456	403	uint8_t* m_filter;

463	410	uint64_t m_tEntry;
464	411	};
465	412
466		} // end namespace 'BTL'
467
468	413	#endif /* BLOOMFILTER_H_ */

+1

-2

lib/bloomfilter/README.md less more

0	0	These files come from:
1	1
2	2	* https://github.com/bcgsc/bloomfilter
3		* commit f1232c2
4		* modifications were made to BloomFilter.h (TODO: merge back to source repo)
	3	* commit 2e5e9d4

+1

-0

lib/nthash/Makefile.am less more

0

EXTRA_DIST = README.md

+1

-0

lib/nthash/README.md less more

0

nthash.hpp is taken from https://github.com/bcgsc/ntHash.git, commit 07e3f4d

+447

-0

lib/nthash/nthash.hpp less more

	0	/*
	1	*
	2	* nthash.hpp
	3	* Author: Hamid Mohamadi
	4	* Genome Sciences Centre,
	5	* British Columbia Cancer Agency
	6	*/
	7
	8	#ifndef NT_HASH_H
	9	#define NT_HASH_H
	10
	11	#include <stdint.h>
	12
	13	// offset for the complement base in the random seeds table
	14	const uint8_t cpOff = 0x07;
	15
	16	// shift for gerenerating multiple hash values
	17	const int multiShift = 27;
	18
	19	// seed for gerenerating multiple hash values
	20	static const uint64_t multiSeed = 0x90b45d39fb6da1fa;
	21
	22	// 64-bit random seeds corresponding to bases and their complements
	23	static const uint64_t seedA = 0x3c8bfbb395c60474;
	24	static const uint64_t seedC = 0x3193c18562a02b4c;
	25	static const uint64_t seedG = 0x20323ed082572324;
	26	static const uint64_t seedT = 0x295549f54be24456;
	27	static const uint64_t seedN = 0x0000000000000000;
	28
	29	static const uint64_t seedTab[256] = {
	30	seedN, seedT, seedN, seedG, seedA, seedN, seedN, seedC, // 0..7
	31	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 8..15
	32	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 16..23
	33	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 24..31
	34	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 32..39
	35	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 40..47
	36	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 48..55
	37	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 56..63
	38	seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 64..71
	39	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 72..79
	40	seedN, seedN, seedN, seedN, seedT, seedN, seedN, seedN, // 80..87
	41	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 88..95
	42	seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 96..103
	43	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 104..111
	44	seedN, seedN, seedN, seedN, seedT, seedN, seedN, seedN, // 112..119
	45	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 120..127
	46	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 128..135
	47	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 136..143
	48	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 144..151
	49	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 152..159
	50	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 160..167
	51	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 168..175
	52	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 176..183
	53	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 184..191
	54	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 192..199
	55	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 200..207
	56	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 208..215
	57	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 216..223
	58	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 224..231
	59	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 232..239
	60	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 240..247
	61	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN // 248..255
	62	};
	63
	64	// rotate "v" to the left 1 position
	65	inline uint64_t rol1(const uint64_t v) {
	66	return (v << 1) \| (v >> 63);
	67	}
	68
	69	// rotate "v" to the right by 1 position
	70	inline uint64_t ror1(const uint64_t v) {
	71	return (v >> 1) \| (v << 63);
	72	}
	73
	74	// rotate 31-left bits of "v" to the left by "s" positions
	75	inline uint64_t rol31(const uint64_t v, unsigned s) {
	76	s%=31;
	77	return ((v << s) \| (v >> (31 - s))) & 0x7FFFFFFF;
	78	}
	79
	80	// rotate 33-right bits of "v" to the left by "s" positions
	81	inline uint64_t rol33(const uint64_t v, unsigned s) {
	82	s%=33;
	83	return ((v << s) \| (v >> (33 - s))) & 0x1FFFFFFFF;
	84	}
	85
	86	// swap bit 0 with bit 33 in "v"
	87	inline uint64_t swapbits033(const uint64_t v) {
	88	uint64_t x = (v ^ (v >> 33)) & 1;
	89	return v ^ (x \| (x << 33));
	90	}
	91
	92	// swap bit 32 with bit 63 in "v"
	93	inline uint64_t swapbits3263(const uint64_t v) {
	94	uint64_t x = ((v >> 32) ^ (v >> 63)) & 1;
	95	return v ^ ((x << 32) \| (x << 63));
	96	}
	97
	98	// forward-strand hash value of the base kmer, i.e. fhval(kmer_0)
	99	inline uint64_t NTF64(const char * kmerSeq, const unsigned k) {
	100	uint64_t hVal=0;
	101	for(unsigned i=0; i<k; i++) {
	102	hVal = rol1(hVal);
	103	hVal = swapbits033(hVal);
	104	hVal ^= seedTab[(unsigned char)kmerSeq[i]];
	105	}
	106	return hVal;
	107	}
	108
	109	// reverse-strand hash value of the base kmer, i.e. rhval(kmer_0)
	110	inline uint64_t NTR64(const char * kmerSeq, const unsigned k) {
	111	uint64_t hVal=0;
	112	for(unsigned i=0; i<k; i++) {
	113	hVal = rol1(hVal);
	114	hVal = swapbits033(hVal);
	115	hVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]&cpOff];
	116	}
	117	return hVal;
	118	}
	119
	120	// forward-strand ntHash for sliding k-mers
	121	inline uint64_t NTF64(const uint64_t fhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) {
	122	uint64_t hVal = rol1(fhVal);
	123	hVal = swapbits033(hVal);
	124	hVal ^= seedTab[charIn];
	125	uint64_t lBits = seedTab[charOut] >> 33;
	126	uint64_t rBits = seedTab[charOut] & 0x1FFFFFFFF;
	127	uint64_t sOut = (rol31(lBits,k) << 33) \| (rol33(rBits,k));
	128	hVal ^= sOut;
	129	return hVal;
	130	}
	131
	132	// reverse-complement ntHash for sliding k-mers
	133	inline uint64_t NTR64(const uint64_t rhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) {
	134	uint64_t lBits = seedTab[charIn&cpOff] >> 33;
	135	uint64_t rBits = seedTab[charIn&cpOff] & 0x1FFFFFFFF;
	136	uint64_t sIn = (rol31(lBits,k) << 33) \| (rol33(rBits,k));
	137	uint64_t hVal = rhVal ^ sIn;
	138	hVal ^= seedTab[charOut&cpOff];
	139	hVal = ror1(hVal);
	140	hVal = swapbits3263(hVal);
	141	return hVal;
	142	}
	143
	144	// canonical ntBase
	145	inline uint64_t NTC64(const char * kmerSeq, const unsigned k) {
	146	uint64_t fhVal=0, rhVal=0;
	147	fhVal=NTF64(kmerSeq, k);
	148	rhVal=NTR64(kmerSeq, k);
	149	return (rhVal<fhVal)? rhVal : fhVal;
	150	}
	151
	152	// canonical ntHash
	153	inline uint64_t NTC64(const char * kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
	154	fhVal = NTF64(kmerSeq, k);
	155	rhVal = NTR64(kmerSeq, k);
	156	return (rhVal<fhVal)? rhVal : fhVal;
	157	}
	158
	159	// canonical ntHash for sliding k-mers
	160	inline uint64_t NTC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
	161	fhVal = NTF64(fhVal, k, charOut, charIn);
	162	rhVal = NTR64(rhVal, k, charOut, charIn);
	163	return (rhVal<fhVal)? rhVal : fhVal;
	164	}
	165
	166	// forward-strand ntHash for sliding k-mers to the left
	167	inline uint64_t NTF64L(const uint64_t rhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) {
	168	uint64_t lBits = seedTab[charIn] >> 33;
	169	uint64_t rBits = seedTab[charIn] & 0x1FFFFFFFF;
	170	uint64_t sIn = (rol31(lBits,k) << 33) \| (rol33(rBits,k));
	171	uint64_t hVal = rhVal ^ sIn;
	172	hVal ^= seedTab[charOut];
	173	hVal = ror1(hVal);
	174	hVal = swapbits3263(hVal);
	175	return hVal;
	176	}
	177
	178	// reverse-complement ntHash for sliding k-mers to the left
	179	inline uint64_t NTR64L(const uint64_t fhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) {
	180	uint64_t hVal = rol1(fhVal);
	181	hVal = swapbits033(hVal);
	182	hVal ^= seedTab[charIn&cpOff];
	183	uint64_t lBits = seedTab[charOut&cpOff] >> 33;
	184	uint64_t rBits = seedTab[charOut&cpOff] & 0x1FFFFFFFF;
	185	uint64_t sOut = (rol31(lBits,k) << 33) \| (rol33(rBits,k));
	186	hVal ^= sOut;
	187	return hVal;
	188	}
	189
	190	// canonical ntHash for sliding k-mers to the left
	191	inline uint64_t NTC64L(const unsigned char charOut, const unsigned char charIn, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
	192	fhVal = NTF64L(fhVal, k, charOut, charIn);
	193	rhVal = NTR64L(rhVal, k, charOut, charIn);
	194	return (rhVal<fhVal)? rhVal : fhVal;
	195	}
	196
	197	// ntBase with seeding option
	198	inline uint64_t NTF64(const char * kmerSeq, const unsigned k, const unsigned seed) {
	199	uint64_t hVal=NTF64(kmerSeq, k);
	200	if(seed==0) return hVal;
	201	hVal *= seed ^ k * multiSeed;
	202	hVal ^= hVal >> multiShift;
	203	return hVal;
	204	}
	205
	206	// canonical ntBase with seeding option
	207	inline uint64_t NTC64(const char * kmerSeq, const unsigned k, const unsigned seed) {
	208	uint64_t hVal = NTC64(kmerSeq,k);
	209	if(seed==0) return hVal;
	210	hVal *= seed ^ k * multiSeed;
	211	hVal ^= hVal >> multiShift;
	212	return hVal;
	213	}
	214
	215	// multihash ntHash, ntBase
	216	inline void NTM64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t *hVal) {
	217	uint64_t bVal=0, tVal=0;
	218	bVal = NTF64(kmerSeq, k);
	219	hVal[0] = bVal;
	220	for(unsigned i=1; i<m; i++) {
	221	tVal = bVal * (i ^ k * multiSeed);
	222	tVal ^= tVal >> multiShift;
	223	hVal[i] = tVal;
	224	}
	225	}
	226
	227	// one extra hash for given base hash
	228	inline uint64_t NTE64(const uint64_t hVal, const unsigned k, const unsigned i) {
	229	uint64_t tVal = hVal;
	230	tVal *= (i ^ k * multiSeed);
	231	tVal ^= tVal >> multiShift;
	232	return tVal;
	233	}
	234
	235	// multihash ntHash for sliding k-mers
	236	inline void NTM64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t *hVal) {
	237	uint64_t bVal=0, tVal=0;
	238	bVal = NTF64(hVal[0], k, charOut, charIn);
	239	hVal[0] = bVal;
	240	for(unsigned i=1; i<m; i++) {
	241	tVal = bVal * (i ^ k * multiSeed);
	242	tVal ^= tVal >> multiShift;
	243	hVal[i] = tVal;
	244	}
	245	}
	246
	247	// canonical multihash ntBase
	248	inline void NTMC64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t *hVal) {
	249	uint64_t bVal=0, tVal=0;
	250	bVal = NTC64(kmerSeq, k);
	251	hVal[0] = bVal;
	252	for(unsigned i=1; i<m; i++) {
	253	tVal = bVal * (i ^ k * multiSeed);
	254	tVal ^= tVal >> multiShift;
	255	hVal[i] = tVal;
	256	}
	257	}
	258
	259	// canonical multihash ntHash
	260	inline void NTMC64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal) {
	261	uint64_t bVal=0, tVal=0;
	262	bVal = NTC64(kmerSeq, k, fhVal, rhVal);
	263	hVal[0] = bVal;
	264	for(unsigned i=1; i<m; i++) {
	265	tVal = bVal * (i ^ k * multiSeed);
	266	tVal ^= tVal >> multiShift;
	267	hVal[i] = tVal;
	268	}
	269	}
	270
	271	// canonical multihash ntHash for sliding k-mers
	272	inline void NTMC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal) {
	273	uint64_t bVal=0, tVal=0;
	274	bVal = NTC64(charOut, charIn, k, fhVal, rhVal);
	275	hVal[0] = bVal;
	276	for(unsigned i=1; i<m; i++) {
	277	tVal = bVal * (i ^ k * multiSeed);
	278	tVal ^= tVal >> multiShift;
	279	hVal[i] = tVal;
	280	}
	281	}
	282
	283	/*
	284	* ignoring k-mers containing nonACGT using ntHash function
	285	*/
	286
	287	// canonical ntBase
	288	inline bool NTC4(const char *kmerSeq, const unsigned k, uint64_t& hVal, unsigned& locN) {
	289	hVal=0;
	290	locN=0;
	291	uint64_t fhVal=0,rhVal=0;
	292	for(int i=k-1; i>=0; i--) {
	293	if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
	294	locN=i;
	295	return false;
	296	}
	297	fhVal = rol1(fhVal);
	298	fhVal = swapbits033(fhVal);
	299	fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
	300
	301	rhVal = rol1(rhVal);
	302	rhVal = swapbits033(rhVal);
	303	rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
	304	}
	305	hVal = (rhVal<fhVal)? rhVal : fhVal;
	306	return true;
	307	}
	308
	309	// canonical multihash ntBase
	310	inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, unsigned& locN, uint64_t* hVal) {
	311	uint64_t bVal=0, tVal=0, fhVal=0, rhVal=0;
	312	locN=0;
	313	for(int i=k-1; i>=0; i--) {
	314	if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
	315	locN=i;
	316	return false;
	317	}
	318	fhVal = rol1(fhVal);
	319	fhVal = swapbits033(fhVal);
	320	fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
	321
	322	rhVal = rol1(rhVal);
	323	rhVal = swapbits033(rhVal);
	324	rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
	325	}
	326	bVal = (rhVal<fhVal)? rhVal : fhVal;
	327	hVal[0] = bVal;
	328	for(unsigned i=1; i<m; i++) {
	329	tVal = bVal * (i ^ k * multiSeed);
	330	tVal ^= tVal >> multiShift;
	331	hVal[i] = tVal;
	332	}
	333	return true;
	334	}
	335
	336	// canonical ntHash
	337	inline bool NTC64(const char *kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal, uint64_t& hVal, unsigned& locN) {
	338	hVal=fhVal=rhVal=0;
	339	locN=0;
	340	for(int i=k-1; i>=0; i--) {
	341	if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
	342	locN=i;
	343	return false;
	344	}
	345	fhVal = rol1(fhVal);
	346	fhVal = swapbits033(fhVal);
	347	fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
	348
	349	rhVal = rol1(rhVal);
	350	rhVal = swapbits033(rhVal);
	351	rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
	352	}
	353	hVal = (rhVal<fhVal)? rhVal : fhVal;
	354	return true;
	355	}
	356
	357	// canonical multihash ntHash
	358	inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, unsigned& locN, uint64_t* hVal) {
	359	fhVal=rhVal=0;
	360	uint64_t bVal=0, tVal=0;
	361	locN=0;
	362	for(int i=k-1; i>=0; i--) {
	363	if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
	364	locN=i;
	365	return false;
	366	}
	367	fhVal = rol1(fhVal);
	368	fhVal = swapbits033(fhVal);
	369	fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
	370
	371	rhVal = rol1(rhVal);
	372	rhVal = swapbits033(rhVal);
	373	rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
	374	}
	375	bVal = (rhVal<fhVal)? rhVal : fhVal;
	376	hVal[0] = bVal;
	377	for(unsigned i=1; i<m; i++) {
	378	tVal = bVal * (i ^ k * multiSeed);
	379	tVal ^= tVal >> multiShift;
	380	hVal[i] = tVal;
	381	}
	382	return true;
	383	}
	384
	385	// strand-aware canonical multihash ntHash
	386	inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, unsigned& locN, uint64_t* hVal, bool& hStn) {
	387	fhVal=rhVal=0;
	388	uint64_t bVal=0, tVal=0;
	389	locN=0;
	390	for(int i=k-1; i>=0; i--) {
	391	if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
	392	locN=i;
	393	return false;
	394	}
	395	fhVal = rol1(fhVal);
	396	fhVal = swapbits033(fhVal);
	397	fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
	398
	399	rhVal = rol1(rhVal);
	400	rhVal = swapbits033(rhVal);
	401	rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
	402	}
	403	hStn = rhVal<fhVal;
	404	bVal = hStn? rhVal : fhVal;
	405	hVal[0] = bVal;
	406	for(unsigned i=1; i<m; i++) {
	407	tVal = bVal * (i ^ k * multiSeed);
	408	tVal ^= tVal >> multiShift;
	409	hVal[i] = tVal;
	410	}
	411	return true;
	412	}
	413
	414	// starnd-aware canonical multihash ntHash for sliding k-mers
	415	inline void NTMC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal, bool &hStn) {
	416	uint64_t bVal=0, tVal=0;
	417	bVal = NTC64(charOut, charIn, k, fhVal, rhVal);
	418	hStn = rhVal<fhVal;
	419	hVal[0] = bVal;
	420	for(unsigned i=1; i<m; i++) {
	421	tVal = bVal * (i ^ k * multiSeed);
	422	tVal ^= tVal >> multiShift;
	423	hVal[i] = tVal;
	424	}
	425	}
	426
	427	// masking canonical ntHash using spaced seed pattern
	428	inline uint64_t maskHash(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
	429	uint64_t fsVal=fkVal, rsVal=rkVal;
	430	for(unsigned i=0; i<k; i++) {
	431	if(seedSeq[i]!='1') {
	432	uint64_t lfBits = seedTab[(unsigned char)kmerSeq[i]] >> 33;
	433	uint64_t rfBits = seedTab[(unsigned char)kmerSeq[i]] & 0x1FFFFFFFF;
	434	uint64_t sfMask = (rol31(lfBits,k-1-i) << 33) \| (rol33(rfBits,k-1-i));
	435	fsVal ^= sfMask;
	436
	437	uint64_t lrBits = seedTab[(unsigned char)kmerSeq[i]&cpOff] >> 33;
	438	uint64_t rrBits = seedTab[(unsigned char)kmerSeq[i]&cpOff] & 0x1FFFFFFFF;
	439	uint64_t srMask = (rol31(lrBits,i) << 33) \| (rol33(rrBits,i));
	440	rsVal ^= srMask;
	441	}
	442	}
	443	return (rsVal<fsVal)? rsVal : fsVal;
	444	}
	445
	446	#endif

+0

-1

~~lib/rolling-hash/Makefile.am~~ less more

0

EXTRA_DIST = README.md

+0

-2

~~lib/rolling-hash/README.md~~ less more

0		* source repo: https://github.com/bcgsc/ntHash
1		* git commit: 9f107de

+0

-316

~~lib/rolling-hash/rolling.h~~ less more

0		#ifndef ROLLING_HASH_H
1		#define ROLLING_HASH_H
2
3		#include <stdint.h>
4
5		// offset for the complement base in the random seeds table
6		const int cpOff = -20;
7
8		// shift for gerenerating multiple hash values
9		const int varShift = 27;
10
11		// seed for gerenerating multiple hash values
12		const uint64_t varSeed = 10427061540882326010ul;
13
14		// 64-bit random seed table corresponding to bases and their complements
15		static const uint64_t seedTab[256] = {
16		0, 0, 0, 0, 0, 0, 0, 0, // 0..7
17		0, 0, 0, 0, 0, 0, 0, 0, // 8..15
18		0, 0, 0, 0, 0, 0, 0, 0, // 16..23
19		0, 0, 0, 0, 0, 0, 0, 0, // 24..31
20		0, 0, 0, 0, 0, 0, 0, 0, // 32..39
21		0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 40..47
22		0, 0, 0, 3572411708064410444ul, 0, 0, 0, 0, // 48..55
23		0, 0, 0, 0, 0, 0, 0, 0, // 56..63
24		4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 64..71
25		0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 72..79
26		0, 0, 0, 3572411708064410444ul, 2978368046464386134ul, 0, 0, 0, // 80..87
27		0, 0, 0, 0, 0, 0, 0, 0, // 88..95
28		4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 96..103
29		0, 0, 0, 0, 0, 0, 0, 0, // 104..111
30		0, 0, 0, 0, 2978368046464386134ul, 0, 0, 0, // 112..119
31		0, 0, 0, 0, 0, 0, 0, 0, // 120..127
32		0, 0, 0, 0, 0, 0, 0, 0, // 128..135
33		0, 0, 0, 0, 0, 0, 0, 0, // 136..143
34		0, 0, 0, 0, 0, 0, 0, 0, // 144..151
35		0, 0, 0, 0, 0, 0, 0, 0, // 152..159
36		0, 0, 0, 0, 0, 0, 0, 0, // 160..167
37		0, 0, 0, 0, 0, 0, 0, 0, // 168..175
38		0, 0, 0, 0, 0, 0, 0, 0, // 176..183
39		0, 0, 0, 0, 0, 0, 0, 0, // 184..191
40		0, 0, 0, 0, 0, 0, 0, 0, // 192..199
41		0, 0, 0, 0, 0, 0, 0, 0, // 200..207
42		0, 0, 0, 0, 0, 0, 0, 0, // 208..215
43		0, 0, 0, 0, 0, 0, 0, 0, // 216..223
44		0, 0, 0, 0, 0, 0, 0, 0, // 224..231
45		0, 0, 0, 0, 0, 0, 0, 0, // 232..239
46		0, 0, 0, 0, 0, 0, 0, 0, // 240..247
47		0, 0, 0, 0, 0, 0, 0, 0 // 248..255
48		};
49
50		// rotate "v" to the left by "s" positions
51		inline uint64_t rol(const uint64_t v, const int s) {
52		return (v << s) \| (v >> (64 - s));
53		}
54
55		// rotate "v" to the right by "s" positions
56		inline uint64_t ror(const uint64_t v, const int s) {
57		return (v >> s) \| (v << (64 - s));
58		}
59
60		// forward-strand hash value of the base kmer, i.e. fhval(kmer_0)
61		inline uint64_t getFhval(const char * kmerSeq, const unsigned k) {
62		uint64_t hVal=0;
63		for(unsigned i=0; i<k; i++)
64		hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
65		return hVal;
66		}
67
68		// reverse-strand hash value of the base kmer, i.e. rhval(kmer_0)
69		inline uint64_t getRhval(const char * kmerSeq, const unsigned k) {
70		uint64_t hVal=0;
71		for(unsigned i=0; i<k; i++)
72		hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
73		return hVal;
74		}
75
76		// cannonical hash value of the base kmer, i.e. rhval(kmer_0)
77		inline uint64_t getChval(const char * kmerSeq, const unsigned k) {
78		uint64_t fhVal = getFhval(kmerSeq, k);
79		uint64_t rhVal = getRhval(kmerSeq, k);
80		return (rhVal<fhVal)? rhVal : fhVal;
81		}
82
83		// initialize forward-strand hash value of the first kmer, i.e. fhval(kmer_0)
84		inline uint64_t initHashes(const char * kmerSeq, const unsigned k) {
85		return getFhval(kmerSeq, k);
86		}
87
88		// initialize cannonical hash value of the first kmer, i.e. chval(kmer_0)
89		inline uint64_t initHashes(const char * kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
90		fhVal = getFhval(kmerSeq, k);
91		rhVal = getRhval(kmerSeq, k);
92		return (rhVal<fhVal)? rhVal : fhVal;
93		}
94
95		// recursive forward-strand hash value for next k-mer
96		inline uint64_t rollHashesRight(const uint64_t fhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) {
97		return(rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn]);
98		}
99
100		// recursive cannonical hash value for next k-mer
101		inline uint64_t rollHashesRight(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) {
102		fhVal = rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
103		rhVal = ror(rhVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1);
104		return (rhVal<fhVal)? rhVal : fhVal;
105		}
106
107		// recursive forward-strand hash value for prev k-mer
108		inline uint64_t rollHashesLeft(const uint64_t fhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) {
109		return(ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1));
110		}
111
112		// recursive canonical hash value for prev k-mer
113		inline uint64_t rollHashesLeft(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) {
114		fhVal = ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
115		rhVal = rol(rhVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff];
116		return (rhVal<fhVal)? rhVal : fhVal;
117		}
118
119		// change a single base and update forward-strand hash value accordingly
120		inline uint64_t setBase(uint64_t fhVal, char* kmerSeq, unsigned pos, char base, unsigned k)
121		{
122		fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
123		kmerSeq[pos] = base;
124		fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
125		return fhVal;
126		}
127
128		// change a single base and update hash values accordingly
129		inline uint64_t setBase(uint64_t& fhVal, uint64_t& rhVal, char* kmerSeq, unsigned pos, char base, unsigned k)
130		{
131		fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
132		rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos);
133		kmerSeq[pos] = base;
134		fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
135		rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos);
136		return (rhVal<fhVal)? rhVal : fhVal;
137		}
138
139		/**
140		* Compute multiple pseudo-independent hash values from a seed hash value.
141		*
142		* @param hashes array for storing computed hash values
143		* @param seedVal seed value for multi-hash calculation
144		* @param numHashes number of hash values to compute
145		* @param k-mer size
146		*/
147		inline void multiHash(uint64_t hashes[], uint64_t seedVal, unsigned numHashes, unsigned k)
148		{
149		for (unsigned i = 0; i < numHashes; i++) {
150		hashes[i] = seedVal * (i ^ k * varSeed);
151		hashes[i] ^= hashes[i] >> varShift;
152		}
153		}
154
155		// spaced-seed hash values
156
157		/**
158		* Calculate forward-strand spaced seed hash value of the base kmer, i.e. fhval(kmer_0)
159		*
160		* @param kVal set to forward-strand hash value for unmasked k-mer
161		* @param seedSeq bitmask indicating "don't care" positions for hashing
162		* @param kmerSeq k-mer to be hashed
163		* @param k k-mer size
164		* @return hash value for masked forward-strand k-mer
165		*/
166		inline uint64_t getFhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
167		kVal=0;
168		uint64_t sVal=0;
169		for(unsigned i=0; i<k; i++) {
170		kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
171		if(seedSeq[i]=='1')
172		sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
173		}
174		return sVal;
175		}
176
177		/**
178		* Calculate reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0)
179		*
180		* @param kVal set to reverse-strand hash value for unmasked k-mer
181		* @param seedSeq bitmask indicating "don't care" positions for hashing
182		* @param kmerSeq k-mer to be hashed
183		* @param k k-mer size
184		* @return hash for masked reverse-strand k-mer
185		*/
186		// reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0)
187		inline uint64_t getRhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
188		kVal=0;
189		uint64_t sVal=0;
190		for(unsigned i=0; i<k; i++) {
191		kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
192		if(seedSeq[i]=='1')
193		sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
194		}
195		return sVal;
196		}
197
198		/**
199		* Recursive forward-strand spaced seed hash value for next k-mer
200		*
201		* @param kVal hash value for current k-mer unmasked and in forward orientation
202		* @param seedSeq bitmask indicating "don't care" positions for hashing
203		* @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
204		* @param charIn new base we are rolling in from the right
205		* @param k k-mer size
206		* @return hash for masked k-mer in forward orientation
207		*/
208		inline uint64_t rollHashesRight(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
209		const unsigned charOut = kmerSeq[0];
210		kVal = rol(kVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
211		uint64_t sVal=kVal;
212		for(unsigned i=1; i<k-1; i++) {
213		if(seedSeq[i]!='1')
214		sVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i);
215		}
216		return sVal;
217		}
218
219		/**
220		* Recursive forward-strand spaced seed hash value for prev k-mer
221		*
222		* @param kVal hash value for current k-mer unmasked and in forward orientation
223		* @param seedSeq bitmask indicating "don't care" positions for hashing
224		* @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
225		* @param charIn new base we are rolling in from the left
226		* @param k k-mer size
227		* @return hash for masked k-mer in forward orientation
228		*/
229		inline uint64_t rollHashesLeft(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
230		const unsigned charOut = kmerSeq[k-1];
231		kVal = ror(kVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
232		uint64_t sVal=kVal;
233		for(unsigned i=1; i<k-1; i++) {
234		if(seedSeq[i]!='1')
235		sVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]], k-1-i);
236		}
237		return sVal;
238		}
239
240		/**
241		* Recursive canonical spaced seed hash value for next k-mer
242		*
243		* @param fkVal hash value for current k-mer unmasked and in forward orientation
244		* @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
245		* @param seedSeq bitmask indicating "don't care" positions for hashing
246		* @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
247		* @param charIn new base we are rolling in from the right
248		* @param k k-mer size
249		* @return canonical hash value for masked k-mer
250		*/
251		inline uint64_t rollHashesRight(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
252		const unsigned charOut = kmerSeq[0];
253		fkVal = rol(fkVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
254		rkVal = ror(rkVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1);
255		uint64_t fsVal=fkVal, rsVal=rkVal;
256		for(unsigned i=1; i<k-1; i++) {
257		if(seedSeq[i]!='1') {
258		fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i);
259		rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]+cpOff], i);
260		}
261		}
262		return (rsVal<fsVal)? rsVal : fsVal;
263		}
264
265		/**
266		* Recursive canonical spaced seed hash value for prev k-mer
267		*
268		* @param fkVal hash value for current k-mer unmasked and in forward orientation
269		* @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
270		* @param seedSeq bitmask indicating "don't care" positions for hashing
271		* @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
272		* @param charIn new base we are rolling in from the left
273		* @param k k-mer size
274		* @return canonical hash value for masked k-mer
275		*/
276		inline uint64_t rollHashesLeft(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
277		const unsigned charOut = kmerSeq[k-1];
278		fkVal = ror(fkVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
279		rkVal = rol(rkVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff];
280		uint64_t fsVal=fkVal, rsVal=rkVal;
281		for(unsigned i=1; i<k-1; i++) {
282		if(seedSeq[i]!='1') {
283		fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]], k-1-i);
284		rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]+cpOff], i);
285		}
286		}
287		return (rsVal<fsVal)? rsVal : fsVal;
288		}
289
290		/**
291		* Change a single base and recompute spaced seed hash values
292		*
293		* @param fkVal hash value for current k-mer unmasked and in forward orientation
294		* @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
295		* @param seedSeq bitmask indicating "don't care" positions for hashing
296		* @param kmerSeq sequence for current k-mer
297		* @param pos position of base to change
298		* @param base new base value
299		* @param k k-mer size
300		* @return updated canonical hash value for masked k-mer
301		*/
302		inline uint64_t setBase(uint64_t& fkVal, uint64_t& rkVal, const char * seedSeq, char * kmerSeq, unsigned pos, char base, unsigned k)
303		{
304		setBase(fkVal, rkVal, kmerSeq, pos, base, k);
305		uint64_t fsVal=fkVal, rsVal=rkVal;
306		for(unsigned i=0; i<k; i++) {
307		if(seedSeq[i]!='1') {
308		fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
309		rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
310		}
311		}
312		return (rsVal<fsVal)? rsVal : fsVal;
313		}
314
315		#endif