Codebase list abyss / bd7747b
New upstream version 2.1.1 Andreas Tille 5 years ago
61 changed file(s) with 999 addition(s) and 798 deletion(s). Raw diff Collapse all Expand all
11 jobs:
22 build:
33 docker:
4 - image: ubuntu:xenial
4 - image: ubuntu:bionic
55 steps:
66 - run: |
77 apt-get update -qq
0 # Please report
1
2 - [ ] version of ABySS with `abyss-pe version`
3 - [ ] distribution of Linux with `lsb_release -d`
4
5 # Assembly error
6
7 - [ ] complete `abyss-pe` command line
8 - [ ] last 20 lines of the output of `abyss-pe`
9 - [ ] number of sequenced bases
10 - [ ] estimated genome size and ploidy
11 - [ ] estimated sequencing depth of coverage
12
13 # Build error
14
15 Consider installing ABySS using [Linuxbrew](https://linuxbrew.sh) on Linux or [Homebrew](https://brew.sh) on macOS with `brew install abyss`, or using [Bioconda](https://bioconda.github.io) with `conda install abyss`.
16
17 - [ ] Have you tried installing ABySS using Brew or Bioconda?
18 - [ ] version of GCC or compiler with `gcc --version`
19 - [ ] complete `./configure` command line
20 - [ ] last 20 lines of the output of `./configure`
21 - [ ] last 20 lines of the output of `make`
7171
7272 AssemblyAlgorithms::setCoverageParameters(
7373 AssemblyAlgorithms::coverageHistogram(g));
74
75 if (opt::kc > 0) {
76 cout << "Minimum k-mer multiplicity kc is " << opt::kc << endl;
77 cout << "Removing low-multiplicity k-mers" << endl;
78 size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(g, opt::kc);
79 cout << "Removed " << removed
80 << " low-multiplicity k-mers, " << g.size()
81 << " k-mers remaining" << std::endl;
82 }
7483
7584 cout << "Generating adjacency" << endl;
7685 AssemblyAlgorithms::generateAdjacency(&g);
11 #define BRANCHGROUP_H 1
22
33 #include "Common/Algorithms.h"
4 #include "Common/Exception.h"
45 #include <algorithm> // for swap
56 #include <map>
67 #include <utility>
210211
211212 namespace std {
212213 template <>
213 inline void swap(BranchGroup&, BranchGroup&) { assert(false); }
214 inline void swap(BranchGroup&, BranchGroup&) NOEXCEPT { assert(false); }
214215 }
215216
216217 #endif
00 #ifndef ASSEMBLY_BRANCHRECORDBASE_H
11 #define ASSEMBLY_BRANCHRECORDBASE_H 1
2
3 #include "Common/Exception.h"
24
35 #include <algorithm>
46 #include <cassert>
171173
172174 namespace std {
173175 template <>
174 inline void swap(BranchRecord& a, BranchRecord& b)
176 inline void swap(BranchRecord& a, BranchRecord& b) NOEXCEPT
175177 {
176178 a.swap(b);
177179 }
111111 }
112112 }
113113
114 /** Remove all k-mers with multiplicity lower than the given threshold */
115 static inline
116 size_t applyKmerCoverageThreshold(SequenceCollectionHash& c, unsigned kc)
117 {
118 if (kc == 0)
119 return 0;
120
121 for (SequenceCollectionHash::iterator it = c.begin();
122 it != c.end(); ++it) {
123 if (it->second.getMultiplicity() < kc)
124 it->second.setFlag(SF_DELETE);
125 }
126
127 return c.cleanup();
128 }
129
114130 } // namespace AssemblyAlgorithms
115131
116132 #endif
5353 " -t, --trim-length=N maximum length of blunt contigs to trim [k]\n"
5454 " -c, --coverage=FLOAT remove contigs with mean k-mer coverage\n"
5555 " less than this threshold\n"
56 " --kc=N remove all k-mers with multiplicity < N [0]\n"
5657 " -b, --bubbles=N pop bubbles shorter than N bp [3*k]\n"
5758 " -b0, --no-bubbles do not pop bubbles\n"
5859 " -e, --erode=N erode bases at the ends of blunt contigs with coverage\n"
106107 /** Coverage cutoff. */
107108 float coverage = -1;
108109
110 /** Minimum k-mer multiplicity cutoff. */
111 unsigned kc = 0;
112
109113 /** Pop bubbles shorter than N bp. */
110114 int bubbleLen = -1;
111115
146150
147151 static const char shortopts[] = "b:c:e:E:g:k:K:mo:Q:q:s:t:v";
148152
149 enum { OPT_HELP = 1, OPT_VERSION, COVERAGE_HIST, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
153 enum { OPT_HELP = 1, OPT_VERSION, COVERAGE_HIST, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES, OPT_KC };
150154
151155 static const struct option longopts[] = {
152156 { "out", required_argument, NULL, 'o' },
163167 { "SS", no_argument, &opt::ss, 1 },
164168 { "no-SS", no_argument, &opt::ss, 0 },
165169 { "coverage", required_argument, NULL, 'c' },
170 { "kc", required_argument, NULL, OPT_KC },
166171 { "coverage-hist", required_argument, NULL, COVERAGE_HIST },
167172 { "bubble-length", required_argument, NULL, 'b' },
168173 { "no-bubbles", no_argument, &opt::bubbleLen, 0 },
301306 case OPT_SPECIES:
302307 arg >> opt::metaVars[2];
303308 break;
309 case OPT_KC:
310 arg >> opt::kc;
311 break;
304312 }
305313 if (optarg != NULL && !arg.eof()) {
306314 cerr << PROGRAM ": invalid option: `-"
1313 extern unsigned erodeStrand;
1414 extern unsigned trimLen;
1515 extern float coverage;
16 extern unsigned kc;
1617 extern unsigned bubbleLen;
1718 extern unsigned ss;
1819 extern bool maskCov;
1212 #include <vector>
1313 #include <iostream>
1414 #include <boost/dynamic_bitset.hpp>
15
16 /*
17 * Put `BloomFilter` class in `Konnector` namespace to avoid collision with BTL
18 * `BloomFilter` class of the same name.
19 */
20 namespace Konnector {
1521
1622 /** A Bloom filter. */
1723 class BloomFilter
164170 char* m_array;
165171 };
166172
173 } // end Konnector namespace
174
167175 #endif
1313 * A bloom filter that represents a window
1414 * within a larger bloom filter.
1515 */
16 class BloomFilterWindow : public BloomFilter
16 class BloomFilterWindow : public Konnector::BloomFilter
1717 {
1818 public:
1919
2020 /** Constructor. */
21 BloomFilterWindow() : BloomFilter() { };
21 BloomFilterWindow() : Konnector::BloomFilter() { };
2222
2323 /** Constructor.
2424 *
2828 */
2929 BloomFilterWindow(size_t fullBloomSize, size_t startBitPos,
3030 size_t endBitPos, size_t hashSeed=0) :
31 BloomFilter(endBitPos - startBitPos + 1, hashSeed),
31 Konnector::BloomFilter(endBitPos - startBitPos + 1, hashSeed),
3232 m_fullBloomSize(fullBloomSize),
3333 m_startBitPos(startBitPos),
3434 m_endBitPos(endBitPos)
6262 /** Return the size of the bit array. */
6363 size_t size() const
6464 {
65 return BloomFilter::size();
65 return Konnector::BloomFilter::size();
6666 }
6767
6868 /** Return the number of elements with count >= max_count. */
6969 size_t popcount() const
7070 {
71 return BloomFilter::popcount();
71 return Konnector::BloomFilter::popcount();
7272 }
7373
7474 /** Return the estimated false positive rate */
7575 double FPR() const
7676 {
77 return BloomFilter::FPR();
77 return Konnector::BloomFilter::FPR();
7878 }
7979
8080 /** Return whether the specified bit is set. */
8181 bool operator[](size_t i) const
8282 {
8383 if (i >= m_startBitPos && i <= m_endBitPos)
84 return BloomFilter::operator[](i - m_startBitPos);
84 return Konnector::BloomFilter::operator[](i - m_startBitPos);
8585 return false;
8686 }
8787
9595 void insert(size_t i)
9696 {
9797 if (i >= m_startBitPos && i <= m_endBitPos)
98 BloomFilter::insert(i - m_startBitPos);
98 Konnector::BloomFilter::insert(i - m_startBitPos);
9999 }
100100
101101 /** Add the object to this set. */
142142
143143 if (m_size != bits) {
144144 if (readOp == BITWISE_OVERWRITE) {
145 BloomFilter::resize(bits);
145 Konnector::BloomFilter::resize(bits);
146146 } else {
147147 std::cerr << "error: can't union/intersect bloom filters with "
148148 << "different sizes\n";
2121 {
2222 m_data.reserve(max_count);
2323 for (unsigned i = 0; i < max_count; i++)
24 m_data.push_back(new BloomFilter(n, hashSeed));
24 m_data.push_back(new Konnector::BloomFilter(n, hashSeed));
2525 }
2626
2727 /** Destructor */
2828 ~CascadingBloomFilter()
2929 {
30 typedef std::vector<BloomFilter*>::iterator Iterator;
30 typedef std::vector<Konnector::BloomFilter*>::iterator Iterator;
3131 for (Iterator i = m_data.begin(); i != m_data.end(); i++) {
3232 assert(*i != NULL);
3333 delete *i;
9090 }
9191
9292 /** Get the Bloom filter for a given level */
93 BloomFilter& getBloomFilter(unsigned level)
93 Konnector::BloomFilter& getBloomFilter(unsigned level)
9494 {
9595 assert(m_data.at(level) != NULL);
9696 return *m_data.at(level);
111111
112112 private:
113113 size_t m_hashSeed;
114 std::vector<BloomFilter*> m_data;
114 std::vector<Konnector::BloomFilter*> m_data;
115115
116116 };
117117
204204 { NULL, 0, NULL, 0 }
205205 };
206206
207 void dieWithUsageError()
207 __attribute__((noreturn))
208 static void dieWithUsageError()
208209 {
209210 cerr << "Try `" << PROGRAM
210211 << " --help' for more information.\n";
417418 if (opt::windows == 0) {
418419
419420 if (opt::levels == 1) {
420 BloomFilter bloom(bits, opt::hashSeed);
421 Konnector::BloomFilter bloom(bits, opt::hashSeed);
421422 #ifdef _OPENMP
422 ConcurrentBloomFilter<BloomFilter>
423 ConcurrentBloomFilter<Konnector::BloomFilter>
423424 cbf(bloom, opt::numLocks, opt::hashSeed);
424425 loadFilters(cbf, argc, argv);
425426 #else
639640 string outputPath(argv[optind]);
640641 optind++;
641642
642 BloomFilter bloom;
643 Konnector::BloomFilter bloom;
643644
644645 for (int i = optind; i < argc; i++) {
645646 string path(argv[i]);
694695 dieWithUsageError();
695696 }
696697
697 BloomFilter bloom;
698 Konnector::BloomFilter bloom;
698699 string path = argv[optind];
699700
700701 if (opt::verbose)
743744 std::cerr << "Computing distance for 2"
744745 << " samples...\n";
745746 // Get both paths and open istreams
746 BloomFilter bloomA;
747 Konnector::BloomFilter bloomA;
747748 string pathA(argv[optind]);
748 BloomFilter bloomB;
749 Konnector::BloomFilter bloomB;
749750 string pathB(argv[optind+1]);
750751 if (opt::verbose)
751752 std::cerr << "Loading bloom filters from "
844845
845846 int memberOf(int argc, char ** argv){
846847 // Initalise bloom and get globals
847 BloomFilter bloom;
848 Konnector::BloomFilter bloom;
848849 parseGlobalOpts(argc, argv);
849850 // Arg parser to get `m' option in case set
850851 for (int c; (c = getopt_long(argc, argv,
928929 /**
929930 * Calculate number of bases to trim from left end of sequence.
930931 */
931 int calcLeftTrim(const Sequence& seq, unsigned k, const BloomFilter& bloom,
932 int calcLeftTrim(const Sequence& seq, unsigned k, const Konnector::BloomFilter& bloom,
932933 size_t minBranchLen)
933934 {
934935 // Boost graph interface for Bloom filter
935 DBGBloom<BloomFilter> g(bloom);
936 DBGBloom<Konnector::BloomFilter> g(bloom);
936937
937938 // if this is the first k-mer we have found in
938939 // Bloom filter, starting from the left end
9991000 cerr << "Loading bloom filter from `"
10001001 << bloomPath << "'...\n";
10011002
1002 BloomFilter bloom;
1003 Konnector::BloomFilter bloom;
10031004 istream *in = openInputStream(bloomPath);
10041005 assert_good(*in, bloomPath);
10051006 bloom.read(*in);
4141 {
4242 m_data.reserve(levels);
4343 for (unsigned i = 0; i < levels; i++)
44 m_data.push_back(new BTL::BloomFilter(size, hashes, k));
44 m_data.push_back(new BloomFilter(size, hashes, k));
4545 }
4646
4747 /**
48 * Constructor to load a single-level BTL::BloomFilter from
49 * files. This is used to make BTL::BloomFilter support the
48 * Constructor to load a single-level BloomFilter from
49 * files. This is used to make BloomFilter support the
5050 * same interface as HashAgnosticCascadingBloom.
5151 */
5252 HashAgnosticCascadingBloom(const string& bloomPath)
137137 }
138138
139139 /** Get the Bloom filter for a given level */
140 BTL::BloomFilter& getBloomFilter(unsigned level)
140 BloomFilter& getBloomFilter(unsigned level)
141141 {
142142 assert(m_data.at(level) != NULL);
143143 return *m_data.at(level);
158158 void loadFilter(const string& bloomPath)
159159 {
160160 clear();
161 BTL::BloomFilter* bloom = new BTL::BloomFilter(bloomPath);
161 BloomFilter* bloom = new BloomFilter(bloomPath);
162162 m_k = bloom->getKmerSize();
163163 m_hashes = bloom->getHashNum();
164164 m_data.push_back(bloom);
171171 {
172172 m_k = 0;
173173 m_hashes = 0;
174 typedef std::vector<BTL::BloomFilter*>::iterator Iterator;
174 typedef std::vector<BloomFilter*>::iterator Iterator;
175175 for (Iterator i = m_data.begin(); i != m_data.end(); i++) {
176176 assert(*i != NULL);
177177 delete *i;
184184 /** number of hash functions */
185185 unsigned m_hashes;
186186 /** the array of Bloom filters */
187 std::vector<BTL::BloomFilter*> m_data;
187 std::vector<BloomFilter*> m_data;
188188 };
189189
190190 #endif
00 #ifndef LIGHTWEIGHT_KMER_H
11 #define LIGHTWEIGHT_KMER_H 1
2
3 #include "BloomDBG/MaskedKmer.h"
24
35 #include <algorithm>
46 #include <cstring>
2525 RollingHashIterator.h \
2626 SpacedSeed.h \
2727 $(top_srcdir)/lib/bloomfilter/BloomFilter.hpp \
28 $(top_srcdir)/lib/rolling-hash/rolling.h
28 $(top_srcdir)/lib/nthash/nthash.hpp
4444 : m_kmer(kmer), m_rollingHash(rollingHash) {}
4545
4646 const LightweightKmer& kmer() const { return m_kmer; };
47 LightweightKmer& kmer() { return m_kmer; };
48
4749 const RollingHash& rollingHash() const { return m_rollingHash; }
4850
4951 RollingBloomDBGVertex clone() const {
6264
6365 void setLastBase(extDirection dir, char base)
6466 {
65 const unsigned k = Kmer::length();
66 if (dir == SENSE) {
67 m_rollingHash.setBase(m_kmer.c_str(), k-1, base);
68 } else {
69 m_rollingHash.setBase(m_kmer.c_str(), 0, base);
70 }
67 m_rollingHash.setLastBase(kmer().c_str(), dir, base);
68 kmer().setLastBase(dir, base);
7169 }
7270
7371 /**
11 #define ABYSS_ROLLING_HASH_H 1
22
33 #include "config.h"
4 #include "lib/rolling-hash/rolling.h"
4
5 #include "BloomDBG/LightweightKmer.h"
56 #include "BloomDBG/MaskedKmer.h"
7 #include "Common/Sense.h"
8 #include "lib/nthash/nthash.hpp"
9
10 #include <algorithm>
611 #include <string>
712 #include <vector>
813 #include <cassert>
6065 */
6166 void reset(const std::string& kmer)
6267 {
68 /* compute initial hash values for forward and reverse-complement k-mer */
69 NTC64(kmer.c_str(), m_k, m_hash1, m_rcHash1);
70
71 /* get canonical hash value from forward/reverse hash values */
72 m_hash = canonicalHash(m_hash1, m_rcHash1);
73
6374 if (!MaskedKmer::mask().empty())
64 resetMasked(kmer.c_str());
65 else
66 resetUnmasked(kmer);
67 }
68
69 /**
70 * Initialize hash values from current k-mer. When computing the hash
71 * value, mask out "don't care" positions as per the active
72 * k-mer mask.
73 */
74 void resetMasked(const char* kmer)
75 {
76 const std::string& spacedSeed = MaskedKmer::mask();
77 assert(spacedSeed.length() == m_k);
78
79 /* compute first hash function for k-mer */
80 uint64_t hash1 = getFhval(m_hash1, spacedSeed.c_str(), kmer, m_k);
81
82 /* compute first hash function for reverse complement of k-mer */
83 uint64_t rcHash1 = getRhval(m_rcHash1, spacedSeed.c_str(), kmer, m_k);
84
85 m_hash = canonicalHash(hash1, rcHash1);
86 }
87
88 /**
89 * Initialize hash values from sequence.
90 * @param kmer k-mer used to initialize hash state
91 */
92 void resetUnmasked(const std::string& kmer)
93 {
94 /* compute first hash function for k-mer */
95 m_hash1 = getFhval(kmer.c_str(), m_k);
96
97 /* compute first hash function for reverse complement
98 * of k-mer */
99 m_rcHash1 = getRhval(kmer.c_str(), m_k);
100
101 m_hash = canonicalHash(m_hash1, m_rcHash1);
75 m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
76 kmer.c_str(), m_k);
10277 }
10378
10479 /**
10984 */
11085 void rollRight(const char* kmer, char charIn)
11186 {
112 if (!MaskedKmer::mask().empty())
113 rollRightMasked(kmer, charIn);
114 else
115 rollRightUnmasked(kmer, charIn);
116 }
117
118 /**
119 * Compute hash values for next k-mer to the right and
120 * update internal state. When computing the new hash, mask
121 * out "don't care" positions according to the active
122 * k-mer mask.
123 * @param kmer current k-mer
124 * @param nextKmer k-mer we are rolling into
125 */
126 void rollRightMasked(const char* kmer, char charIn)
127 {
128 const std::string& spacedSeed = MaskedKmer::mask();
129 m_hash = rollHashesRight(m_hash1, m_rcHash1, spacedSeed.c_str(),
130 kmer, charIn, m_k);
131 }
132
133 /**
134 * Compute hash values for next k-mer to the right and
135 * update internal state.
136 * @param kmer current k-mer
137 * @param nextKmer k-mer we are rolling into
138 */
139 void rollRightUnmasked(const char* kmer, char charIn)
140 {
141 /* update first hash function */
142 rollHashesRight(m_hash1, m_rcHash1, kmer[0], charIn, m_k);
143 m_hash = canonicalHash(m_hash1, m_rcHash1);
87 NTC64(kmer[0], charIn, m_k, m_hash1, m_rcHash1);
88 m_hash = canonicalHash(m_hash1, m_rcHash1);
89
90 if (!MaskedKmer::mask().empty()) {
91 // TODO: copying the k-mer and shifting is very inefficient;
92 // we need a specialized nthash function that rolls and masks
93 // simultaneously
94 LightweightKmer next(kmer);
95 next.shift(SENSE, charIn);
96 m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
97 next.c_str(), m_k);
98 }
14499 }
145100
146101 /**
151106 */
152107 void rollLeft(char charIn, const char* kmer)
153108 {
154 if (!MaskedKmer::mask().empty())
155 rollLeftMasked(charIn, kmer);
156 else
157 rollLeftUnmasked(charIn, kmer);
158 }
159
160 /**
161 * Compute hash values for next k-mer to the left and
162 * update internal state. When computing the new hash, mask
163 * out "don't care" positions according to the active
164 * k-mer mask.
165 * @param prevKmer k-mer we are rolling into
166 * @param kmer current k-mer
167 */
168 void rollLeftMasked(char charIn, const char* kmer)
169 {
170 const std::string& spacedSeed = MaskedKmer::mask();
171 m_hash = rollHashesLeft(m_hash1, m_rcHash1, spacedSeed.c_str(),
172 kmer, charIn, m_k);
173 }
174
175 /**
176 * Compute hash values for next k-mer to the left and
177 * update internal state.
178 * @param prevKmer k-mer we are rolling into
179 * @param kmer current k-mer
180 */
181 void rollLeftUnmasked(char charIn, const char* kmer)
182 {
183 /* update first hash function */
184 rollHashesLeft(m_hash1, m_rcHash1, charIn, kmer[m_k-1], m_k);
185 m_hash = canonicalHash(m_hash1, m_rcHash1);
109 NTC64L(kmer[m_k-1], charIn, m_k, m_hash1, m_rcHash1);
110 m_hash = canonicalHash(m_hash1, m_rcHash1);
111
112 if (!MaskedKmer::mask().empty()) {
113 // TODO: copying the k-mer and shifting is very inefficient;
114 // we need a specialized nthash function that rolls and masks
115 // simultaneously
116 LightweightKmer next(kmer);
117 next.shift(ANTISENSE, charIn);
118 m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
119 next.c_str(), m_k);
120 }
186121 }
187122
188123 /**
202137 */
203138 void getHashes(size_t hashes[]) const
204139 {
205 uint64_t tmpHashes[MAX_HASHES];
206 multiHash(tmpHashes, m_hash, m_numHashes, m_k);
207 for (unsigned i = 0; i < m_numHashes; ++i) {
208 hashes[i] = (size_t)tmpHashes[i];
209 }
140 for (unsigned i = 0; i < m_numHashes; ++i)
141 hashes[i] = NTE64(m_hash, m_k, i);
210142 }
211143
212144 /** Equality operator */
229161 }
230162
231163 /**
232 * Set the base at a given position in the k-mer and update the hash
233 * value accordingly.
164 * Change the hash value to reflect a change in the first/last base of
165 * the k-mer.
234166 * @param kmer point to the k-mer char array
235 * @param pos position of the base to be changed
167 * @param dir if SENSE, change last base; if ANTISENSE,
168 * change first base
236169 * @param base new value for the base
237170 */
238 void setBase(char* kmer, unsigned pos, char base)
239 {
171 void setLastBase(char* kmer, extDirection dir, char base)
172 {
173 if (dir == SENSE) {
174 /* roll left to remove old last char */
175 NTC64L(kmer[m_k-1], 'A', m_k, m_hash1, m_rcHash1);
176 /* roll right to add new last char */
177 NTC64('A', base, m_k, m_hash1, m_rcHash1);
178 } else {
179 /* roll right to remove old first char */
180 NTC64(kmer[0], 'A', m_k, m_hash1, m_rcHash1);
181 /* roll left to add new first char */
182 NTC64L('A', base, m_k, m_hash1, m_rcHash1);
183 }
184 m_hash = canonicalHash(m_hash1, m_rcHash1);
185
240186 if (!MaskedKmer::mask().empty())
241 setBaseMasked(kmer, pos, base);
242 else
243 setBaseUnmasked(kmer, pos, base);
244 }
245
246 /**
247 * Set the base at a given position in the k-mer and update the hash
248 * value accordingly.
249 * @param kmer point to the k-mer char array
250 * @param pos position of the base to be changed
251 * @param base new value for the base
252 */
253 void setBaseMasked(char* kmer, unsigned pos, char base)
254 {
255 const std::string& spacedSeed = MaskedKmer::mask();
256 assert(spacedSeed.length() == m_k);
257 m_hash = ::setBase(m_hash1, m_rcHash1, spacedSeed.c_str(), kmer,
258 pos, base, m_k);
259 }
260
261 /**
262 * Set the base at a given position in the k-mer and update the hash
263 * value accordingly.
264 * @param kmer point to the k-mer char array
265 * @param pos position of the base to be changed
266 * @param base new value for the base
267 */
268 void setBaseUnmasked(char* kmer, unsigned pos, char base)
269 {
270 m_hash = ::setBase(m_hash1, m_rcHash1, kmer, pos, base, m_k);
187 m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(),
188 kmer, m_k);
271189 }
272190
273191 private:
227227 HashAgnosticCascadingBloom solidKmerSet;
228228
229229 /* empty visited k-mers Bloom filter */
230 BTL::BloomFilter visitedKmerSet;
230 BloomFilter visitedKmerSet;
231231
232232 /* counters for progress messages */
233233 BloomDBG::AssemblyCounters counters;
573573 if (!assembledKmerSet.contains(*fwd))
574574 break;
575575 }
576 if (fwd.pos() > 0)
576 if (fwd != RollingHashIterator::end() && fwd.pos() > 0)
577577 seq.erase(0, fwd.pos());
578578
579579 /* trim previously assembled k-mers from end of sequence */
583583 if (!assembledKmerSet.contains(*rev))
584584 break;
585585 }
586 if (rev.pos() > 0)
586 if (rev != RollingHashIterator::end() && rev.pos() > 0)
587587 rcSeq.erase(0, rev.pos());
588588
589589 /* flip seq back to original orientation */
833833 std::ostream& out)
834834 {
835835 /* k-mers in previously assembled contigs */
836 BTL::BloomFilter visitedKmerSet(solidKmerSet.size(),
836 BloomFilter visitedKmerSet(solidKmerSet.size(),
837837 solidKmerSet.getHashNum(), solidKmerSet.getKmerSize());
838838
839839 /* counters for progress messages */
0 2018-09-11 Ben Vandervalk <benv@bcgsc.ca>
1
2 * Release version 2.1.1
3
4 abyss-bloom-dbg:
5 * upgrade to most recent version of ntHash to reduce
6 some assembly/hashing artifacts. On a human assembly, this
7 reduced QUAST major misassemblies by 5% and increased
8 scaffold contiguity by 10%
9 * `kc` parameter now also applies to MPI assemblies (see below)
10
11 abyss-fac:
12 * change N20 and N80 to N25 and N75, respectively
13
14 ABYSS-P:
15 * add `--kc` option, with implements a hard minimum k-mer
16 multiplicity cutoff
17
18 abyss-pe:
19 * fix `zsh: no such option: pipefail` error with
20 old versions of `zsh` (fallback to `bash` instead)
21 * adding `time=1` now times *all* assembly commands
22
23 abyss-sealer:
24 * parallelize gap sealing with OpenMP (thanks to
25 @schutzekatze!)
26 * add `--gap-file` option (thanks to @schutzekatze!)
27
28 DistanceEst:
29 * add support for GFA output
30
031 2018-04-13 Ben Vandervalk <benv@bcgsc.ca>
132
233 * Release version 2.1.0
11 #define ESTIMATE_H 1
22
33 #include "Common/ContigProperties.h" // for Distance
4 #include "Common/Exception.h"
45 #include "ContigID.h"
56 #include "ContigNode.h"
67 #include "Graph/Options.h" // for opt::k
197198
198199 namespace std {
199200 template<>
200 inline void swap(EstimateRecord&, EstimateRecord&)
201 inline void swap(EstimateRecord&, EstimateRecord&) NOEXCEPT
201202 {
202203 assert(false);
203204 }
0 #ifndef _EXCEPTION_H_
1 #define _EXCEPTION_H_ 1
2
3 /* `noexcept` is only recognized in C++11 or later. See https://stackoverflow.com/questions/24567173/backwards-compatible-noexceptfalse-for-destructors */
4 #if __cplusplus >= 201103L
5 #define NOEXCEPT noexcept
6 #else
7 #define NOEXCEPT
8 #endif
9
10 #endif
00 #ifndef HISTOGRAM_H
11 #define HISTOGRAM_H 1
22
3 #include "Common/Exception.h"
34 #include "StringUtil.h" // for toEng
45 #include "VectorUtil.h" // for make_vector
56 #include <cassert>
311312
312313 namespace std {
313314 template<>
314 inline void swap(Histogram&, Histogram&) { assert(false); }
315 inline void swap(Histogram&, Histogram&) NOEXCEPT { assert(false); }
315316 }
316317
317318 /** Print assembly contiguity statistics header. */
328329 out << "LG50" << sep
329330 << "NG50" << sep;
330331 return out << "min" << sep
331 << "N80" << sep
332 << "N75" << sep
332333 << "N50" << sep
333 << "N20" << sep
334 << "N25" << sep
334335 << "E-size" << sep
335336 << "max" << sep
336337 << "sum" << sep
363364 }
364365 return out
365366 << toEng(h.minimum()) << sep
366 << toEng(h.weightedPercentile(1 - 0.8)) << sep
367 << toEng(h.weightedPercentile(1 - 0.75)) << sep
367368 << toEng(n50) << sep
368 << toEng(h.weightedPercentile(1 - 0.2)) << sep
369 << toEng(h.weightedPercentile(1 - 0.25)) << sep
369370 << toEng((unsigned)h.expectedValue()) << sep
370371 << toEng(h.maximum()) << sep
371372 << toEng(sum);
385386 << h.size()
386387 << h.count(n50, INT_MAX)
387388 << h.minimum()
388 << h.weightedPercentile(1 - 0.8)
389 << h.weightedPercentile(1 - 0.75)
389390 << n50
390 << h.weightedPercentile(1 - 0.2)
391 << h.weightedPercentile(1 - 0.25)
391392 << (unsigned)h.expectedValue()
392393 << h.maximum()
393394 << sum;
1212 ContigProperties.h \
1313 Dictionary.h \
1414 Estimate.h \
15 Exception.h \
1516 Fcontrol.cpp Fcontrol.h \
1617 Functional.h \
1718 Hash.h \
00 #ifndef PMF_H
11 #define PMF_H 1
22
3 #include "Common/Exception.h"
34 #include "Histogram.h"
45 #include <cassert>
56 #include <cmath>
6263
6364 namespace std {
6465 template<>
65 inline void swap(PMF&, PMF&) { assert(false); }
66 inline void swap(PMF&, PMF&) NOEXCEPT { assert(false); }
6667 }
6768
6869 #endif
171171 case 'I': case 'X': case '=':
172172 qlen += len;
173173 clip1 += len;
174 // fallthrough
174175 case 'D': case 'N': case 'P':
175176 if (a.align_length == 0) {
176177 // Ignore a malformatted CIGAR string whose first
187188 clip1 = 0;
188189 break;
189190 }
191 // fallthrough
190192 case 'H': case 'S':
191193 qlen += len;
192194 clip1 += len;
321323 #if SAM_SEQ_QUAL
322324 out << '\t' << o.seq
323325 << '\t' << o.qual;
326 // note: leading tab ('\t') is already embedded in o.tags
324327 if (!o.tags.empty())
325 out << '\t' << o.tags;
328 out << o.tags;
326329 #else
327330 out << "\t*\t*";
328331 #endif
130130 /** The sequence */
131131 Sequence seq;
132132
133 FastaRecord() { }
133 FastaRecord() : anchor(0) { }
134134 FastaRecord(const std::string& id, const std::string& comment,
135135 const Sequence& seq)
136136 : id(id), comment(comment), anchor(0), seq(seq) { }
112112 cout << "n:NG50" << sep
113113 << "NG50" << sep;
114114 cout << "min" << sep
115 << "N80" << sep
115 << "N75" << sep
116116 << "N50" << sep
117 << "N20" << sep
117 << "N25" << sep
118118 << "E-size" << sep
119119 << "max" << sep
120120 << "sum" << sep
129129 cout << "n:NG50" << sep
130130 << "NG50" << sep;
131131 cout << "min" << sep
132 << "N80" << sep
132 << "N75" << sep
133133 << "N50" << sep
134 << "N20" << sep
134 << "N25" << sep
135135 << "E-size" << sep
136136 << "max" << sep
137137 << "sum" << sep
6060 " (maximum likelihood estimator)\n"
6161 " --mean use the difference of the population mean\n"
6262 " and the sample mean\n"
63 " --dist output graph in dist format [default]\n"
64 " --dot output graph in dot format\n"
63 " --dist output the graph in dist format [default]\n"
64 " --dot output the graph in GraphViz format\n"
65 " --gv output the graph in GraphViz format\n"
66 " --gfa output the graph in GFA2 format\n"
67 " --gfa2 output the graph in GFA2 format\n"
6568 " -j, --threads=N use N parallel threads [1]\n"
6669 " -v, --verbose display verbose output\n"
6770 " --help display this help and exit\n"
118121 static const struct option longopts[] = {
119122 { "dist", no_argument, &opt::format, DIST, },
120123 { "dot", no_argument, &opt::format, DOT, },
124 { "gv", no_argument, &opt::format, DOT, },
125 { "gfa", no_argument, &opt::format, GFA2, },
126 { "gfa2", no_argument, &opt::format, GFA2, },
121127 { "fr", no_argument, &opt::rf, false },
122128 { "rf", no_argument, &opt::rf, true },
123129 { "min-align", required_argument, NULL, 'l' },
274280 if (opt::format == DOT) {
275281 #pragma omp critical(out)
276282 out << get(g_contigNames, e) << " [" << est << "]\n";
283 } else if (opt::format == GFA2) {
284 // Output only one of the two complementary edges.
285 if (len1 < opt::seedLen || e.first < e.second || e.first == e.second)
286 #pragma omp critical(out)
287 out << "G\t*"
288 << '\t' << get(g_contigNames, e.first)
289 << '\t' << get(g_contigNames, e.second)
290 << '\t' << est.distance
291 << '\t' << (int)ceilf(est.stdDev)
292 << "\tFC:i:" << est.numPairs
293 << '\n';
277294 } else
278295 out << ' ' << get(g_contigNames, id1) << ',' << est;
279296 } else if (opt::verbose > 1) {
316333 const PairsMap& x = dataMap[sense0 ^ opt::rf];
317334 for (PairsMap::const_iterator it = x.begin();
318335 it != x.end(); ++it)
319 writeEstimate(opt::format == DOT ? out : ss,
336 writeEstimate(opt::format == DIST ? ss : out,
320337 ContigNode(id0, sense0), it->first,
321338 len0, lengthVec[it->first.id()],
322339 it->second, pmf);
501518 "k=" << opt::k << " "
502519 "s=" << opt::seedLen << " "
503520 "n=" << opt::npairs << "]\n";
521 else if (opt::format == GFA2)
522 out << "H\tVN:Z:2.0\n";
504523
505524 vector<int> vals = make_vector<int>()
506525 << opt::k
216216 }
217217
218218 /** Read a graph in GFA format. */
219 template <typename Graph>
220 std::istream& read_gfa(std::istream& in, Graph& g)
219 template <typename Graph, typename BetterEP>
220 std::istream& read_gfa(std::istream& in, Graph& g, BetterEP betterEP)
221221 {
222222 assert(in);
223223
224224 typedef typename graph_traits<Graph>::vertex_descriptor V;
225225 typedef typename vertex_property<Graph>::type VP;
226 typedef typename graph_traits<Graph>::edge_descriptor E;
226227 typedef typename edge_property<Graph>::type EP;
227228
228229 // Add vertices if this graph is empty.
350351 assert(in);
351352 V u = find_vertex(uname, g);
352353 V v = find_vertex(vname, g);
353 add_edge(u, v, ep, g);
354 E e;
355 bool found;
356 boost::tie(e, found) = edge(u, v, g);
357 if (found) {
358 // Parallel edge
359 EP& ref = g[e];
360 ref = betterEP(ref, ep);
361 } else
362 add_edge(u, v, ep, g);
354363 break;
355364 }
356365
6666 case 'T': // HT: ASQG format
6767 return read_asqg(in, g);
6868 case '\t': // H: GAF format
69 return read_gfa(in, g);
69 return read_gfa(in, g, betterEP);
7070 default:
7171 std::cerr << "Unknown file format: `H" << c << "'\n";
7272 exit(EXIT_FAILURE);
3232 #endif
3333
3434 using namespace std;
35 using Konnector::BloomFilter;
3536
3637 #define PROGRAM "konnector"
3738
814814 result = ES_EXTENDED_TO_CYCLE;
815815 else
816816 result = ES_INTERNAL_CYCLE;
817 break;
817818 case DEAD_END:
818819 result = ES_DEAD_END;
819820 break;
5656 Sealer \
5757 AdjList \
5858 lib/bloomfilter \
59 lib/rolling-hash \
59 lib/nthash \
6060 $(GTest) \
6161 $(UnitTest)
6262
640640 printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH)
641641 << '\t' << opt::out << '\n';
642642 }
643 #if 0
644 // assembly contiguity statistics
645 vector<int> vals = passContiguityStatsVal(lengthHistogram,200);
646 vector<string> keys = make_vector<string>()
647 << "n"
648 << "n200"
649 << "nN50"
650 << "min"
651 << "N80"
652 << "N50"
653 << "N20"
654 << "Esize"
655 << "max"
656 << "sum"
657 << "nNG50"
658 << "NG50";
659
660 if (!opt::db.empty()) {
661 for (unsigned a=0; a<vals.size(); a++)
662 addToDb(db, keys[a], vals[a]);
663 }
664 #endif
665643 return 0;
666644 }
278278 where
279279 help = putStr (usageInfo usage options) >> exitSuccess
280280 tryHelp = "Try 'abyss-samtobreak --help' for more information."
281 version = "abyss-samtobreak (ABySS) 2.1.0\n"
281 version = "abyss-samtobreak (ABySS) 2.1.1\n"
282282 usage = "Usage: samtobreak [OPTION]... [FILE]...\n\
283283 \Calculate contig and scaffold contiguity and correctness metrics.\n"
284284
9797 = AssemblyAlgorithms::coverageHistogram(m_data);
9898 Histogram h(m_comm.reduce(myh.toVector()));
9999 AssemblyAlgorithms::setCoverageParameters(h);
100
101 /*
102 * If we have loaded the k-mer hash table from disk, then
103 * the adjacency data for each k-mer has already been computed
104 * in the hash table.
105 *
106 * `applyKmerCoverageThreshold` would not work correctly
107 * in this case because it removes k-mer records from
108 * the hash table without attempting to update the adjacency
109 * info of neighbouring k-mers.
110 */
111 if (!m_data.isAdjacencyLoaded() && opt::kc > 0) {
112 m_comm.barrier();
113 size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(m_data, opt::kc);
114 logger(1) << "Removed " << removed
115 << " low multiplicity k-mers" << std::endl;
116 m_comm.reduce(removed);
117 m_comm.reduce(m_data.size());
118 }
119
100120 EndState();
101121 SetState(NAS_WAITING);
102122 break;
103123 }
104124 case NAS_GEN_ADJ:
105125 m_comm.barrier();
126
106127 m_numBasesAdjSet = 0;
107128 AssemblyAlgorithms::generateAdjacency(this);
108129 EndState();
473494 = AssemblyAlgorithms::coverageHistogram(m_data);
474495 Histogram h(m_comm.reduce(myh.toVector()));
475496 AssemblyAlgorithms::setCoverageParameters(h);
476 EndState();
477
497
498 /*
499 * If we have loaded the k-mer hash table from disk, then
500 * the adjacency data for each k-mer has already been computed
501 * in the hash table.
502 *
503 * `applyKmerCoverageThreshold` would not work correctly
504 * in this case because it removes k-mer records from
505 * the hash table without attempting to update the adjacency
506 * info of neighbouring k-mers.
507 */
508 if (!m_data.isAdjacencyLoaded() && opt::kc > 0) {
509 cout << "Minimum k-mer multiplicity kc is "
510 << opt::kc << '\n';
511 cout << "Removing low multiplicity k-mers..." << std::endl;
512 m_comm.barrier();
513 size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(m_data, opt::kc);
514 logger(1) << "Removed " << removed
515 << " low multiplicity k-mers" << std::endl;
516 size_t sumRemoved = m_comm.reduce(removed);
517 size_t remaining = m_comm.reduce(m_data.size());
518 cout << "Removed " << sumRemoved
519 << " low-multiplicity k-mers, " << remaining
520 << " k-mers remaining" << std::endl;
521 }
522
523 EndState();
478524 SetState(m_data.isAdjacencyLoaded()
479525 ? NAS_ERODE : NAS_GEN_ADJ);
480526 break;
4141
4242 Install [Linuxbrew](http://linuxbrew.sh/), and run the command
4343
44 brew install brewsci/bio/abyss
44 brew install abyss
4545
4646 ## Install ABySS on macOS
4747
4848 Install [Homebrew](https://brew.sh/), and run the command
4949
50 brew install brewsci/bio/abyss
50 brew install abyss
5151
5252 ## Install ABySS on Windows
5353
5454 Install [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/) and [Linuxbrew](http://linuxbrew.sh/), and run the command
5555
56 brew install brewsci/bio/abyss
56 brew install abyss
5757
5858 ## Install ABySS on Debian or Ubuntu
5959
9696 - [ARCS](https://github.com/bcgsc/arcs) to scaffold
9797 - [Tigmint](https://github.com/bcgsc/tigmint) to correct assembly errors
9898
99 brew install brewsci/bio/arcs brewsci/bio/links
99 brew install brewsci/bio/arcs brewsci/bio/links-scaffolder
100100
101101 ## Optional dependencies
102102
187187 installed ABySS in `/opt/abyss`, add `/opt/abyss/bin` to your `PATH`:
188188
189189 PATH=/opt/abyss/bin:$PATH
190
191 Before starting an assembly
192 ===========================
193
194 ABySS stores temporary files in `TMPDIR`, which is `/tmp` by default on most systems. If your default temporary disk volume is too small, set `TMPDIR` to a larger volume, such as `/var/tmp` or your home directory.
195
196 export TMPDIR=/var/tmp
190197
191198 Assembling a paired-end library
192199 ===============================
653653 << "n200"
654654 << "nN50"
655655 << "min"
656 << "N80"
656 << "N75"
657657 << "N50"
658 << "N20"
658 << "N25"
659659 << "Esize"
660660 << "max"
661661 << "sum"
4545 #endif
4646
4747 using namespace std;
48 using Konnector::BloomFilter;
4849 #if USESEQAN
4950 using namespace seqan;
5051 #endif
109110 " -s, --search-mem=N mem limit for graph searches; multiply by the\n"
110111 " number of threads (-j) to get the total mem used\n"
111112 " for graph traversal [500M]\n"
113 " -g, --gap-file=FILE write sealed gaps to FILE\n"
112114 " -t, --trace-file=FILE write graph search stats to FILE\n"
113115 " -v, --verbose display verbose output\n"
114116 " --help display this help and exit\n"
203205 /** Output file for graph search stats */
204206 static string tracefilePath;
205207
208 /** Output file for sealed gaps */
209 static string gapfilePath;
210
206211 /** Mask bases not in flanks */
207212 static int mask = 0;
208213
235240 size_t skipped;
236241 };
237242
238 static const char shortopts[] = "S:L:b:B:d:ef:F:G:i:Ij:k:lm:M:no:P:q:r:s:t:v";
243 static const char shortopts[] = "S:L:b:B:d:ef:F:G:g:i:Ij:k:lm:M:no:P:q:r:s:t:v";
239244
240245 enum { OPT_HELP = 1, OPT_VERSION };
241246
272277 { "read-name", required_argument, NULL, 'r' },
273278 { "search-mem", required_argument, NULL, 's' },
274279 { "trace-file", required_argument, NULL, 't' },
280 { "gap-file", required_argument, NULL, 'g' },
275281 { "verbose", no_argument, NULL, 'v' },
276282 { "help", no_argument, NULL, OPT_HELP },
277283 { "version", no_argument, NULL, OPT_VERSION },
414420 string merge(const Graph& g,
415421 unsigned k,
416422 const Gap& gap,
417 FastaRecord &read1,
418 FastaRecord &read2,
423 const FastaRecord &read1,
424 const FastaRecord &read2,
419425 const ConnectPairsParams& params,
420426 Counters& g_count,
421427 ofstream& traceStream)
571577 map<FastaRecord, map<FastaRecord, Gap> > &flanks,
572578 unsigned &gapsclosed,
573579 ofstream &logStream,
574 ofstream &traceStream)
580 ofstream &traceStream,
581 ofstream &gapStream)
575582 {
576583 map<FastaRecord, map<FastaRecord, Gap> >::iterator read1_it;
577584 map<FastaRecord, Gap>::iterator read2_it;
578585 unsigned uniqueGapsClosed = 0;
579 bool success;
580586
581587 Counters g_count;
582588 g_count.noStartOrGoalKmer = 0;
596602
597603 printLog(logStream, "Flanks inserted into k run = " + IntToString(flanks.size()) + "\n");
598604
599 for (read1_it = flanks.begin(); read1_it != flanks.end();) {
600 success = false;
605 int counter = 0;
606 vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator> flanks_closed;
607 #pragma omp parallel private(read1_it, read2_it) firstprivate(counter)
608 for (read1_it = flanks.begin(); read1_it != flanks.end(); ++read1_it) {
601609 FastaRecord read1 = read1_it->first;
602 for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); read2_it++) {
610 bool success = false;
611 for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); ++read2_it, ++counter) {
612 #if _OPENMP
613 if (counter % omp_get_num_threads() != omp_get_thread_num())
614 continue;
615 #endif
603616 FastaRecord read2 = read2_it->first;
604617
605618 int startposition = read2_it->second.gapStart();
606619 string tempSeq = merge(g, k, read2_it->second, read1, read2, params, g_count, traceStream);
607620 if (!tempSeq.empty()) {
608621 success = true;
622 #pragma omp critical (allmerged)
609623 allmerged[read1.id.substr(0,read1.id.length()-2)][startposition]
610624 = ClosedGap(read2_it->second, tempSeq);
611 //#pragma omp atomic
612 gapsclosed++;
613 //#pragma omp atomic
614 uniqueGapsClosed++;
615 if (gapsclosed % 100 == 0)
625 #pragma omp atomic
626 ++uniqueGapsClosed;
627 #pragma omp critical (gapsclosed)
628 if (++gapsclosed % 100 == 0)
616629 printLog(logStream, IntToString(gapsclosed) + " gaps closed so far\n");
630
631 if (!opt::gapfilePath.empty())
632 #pragma omp critical (gapStream)
633 gapStream << ">" << read1.id.substr(0,read1.id.length()-2)
634 << "_" << read2_it->second.gapStart() << "-" << read2_it->second.gapEnd()
635 << " LN:i:" << tempSeq.length() << '\n'
636 << tempSeq << '\n';
617637 }
618638 }
619639 if (success) {
620 flanks.erase(read1_it++);
640 #pragma omp critical (flanks_closed)
641 flanks_closed.push_back(read1_it);
621642 }
622 else
623 read1_it++;
643 }
644
645 for (vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator>::iterator it = flanks_closed.begin();
646 it != flanks_closed.end(); ++it) {
647 if (flanks.count((*it)->first) > 0)
648 flanks.erase(*it);
624649 }
625650
626651 printLog(logStream, IntToString(uniqueGapsClosed) + " unique gaps closed for k" + IntToString(k) + "\n");
770795 opt::searchMem = SIToBytes(arg); break;
771796 case 't':
772797 arg >> opt::tracefilePath; break;
798 case 'g':
799 arg >> opt::gapfilePath; break;
773800 case 'v':
774801 opt::verbose++; break;
775802 case OPT_HELP:
871898 assert(traceStream.is_open());
872899 ConnectPairsResult::printHeaders(traceStream);
873900 assert_good(traceStream, opt::tracefilePath);
901 }
902
903 ofstream gapStream;
904 if (!opt::gapfilePath.empty()) {
905 gapStream.open(opt::gapfilePath.c_str());
906 assert(gapStream.is_open());
907 assert_good(gapStream, opt::gapfilePath);
874908 }
875909
876910 string logOutputPath(opt::outputPrefix);
10191053 temp = "Starting K run with k = " + IntToString(opt::k) + "\n";
10201054 printLog(logStream, temp);
10211055
1022 kRun(params, opt::k, g, allmerged, flanks, gapsclosed, logStream, traceStream);
1056 kRun(params, opt::k, g, allmerged, flanks, gapsclosed, logStream, traceStream, gapStream);
10231057
10241058 temp = "k" + IntToString(opt::k) + " run complete\n"
10251059 + "Total gaps closed so far = " + IntToString(gapsclosed) + "\n\n";
10721106 traceStream.close();
10731107 }
10741108
1109 if (!opt::gapfilePath.empty()) {
1110 assert_good(gapStream, opt::gapfilePath);
1111 gapStream.close();
1112 }
1113
10751114 return 0;
10761115 }
88 #include <iostream>
99
1010 using namespace std;
11 typedef RollingBloomDBG<BTL::BloomFilter> Graph;
11 typedef RollingBloomDBG<BloomFilter> Graph;
1212 typedef graph_traits<Graph> GraphTraits;
1313
1414 /* each vertex is represented by
5959 * GACTCGG
6060 */
6161
62 BTL::BloomFilter bloom1(bloomSize, numHashes, k);
62 BloomFilter bloom1(bloomSize, numHashes, k);
6363
6464 RollingHash("GACTC", numHashes, k).getHashes(hashes);
6565 bloom1.insert(hashes);
9494 * GACTCGG
9595 */
9696
97 BTL::BloomFilter bloom2(bloomSize, numHashes, k);
97 BloomFilter bloom2(bloomSize, numHashes, k);
9898
9999 RollingHash("GACTC", numHashes, k).getHashes(hashes);
100100 bloom2.insert(hashes);
130130 * ACTCG
131131 */
132132
133 BTL::BloomFilter bloom3(bloomSize, numHashes, k);
133 BloomFilter bloom3(bloomSize, numHashes, k);
134134
135135 RollingHash("TACTC", numHashes, k).getHashes(hashes);
136136 bloom2.insert(hashes);
77 using namespace std;
88 using namespace boost;
99
10 typedef RollingBloomDBG<BTL::BloomFilter> Graph;
10 typedef RollingBloomDBG<BloomFilter> Graph;
1111 typedef graph_traits<Graph> GraphTraits;
1212 typedef graph_traits<Graph>::vertex_descriptor V;
1313
1919 const unsigned m_k;
2020 const unsigned m_bloomSize;
2121 const unsigned m_numHashes;
22 BTL::BloomFilter m_bloom;
22 BloomFilter m_bloom;
2323 Graph m_graph;
2424
2525 RollingBloomDBGTest() : m_k(5), m_bloomSize(100000), m_numHashes(2),
148148 * CGACT-GACTC-ACTCG
149149 */
150150
151 BTL::BloomFilter bloom(m_bloomSize, m_numHashes, m_k);
151 BloomFilter bloom(m_bloomSize, m_numHashes, m_k);
152152 Graph graph(bloom);
153153
154154 const V CGACT("CGACT", RollingHash("CGACT", m_numHashes, m_k));
196196 const unsigned m_k;
197197 const unsigned m_bloomSize;
198198 const unsigned m_numHashes;
199 BTL::BloomFilter m_bloom;
199 BloomFilter m_bloom;
200200 Graph m_graph;
201201 const std::string m_spacedSeed;
202202
152152 ASSERT_EQ(rightKmerHash, middleKmerHash);
153153 }
154154
155 TEST_F(RollingHashTest, setBase)
155 TEST_F(RollingHashTest, setLastBase)
156156 {
157157 MaskedKmer::mask().clear();
158158
159159 char kmer1[] = "ACGT";
160 char kmer2[] = "ACCT";
160 char kmer2[] = "ACGA";
161 char kmer3[] = "GCGT";
161162
162163 RollingHash hash1(kmer1, m_numHashes, m_k);
163164 RollingHash hash2(kmer2, m_numHashes, m_k);
165 RollingHash hash3(kmer3, m_numHashes, m_k);
164166
165167 ASSERT_NE(hash2, hash1);
166 hash1.setBase(kmer1, 2, 'C');
167 ASSERT_EQ(0, strcmp(kmer1, kmer2));
168 hash1.setLastBase(kmer1, SENSE, 'A');
168169 ASSERT_EQ(hash2, hash1);
170
171 hash1.reset(kmer1);
172 ASSERT_NE(hash3, hash1);
173 hash1.setLastBase(kmer1, ANTISENSE, 'G');
174 ASSERT_EQ(hash3, hash1);
169175 }
170
171 TEST_F(RollingHashTest, setBaseMasked)
172 {
173 MaskedKmer::setMask("1101");
174
175 char kmer1[] = "ACGT";
176 char kmer2[] = "ACCT";
177
178 RollingHash hash1(kmer1, m_numHashes, m_k);
179 RollingHash hash2(kmer2, m_numHashes, m_k);
180
181 /* hashes should agree since mismatch is in masked position */
182 ASSERT_EQ(hash2, hash1);
183 ASSERT_NE(0, strcmp(kmer1, kmer2));
184
185 /* fix mismatch in masked position (hash values shouldn't change) */
186 hash1.setBase(kmer1, 2, 'C');
187 ASSERT_EQ(hash2, hash1);
188 ASSERT_EQ(0, strcmp(kmer1, kmer2));
189
190 /* create mismatch in unmasked position (hash value should now differ) */
191 hash1.setBase(kmer1, 1, 'G');
192 ASSERT_NE(hash2, hash1);
193 ASSERT_NE(0, strcmp(kmer1, kmer2));
194 }
88 #include <string>
99
1010 using namespace std;
11 using Konnector::BloomFilter;
1112
1213 TEST(BloomFilter, base)
1314 {
55 #include <string>
66
77 using namespace std;
8 using Konnector::BloomFilter;
89
910 /*
1011 * Tests for getStartKmerPos() function, which does
33
44 #include <gtest/gtest.h>
55 #include <string>
6
7 using Konnector::BloomFilter;
68
79 TEST(DBGBloom, BloomFilterPolymorphism)
810 {
33 #include <gtest/gtest.h>
44
55 using namespace std;
6 using Konnector::BloomFilter;
67
78 // workaround: opt::k must be defined because
89 // it is used by write_dot(..)
212212 BloomDBG/HashAgnosticCascadingBloomTest.cpp
213213 BloomDBG_HashAgnosticCascadingBloom_CXXFLAGS = $(AM_CXXFLAGS) \
214214 $(OPENMP_CXXFLAGS)
215 BloomDBG_HashAgnosticCascadingBloom_LDADD = \
216 $(top_builddir)/DataLayer/libdatalayer.a \
217 $(top_builddir)/Common/libcommon.a \
218 $(LDADD)
215219
216220 check_PROGRAMS += BloomDBG_RollingBloomDBG
217221 BloomDBG_RollingBloomDBG_SOURCES = BloomDBG/RollingBloomDBGTest.cpp
33 # Anthony Raymond <traymond@bcgsc.ca>.
44
55 SHELL=bash -e -o pipefail
6 ifneq ($(shell command -v zsh),)
6 ifeq ($(shell zsh -e -o pipefail -c 'true' 2>/dev/null; echo $$?), 0)
77 # Set pipefail to ensure that all commands of a pipe succeed.
88 SHELL=zsh -e -o pipefail
99 # Report run time and memory usage with zsh.
109109 endif
110110 endif
111111
112 ifndef preserve_path
112113 # Determine the path to the ABySS executables
113114 path?=$(shell dirname `command -v $(MAKEFILE_LIST)`)
114115 ifdef path
115116 PATH:=$(path):$(PATH)
117 endif
116118 endif
117119
118120 ifdef db
195197 ifdef c
196198 abyssopt += -c$c
197199 endif
200 ifdef kc
201 abyssopt += --kc=$(kc)
202 endif
198203 ifdef b
199204 abyssopt += -b$b
200205 pbopt += -b$b
215220 endif
216221 ifdef j
217222 abyssopt += -j$j
218 endif
219 ifdef kc
220 abyssopt += --kc=$(kc)
221223 endif
222224 ifdef x
223225 abyssopt += -s$x
392394 @echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users@bcgsc.ca.'
393395
394396 version:
395 @echo "abyss-pe (ABySS) 2.1.0"
397 @echo "abyss-pe (ABySS) 2.1.1"
396398 @echo "Written by Shaun Jackman and Anthony Raymond."
397399 @echo
398400 @echo "Copyright 2012 Canada's Michael Smith Genome Science Centre"
529531
530532 ifdef B
531533 %-1.fa:
532 abyss-bloom-dbg $(abyssopt) $(ABYSS_OPTIONS) $(in) $(se) > $@
534 $(gtime) abyss-bloom-dbg $(abyssopt) $(ABYSS_OPTIONS) $(in) $(se) > $@
533535 else ifdef K
534536
535537 ifdef np
536538 %-1.fa:
537 $(mpirun) -np $(np) abyss-paired-dbg-mpi $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa $(in) $(se)
539 $(gtime) $(mpirun) -np $(np) abyss-paired-dbg-mpi $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa $(in) $(se)
538540 else
539541 %-1.fa %-1.$g:
540 abyss-paired-dbg $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa -g $*-1.$g $(in) $(se)
542 $(gtime) abyss-paired-dbg $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa -g $*-1.$g $(in) $(se)
541543 endif
542544
543545 else ifdef np
544546 %-1.fa:
545 $(mpirun) -np $(np) ABYSS-P $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
547 $(gtime) $(mpirun) -np $(np) ABYSS-P $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
546548 else
547549 %-1.fa:
548 ABYSS $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
550 $(gtime) ABYSS $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
549551 endif
550552
551553 # Find overlapping contigs
552554
553555 %-1.$g: %-1.fa
554 AdjList $(alopt) --$g $< >$@
556 $(gtime) AdjList $(alopt) --$g $< >$@
555557
556558 # Remove shim contigs
557559
558560 %-2.$g1 %-1.path: %-1.$g %-1.fa
559 abyss-filtergraph $v --$g $(fgopt) $(FILTERGRAPH_OPTIONS) -k$k -g $*-2.$g1 $^ >$*-1.path
561 $(gtime) abyss-filtergraph $v --$g $(fgopt) $(FILTERGRAPH_OPTIONS) -k$k -g $*-2.$g1 $^ >$*-1.path
560562
561563 %-2.fa %-2.$g: %-1.fa %-2.$g1 %-1.path
562 MergeContigs --$g $(mcopt) -g $*-2.$g -o $*-2.fa $^
564 $(gtime) MergeContigs --$g $(mcopt) -g $*-2.$g -o $*-2.fa $^
563565
564566 # Pop bubbles
565567
566568 %-2.path %-3.$g: %-2.fa %-2.$g
567 PopBubbles $v --$g -j$j -k$k $(SS) $(pbopt) $(POPBUBBLES_OPTIONS) -g $*-3.$g $^ >$*-2.path
569 $(gtime) PopBubbles $v --$g -j$j -k$k $(SS) $(pbopt) $(POPBUBBLES_OPTIONS) -g $*-3.$g $^ >$*-2.path
568570
569571 %-3.fa: %-2.fa %-2.$g %-2.path
570 MergeContigs $(mcopt) -o $@ $^
572 $(gtime) MergeContigs $(mcopt) -o $@ $^
571573 awk '!/^>/ {x[">" $$1]=1; next} {getline s} $$1 in x {print $$0 "\n" s}' \
572574 $*-2.path $*-1.fa >$*-indel.fa
573575
580582 # Estimate distances between unitigs
581583
582584 %-3.sam.gz %-3.hist: $(name)-3.fa
583 $(align) $(mapopt) $(strip $($*)) $< \
585 $(gtime) $(align) $(mapopt) $(strip $($*)) $< \
584586 |$(fixmate) $(fmopt) -h $*-3.hist \
585587 |sort -snk3 -k4 \
586588 |$(gzip) >$*-3.sam.gz
587589
588590 %-3.bam %-3.hist: $(name)-3.fa
589 $(align) $(mapopt) $(strip $($*)) $< \
591 $(gtime) $(align) $(mapopt) $(strip $($*)) $< \
590592 |$(fixmate) $(fmopt) -h $*-3.hist \
591593 |sort -snk3 -k4 \
592594 |samtools view -Sb - -o $*-3.bam
593595
594596 %-3.dist: %-3.sam.gz %-3.hist
595597 gunzip -c $< \
598 |$(gtime) $(DistanceEst) $(deopt) -o $@ $*-3.hist
599
600 %-3.dist: %-3.bam %-3.hist
601 $(gtime) samtools view -h $< \
596602 |$(DistanceEst) $(deopt) -o $@ $*-3.hist
597603
598 %-3.dist: %-3.bam %-3.hist
599 samtools view -h $< \
600 |$(DistanceEst) $(deopt) -o $@ $*-3.hist
601
602604 %-3.dist: $(name)-3.fa
603 $(align) $(mapopt) $(strip $($*)) $< \
605 $(gtime) $(align) $(mapopt) $(strip $($*)) $< \
604606 |$(fixmate) $(fmopt) -h $*-3.hist \
605607 |sort -snk3 -k4 \
606608 |$(DistanceEst) $(deopt) -o $@ $*-3.hist
609611
610612 ifneq ($(name)-3.dist, $(dist))
611613 $(name)-3.dist: $(name)-3.fa $(dist)
612 abyss-todot $v --dist -e $^ >$@
614 $(gtime) abyss-todot $v --dist -e $^ >$@
613615
614616 $(name)-3.bam: $(addsuffix -3.bam, $(pe))
615 samtools merge -r $@ $^
617 $(gtime) samtools merge -r $@ $^
616618 endif
617619
618620 # Find overlaps between contigs
619621
620622 %-4.fa %-4.$g: %-3.fa %-3.$g %-3.dist
621 Overlap $v --$g $(SS) $(OVERLAP_OPTIONS) -k$k -g $*-4.$g -o $*-4.fa $^
623 $(gtime) Overlap $v --$g $(SS) $(OVERLAP_OPTIONS) -k$k -g $*-4.$g -o $*-4.fa $^
622624
623625 # Assemble contigs
624626
625627 %-4.path1: %-4.$g %-3.dist
626 SimpleGraph $v $(sgopt) $(SIMPLEGRAPH_OPTIONS) -j$j -k$k -o $@ $^
628 $(gtime) SimpleGraph $v $(sgopt) $(SIMPLEGRAPH_OPTIONS) -j$j -k$k -o $@ $^
627629
628630 %-4.path2: %-4.path1 %-3.fa.fai %-4.fa.fai
629631 cat $*-3.fa.fai $*-4.fa.fai \
630 |MergePaths $(mpopt) $(MERGEPATHS_OPTIONS) -o $@ - $<
632 |$(gtime) MergePaths $(mpopt) $(MERGEPATHS_OPTIONS) -o $@ - $<
631633
632634 %-4.path3: %-4.$g %-4.path2
633635 PathOverlap --assemble $(poopt) $(SS) $^ >$@
636638
637639 %-5.path %-5.fa %-5.$g: %-3.fa %-4.fa %-4.$g %-4.path3
638640 cat $(wordlist 1, 2, $^) \
639 |PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -o $*-5.path -s $*-5.fa -g $*-5.$g - $(wordlist 3, 4, $^)
641 |$(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -o $*-5.path -s $*-5.fa -g $*-5.$g - $(wordlist 3, 4, $^)
640642
641643 %-6.fa: %-3.fa %-4.fa %-5.fa %-5.$g %-5.path
642 cat $(wordlist 1, 3, $^) |MergeContigs $(mcopt) -o $@ - $(wordlist 4, 5, $^)
644 cat $(wordlist 1, 3, $^) |$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 4, 5, $^)
643645
644646 else
645647
648650 ln -sf $*-4.path3 $*-5.path
649651
650652 %-cs.fa: %-3.fa %-4.fa %-4.$g %-4.path3
651 cat $(wordlist 1, 2, $^) |MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
653 cat $(wordlist 1, 2, $^) |$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
652654
653655 # Convert colour-space sequence to nucleotides
654656
655657 %-6.fa: %-cs.fa
656 KAligner $v --seq -m -j$j -l$l $(in) $(se) $< \
658 $(gtime) KAligner $v --seq -m -j$j -l$l $(in) $(se) $< \
657659 |Consensus $v -o $@ $<
658660
659661 endif
660662
661663 %-6.$g: %-5.$g %-5.path
662 PathOverlap --overlap $(poopt) --$g $^ >$@
664 $(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@
663665
664666 %-contigs.fa: %-6.fa
665667 ln -sf $< $@
670672 # Estimate distances between contigs
671673
672674 %-6.sam.gz %-6.hist: $(name)-6.fa
673 $(align) $(mapopt) $(strip $($*)) $< \
675 $(gtime) $(align) $(mapopt) $(strip $($*)) $< \
674676 |$(fixmate) $(fmopt) -h $*-6.hist \
675677 |sort -snk3 -k4 \
676678 |$(gzip) >$*-6.sam.gz
677679
678680 %-6.bam %-6.hist: $(name)-6.fa
679 $(align) $(mapopt) $(strip $($*)) $< \
681 $(gtime) $(align) $(mapopt) $(strip $($*)) $< \
680682 |$(fixmate) $(fmopt) -h $*-6.hist \
681683 |sort -snk3 -k4 \
682684 |samtools view -Sb - -o $*-6.bam
683685
684686 %-6.dist.dot: %-6.sam.gz %-6.hist
685687 gunzip -c $< \
686 |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
688 |$(gtime) $(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
687689
688690 %-6.dist.dot: %-6.bam %-6.hist
689691 samtools view -h $< \
690 |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
692 |$(gtime) $(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
691693
692694 %-6.dist.dot: $(name)-6.fa
693 $(align) $(mapopt) $(strip $($*)) $< \
695 $(gtime) $(align) $(mapopt) $(strip $($*)) $< \
694696 |$(fixmate) $(fmopt) -h $*-6.hist \
695697 |sort -snk3 -k4 \
696698 |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
698700 # Scaffold
699701
700702 %-6.path: $(name)-6.$g $(addsuffix -6.dist.dot, $(mp))
701 abyss-scaffold $(scopt) -s$S -n$N -g $@.dot $(SCAFFOLD_OPTIONS) $^ >$@
703 $(gtime) abyss-scaffold $(scopt) -s$S -n$N -g $@.dot $(SCAFFOLD_OPTIONS) $^ >$@
702704
703705 %-7.path %-7.$g %-7.fa: %-6.fa %-6.$g %-6.path
704 PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-7.fa -g $*-7.$g -o $*-7.path $^
706 $(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-7.fa -g $*-7.$g -o $*-7.path $^
705707
706708 %-8.fa: %-6.fa %-7.fa %-7.$g %-7.path
707709 cat $(wordlist 1, 2, $^) \
708 |MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
710 |$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
709711
710712 %-8.$g: %-7.$g %-7.path
711 PathOverlap --overlap $(poopt) --$g $^ >$@
713 $(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@
712714
713715 # Scaffold using linked reads
714716 ifdef lr
716718 # Tigmint
717719
718720 # Options for mapping the reads to the draft assembly.
719 lr_l=$l
721 lr_l?=$l
720722 override lrmapopt=$v -j$j -l$(lr_l) $(LR_MAP_OPTIONS)
721723
722724 # Options for abyss-scaffold
723 lr_s=1000-100000
724 lr_n=5-20
725 lr_s?=1000-100000
726 lr_n?=5-20
725727
726728 # Minimum AS/Read length ratio
727 tigmint_as=0.65
729 tigmint_as?=0.65
728730
729731 # Maximum number of mismatches
730 tigmint_nm=5
732 tigmint_nm?=5
731733
732734 # Minimum mapping quality threshold
733 tigmint_mapq=0
735 tigmint_mapq?=0
734736
735737 # Maximum distance between reads to be considered the same molecule
736 tigmint_d=50000
738 tigmint_d?=50000
737739
738740 # Minimum number of spanning molecules
739 tigmint_n=10
741 tigmint_n?=10
740742
741743 # Size of the window that must be spanned by moecules
742 tigmint_w=1000
744 tigmint_w?=1000
743745
744746 # Align paired-end reads to the draft genome, sort by BX tag,
745747 # and create molecule extents BED.
746 %.lr.bed: %.fa
747 $(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
748 %.lr.bed: %.fa.fai
749 $(gtime) $(align) $(lrmapopt) $(lr_reads) $*.fa \
748750 | samtools sort -@$j -tBX -l0 -T$$(mktemp -u -t $@.XXXXXX) \
749751 | tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) - \
750752 | sort -k1,1 -k2,2n -k3,3n >$@
751753
752754 # Align paired-end reads to the draft genome and sort by BX tag.
753 %.lr.sortbx.bam: %.fa
754 $(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
755 %.lr.sortbx.bam: %.fa.fai
756 $(gtime) $(align) $(lrmapopt) $(lr_reads) $*.fa \
755757 | samtools sort -@$j -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@
756758
757759 # Filter the BAM file, create molecule extents BED.
764766 $(gtime) tigmint-cut -p$j -n$(tigmint_n) -w$(tigmint_w) -o $@ $*.fa $<
765767
766768 # ARCS
767 arcs_c=2
768 arcs_d=0
769 arcs_e=30000
770 arcs_l=0
771 arcs_m=4-20000
772 arcs_r=0.05
773 arcs_s=98
774 arcs_z=500
769 arcs_c?=2
770 arcs_d?=0
771 arcs_e?=30000
772 arcs_l?=0
773 arcs_m?=4-20000
774 arcs_r?=0.05
775 arcs_s?=98
776 arcs_z?=500
775777
776778 # Align reads and create a graph of linked contigs using ARCS.
777779 %.arcs.dist.gv: %.fa
789791 # Create a graph of linked contigs using ARCS.
790792 %.arcs.dist.gv: %.lr.sortn.sam.gz
791793 gunzip -c $< \
792 | arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
794 |$(gtime) arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
793795 -g $*.arcs.dist.gv --tsv=$*.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin
794796
795797 # Scaffold using ARCS and abyss-scaffold.
796798 %.arcs.path: %.arcs.dist.gv
797 abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@
799 $(gtime) abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@
798800
799801 # Create the FASTA file of ARCS scaffolds.
800802 %.arcs.fa: %.fa %.arcs.path
801 MergeContigs $(mcopt) -o $@ $^
803 $(gtime) MergeContigs $(mcopt) -o $@ $^
802804
803805 %-scaffolds.fa: %-8.tigmint.arcs.fa
804806 ln -sf $< $@
817819 sealer_ks?=-k90 -k80 -k70 -k60 -k50 -k40 -k30
818820
819821 %-8_scaffold.fa: %-8.fa
820 abyss-sealer -v -j$j --print-flanks -o$*-8 -S$< $(sealer_ks) $(SEALER_OPTIONS) $(in) $(se)
822 $(gtime) abyss-sealer -v -j$j --print-flanks -o$*-8 -S$< $(sealer_ks) $(SEALER_OPTIONS) $(in) $(se)
821823
822824 %-scaffolds-sealed.fa: %-8_scaffold.fa
823825 ln -s $< $@
833835 # Transcriptome assisted scaffolding
834836
835837 %.fa.bwt: %.fa
836 bwa index $<
838 $(gtime) bwa index $<
837839
838840 %-8.sam.gz: $(name)-8.fa.bwt
839 bwa mem -a -t$j -S -P -k$l $(name)-8.fa $(strip $($*)) \
841 $(gtime) bwa mem -a -t$j -S -P -k$l $(name)-8.fa $(strip $($*)) \
840842 |$(gzip) >$@
841843
842844 %-8.dist.dot: %-8.sam.gz
843 abyss-longseqdist -k$k $(LONGSEQDIST_OPTIONS) $< \
845 $(gtime) abyss-longseqdist -k$k $(LONGSEQDIST_OPTIONS) $< \
844846 |grep -v "l=" >$@
845847
846848 %-8.path: $(name)-8.$g $(addsuffix -8.dist.dot, $(long))
847 abyss-scaffold $(scopt) -s$S -n1 -g $@.$g $(SCAFFOLD_OPTIONS) $^ >$@
849 $(gtime) abyss-scaffold $(scopt) -s$S -n1 -g $@.$g $(SCAFFOLD_OPTIONS) $^ >$@
848850
849851 %-9.path %-9.$g %-9.fa: %-8.fa %-8.$g %-8.path
850 PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-9.fa -g $*-9.$g -o $*-9.path $^
852 $(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-9.fa -g $*-9.$g -o $*-9.path $^
851853
852854 %-10.fa: %-8.fa %-9.fa %-9.$g %-9.path
853855 cat $(wordlist 1, 2, $^) \
854 |MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
856 |$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^)
855857
856858 %-10.$g: %-9.$g %-9.path
857 PathOverlap --overlap $(poopt) --$g $^ >$@
859 $(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@
858860
859861 %-long-scaffs.fa: %-10.fa
860862 ln -sf $< $@
880882 endif
881883
882884 $(name)-unitigs.bam: %.bam: %.fa
883 $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) $(se) $< \
885 $(gtime) $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) $(se) $< \
884886 |samtools view -Su - |samtools sort -o - - >$@
885887
886888 $(name)-contigs.bam $(name)-scaffolds.bam: %.bam: %.fa
887 $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) \
889 $(gtime) $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) \
888890 $(call map, deref, $(sort $(lib) $(pe) $(mp))) $< \
889891 |$(fixmate) $v $(FIXMATE_OPTIONS) \
890892 |sort -snk3 -k4 \
893895 # Align the variants to the assembly
894896
895897 %-variants.bam: %.fa.bwt
896 bwa bwasw -t$j $*.fa <(cat $(name)-bubbles.fa $(name)-indel.fa) \
898 $(gtime) bwa bwasw -t$j $*.fa <(cat $(name)-bubbles.fa $(name)-indel.fa) \
897899 |samtools view -Su - |samtools sort -o - - >$@
898900
899901 %-variants.vcf.gz: %.fa %-variants.bam
900 samtools mpileup -Buf $^ |bcftools view -vp1 - |bgzip >$@
902 $(gtime) samtools mpileup -Buf $^ |bcftools view -vp1 - |bgzip >$@
901903
902904 %.gz.tbi: %.gz
903 tabix -pvcf $<
905 $(gtime) tabix -pvcf $<
904906
905907 # Calculate assembly contiguity statistics
906908
937939 # Create an AGP file and FASTA file of scaftigs from scaffolds
938940
939941 %.agp %-agp.fa: %.fa
940 abyss-fatoagp $(FATOAGP_OPTIONS) -f $*-agp.fa $< >$*.agp
942 $(gtime) abyss-fatoagp $(FATOAGP_OPTIONS) -f $*-agp.fa $< >$*.agp
941943
942944 # Align the contigs to the reference
943945
944946 %-$(ref).sam.gz: %.fa
945 bwa bwasw $(bwaswopt) $(BWASW_OPTIONS) $($(ref)) $< |$(gzip) >$@
947 $(gtime) bwa bwasw $(bwaswopt) $(BWASW_OPTIONS) $($(ref)) $< |$(gzip) >$@
946948
947949 # Find breakpoints in the alignments
948950
949951 %.break: %.sam.gz
950 abyss-samtobreak $(SAMTOBREAK_OPTIONS) $< >$@
952 $(gtime) abyss-samtobreak $(SAMTOBREAK_OPTIONS) $< >$@
951953
952954 # Report ABySS configuration variable(s) and value(s) currently set.
953955
00 AC_PREREQ(2.62)
1 AC_INIT(ABySS, 2.1.0, abyss-users@bcgsc.ca, abyss,
1 AC_INIT(ABySS, 2.1.1, abyss-users@bcgsc.ca, abyss,
22 http://www.bcgsc.ca/platform/bioinfo/software/abyss)
33 m4_include(m4/m4_ax_pthread.m4)
44 AM_INIT_AUTOMAKE(1.9.6 foreign subdir-objects)
309309 BloomDBG/Makefile
310310 DataBase/Makefile
311311 lib/bloomfilter/Makefile
312 lib/rolling-hash/Makefile
312 lib/nthash/Makefile
313313 ])
314314
315315 if test "$with_sparsehash" != "no" -a "$ac_cv_header_google_sparse_hash_map" != "yes"; then
0 .TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.0" "User Commands"
0 .TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.1" "User Commands"
11 .SH NAME
22 ABYSS \- assemble short reads into contigs
33 .SH SYNOPSIS
0 .TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.0" "User Commands"
0 .TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.1" "User Commands"
11 .SH NAME
22 abyss-pe - assemble reads into contigs
33 .SH SYNOPSIS
0 .TH abyss-tofastq "1" "2015-May" "ABySS 2.1.0" "User Commands"
0 .TH abyss-tofastq "1" "2015-May" "ABySS 2.1.1" "User Commands"
11 .SH NAME
22 abyss-tofastq \- convert various file formats to FASTQ format
33 .br
1919 #include <cstdlib>
2020 #include <stdio.h>
2121 #include <cstring>
22 #include "lib/rolling-hash/rolling.h"
2322
2423 using namespace std;
2524
3231 >> (((0x4332322132212110 >> ((x & 0xF) << 2)) & 0xF) << 2))
3332 >> ((0x4332322132212110 >> (((x & 0xF0) >> 2)) & 0xF) << 2)) & 0xf;
3433 }
35
36 /* To avoid name collision with konnector `BloomFilter` class */
37
38 namespace BTL {
3934
4035 class BloomFilter {
4136 public:
9893 loadFilter(filterFilePath);
9994 }
10095
101 void loadHeader(FILE *file) {
102
103 FileHeader header;
104 if (fread(&header, sizeof(struct FileHeader), 1, file) != 1) {
105 cerr << "Failed to read Bloom filter file header" << endl;
106 }
107 char magic[9];
108 strncpy(magic, header.magic, 8);
109 magic[8] = '\0';
110
111 // cerr << "Loading header... magic: " <<
112 // magic << " hlen: " <<
113 // header.hlen << " size: " <<
114 // header.size << " nhash: " <<
115 // header.nhash << " kmer: " <<
116 // header.kmer << " dFPR: " <<
117 // header.dFPR << " aFPR: " <<
118 // header.aFPR << " rFPR: " <<
119 // header.rFPR << " nEntry: " <<
120 // header.nEntry << " tEntry: " <<
121 // header.tEntry << endl;
122
123 m_size = header.size;
124 initSize(m_size);
125 m_hashNum = header.nhash;
126 m_kmerSize = header.kmer;
127 }
128
12996 void loadFilter(const string &filterFilePath)
13097 {
13198 FILE *file = fopen(filterFilePath.c_str(), "rb");
13299 if (file == NULL) {
133100 cerr << "file \"" << filterFilePath << "\" could not be read."
134 << endl;
101 << endl;
135102 exit(1);
136103 }
137104
143110 fseek(file, lCurPos, 0);
144111 if (fileSize != m_sizeInBytes) {
145112 cerr << "Error: " << filterFilePath
146 << " does not match size given by its information file. Size: "
147 << fileSize << " vs " << m_sizeInBytes << " bytes." << endl;
113 << " does not match size given by its header. Size: "
114 << fileSize << " vs " << m_sizeInBytes << " bytes." << endl;
148115 exit(1);
149116 }
150117
151118 size_t countRead = fread(m_filter, fileSize, 1, file);
152119 if (countRead != 1 && fclose(file) != 0) {
153120 cerr << "file \"" << filterFilePath << "\" could not be read."
154 << endl;
121 << endl;
155122 exit(1);
156123 }
124 }
125
126 void loadHeader(FILE *file) {
127
128 FileHeader header;
129 if (fread(&header, sizeof(struct FileHeader), 1, file) != 1) {
130 cerr << "Failed to header" << endl;
131 }
132 char magic[9];
133 strncpy(magic, header.magic, 8);
134 magic[8] = '\0';
135
136 m_size = header.size;
137 initSize(m_size);
138 m_hashNum = header.nhash;
139 m_kmerSize = header.kmer;
157140 }
158141
159142 /*
182165 }
183166 }
184167
185 void insert(const char* kmer) {
186 uint64_t hVal = getChval(kmer, m_kmerSize);
187 for (unsigned i = 0; i < m_hashNum; i++) {
188 size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
189 __sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar],
190 bitMask[normalizedValue % bitsPerChar]);
191 }
192 }
193
194 /*
168 /*
169 * Accepts a list of precomputed hash values. Faster than rehashing each time.
195170 * Returns if already inserted
196171 */
197 bool insertAndCheck(const char* kmer) {
198 uint64_t hVal = getChval(kmer, m_kmerSize);
172 bool insertAndCheck(const size_t precomputed[]) {
173 //iterates through hashed values adding it to the filter
199174 bool found = true;
200 for (unsigned i = 0; i < m_hashNum; i++) {
201 size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
202 found &= __sync_or_and_fetch(
175 for (size_t i = 0; i < m_hashNum; ++i) {
176 size_t normalizedValue = precomputed[i] % m_size;
177 found &= __sync_fetch_and_or(
203178 &m_filter[normalizedValue / bitsPerChar],
204 bitMask[normalizedValue % bitsPerChar]);
179 bitMask[normalizedValue % bitsPerChar])
180 >> (normalizedValue % bitsPerChar) & 1;
205181 }
206182 return found;
207183 }
215191 bool found = true;
216192 for (size_t i = 0; i < m_hashNum; ++i) {
217193 size_t normalizedValue = precomputed.at(i) % m_size;
218 found &= __sync_or_and_fetch(
194 found &= __sync_fetch_and_or(
219195 &m_filter[normalizedValue / bitsPerChar],
220 bitMask[normalizedValue % bitsPerChar]);
196 bitMask[normalizedValue % bitsPerChar])
197 >> (normalizedValue % bitsPerChar) & 1;
221198 }
222199 return found;
223200 }
250227 return true;
251228 }
252229
253 /*
254 * Single pass filtering, computes hash values on the fly
255 */
256 bool contains(const char* kmer) const {
257 uint64_t hVal = getChval(kmer, m_kmerSize);
258 for (unsigned i = 0; i < m_hashNum; i++) {
259 size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
260 unsigned char bit = bitMask[normalizedValue % bitsPerChar];
261 if ((m_filter[normalizedValue / bitsPerChar] & bit) == 0)
262 return false;
263 }
264 return true;
265 }
266
267 void writeHeader(ostream &out) const {
230 void writeHeader(std::ostream& out) const {
268231 FileHeader header;
269232 strncpy(header.magic, "BlOOMFXX", 8);
270233 char magic[9];
279242 header.nEntry = m_nEntry;
280243 header.tEntry = m_tEntry;
281244
282 // cerr << "Writing header... magic: "
283 // << magic << " hlen: "
284 // << header.hlen << " size: "
285 // << header.size << " nhash: "
286 // << header.nhash << " kmer: "
287 // << header.kmer << " dFPR: "
288 // << header.dFPR << " aFPR: "
289 // << header.aFPR << " rFPR: "
290 // << header.rFPR << " nEntry: "
291 // << header.nEntry << " tEntry: "
292 // << header.tEntry << endl;
293
294245 out.write(reinterpret_cast<char*>(&header), sizeof(struct FileHeader));
295246 assert(out);
296247 }
297248
298249 /** Serialize the Bloom filter to a stream */
299 // void storeFilter(std::ostream& out) const
300250 friend std::ostream& operator<<(std::ostream& out, const BloomFilter& o)
301251 {
302252 assert(out);
318268 void storeFilter(string const &filterFilePath) const {
319269 ofstream myFile(filterFilePath.c_str(), ios::out | ios::binary);
320270
321 cerr << "Storing filter. Filter is " << m_sizeInBytes << "bytes."
271 cerr << "Storing filter. Filter is " << m_sizeInBytes << " bytes."
322272 << endl;
323273
324274 myFile << *this;
325275 myFile.close();
276 assert(myFile);
326277 }
327278
328279 size_t getPop() const {
329280 size_t i, popBF = 0;
330 #pragma omp parallel for reduction(+:popBF)
281 //#pragma omp parallel for reduction(+:popBF)
331282 for (i = 0; i < (m_size + 7) / 8; i++)
332283 popBF = popBF + popCnt(m_filter[i]);
333284 return popBF;
340291 unsigned getKmerSize() const {
341292 return m_kmerSize;
342293 }
343
344 // void setdFPR(double value) {
345 // m_dFPR = value;
346 // }
347294
348295 /*
349296 * Calculates that False positive rate that a redundant entry is actually
362309 * Return FPR based on popcount
363310 */
364311 double getFPR() const {
365 return pow(double(getPop())/double(m_size), m_hashNum);
312 return pow(double(getPop())/double(m_size), double(m_hashNum));
366313 }
367314
368315 /*
396343 ~BloomFilter() {
397344 delete[] m_filter;
398345 }
399 private:
346 protected:
400347 BloomFilter(const BloomFilter& that); //to prevent copy construction
401348
402349 /*
450397 * Calculates the optimal FPR to use based on hash functions
451398 */
452399 double calcFPR_hashNum(unsigned hashFunctNum) const {
453 return pow(2, -hashFunctNum);
400 return pow(2, -double(hashFunctNum));
454401 }
455402
456403 uint8_t* m_filter;
463410 uint64_t m_tEntry;
464411 };
465412
466 } // end namespace 'BTL'
467
468413 #endif /* BLOOMFILTER_H_ */
00 These files come from:
11
22 * https://github.com/bcgsc/bloomfilter
3 * commit f1232c2
4 * modifications were made to BloomFilter.h (TODO: merge back to source repo)
3 * commit 2e5e9d4
0 EXTRA_DIST = README.md
0 nthash.hpp is taken from https://github.com/bcgsc/ntHash.git, commit 07e3f4d
0 /*
1 *
2 * nthash.hpp
3 * Author: Hamid Mohamadi
4 * Genome Sciences Centre,
5 * British Columbia Cancer Agency
6 */
7
8 #ifndef NT_HASH_H
9 #define NT_HASH_H
10
11 #include <stdint.h>
12
13 // offset for the complement base in the random seeds table
14 const uint8_t cpOff = 0x07;
15
16 // shift for gerenerating multiple hash values
17 const int multiShift = 27;
18
19 // seed for gerenerating multiple hash values
20 static const uint64_t multiSeed = 0x90b45d39fb6da1fa;
21
22 // 64-bit random seeds corresponding to bases and their complements
23 static const uint64_t seedA = 0x3c8bfbb395c60474;
24 static const uint64_t seedC = 0x3193c18562a02b4c;
25 static const uint64_t seedG = 0x20323ed082572324;
26 static const uint64_t seedT = 0x295549f54be24456;
27 static const uint64_t seedN = 0x0000000000000000;
28
29 static const uint64_t seedTab[256] = {
30 seedN, seedT, seedN, seedG, seedA, seedN, seedN, seedC, // 0..7
31 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 8..15
32 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 16..23
33 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 24..31
34 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 32..39
35 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 40..47
36 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 48..55
37 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 56..63
38 seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 64..71
39 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 72..79
40 seedN, seedN, seedN, seedN, seedT, seedN, seedN, seedN, // 80..87
41 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 88..95
42 seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 96..103
43 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 104..111
44 seedN, seedN, seedN, seedN, seedT, seedN, seedN, seedN, // 112..119
45 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 120..127
46 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 128..135
47 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 136..143
48 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 144..151
49 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 152..159
50 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 160..167
51 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 168..175
52 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 176..183
53 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 184..191
54 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 192..199
55 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 200..207
56 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 208..215
57 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 216..223
58 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 224..231
59 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 232..239
60 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 240..247
61 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN // 248..255
62 };
63
64 // rotate "v" to the left 1 position
65 inline uint64_t rol1(const uint64_t v) {
66 return (v << 1) | (v >> 63);
67 }
68
69 // rotate "v" to the right by 1 position
70 inline uint64_t ror1(const uint64_t v) {
71 return (v >> 1) | (v << 63);
72 }
73
74 // rotate 31-left bits of "v" to the left by "s" positions
75 inline uint64_t rol31(const uint64_t v, unsigned s) {
76 s%=31;
77 return ((v << s) | (v >> (31 - s))) & 0x7FFFFFFF;
78 }
79
80 // rotate 33-right bits of "v" to the left by "s" positions
81 inline uint64_t rol33(const uint64_t v, unsigned s) {
82 s%=33;
83 return ((v << s) | (v >> (33 - s))) & 0x1FFFFFFFF;
84 }
85
86 // swap bit 0 with bit 33 in "v"
87 inline uint64_t swapbits033(const uint64_t v) {
88 uint64_t x = (v ^ (v >> 33)) & 1;
89 return v ^ (x | (x << 33));
90 }
91
92 // swap bit 32 with bit 63 in "v"
93 inline uint64_t swapbits3263(const uint64_t v) {
94 uint64_t x = ((v >> 32) ^ (v >> 63)) & 1;
95 return v ^ ((x << 32) | (x << 63));
96 }
97
98 // forward-strand hash value of the base kmer, i.e. fhval(kmer_0)
99 inline uint64_t NTF64(const char * kmerSeq, const unsigned k) {
100 uint64_t hVal=0;
101 for(unsigned i=0; i<k; i++) {
102 hVal = rol1(hVal);
103 hVal = swapbits033(hVal);
104 hVal ^= seedTab[(unsigned char)kmerSeq[i]];
105 }
106 return hVal;
107 }
108
109 // reverse-strand hash value of the base kmer, i.e. rhval(kmer_0)
110 inline uint64_t NTR64(const char * kmerSeq, const unsigned k) {
111 uint64_t hVal=0;
112 for(unsigned i=0; i<k; i++) {
113 hVal = rol1(hVal);
114 hVal = swapbits033(hVal);
115 hVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]&cpOff];
116 }
117 return hVal;
118 }
119
120 // forward-strand ntHash for sliding k-mers
121 inline uint64_t NTF64(const uint64_t fhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) {
122 uint64_t hVal = rol1(fhVal);
123 hVal = swapbits033(hVal);
124 hVal ^= seedTab[charIn];
125 uint64_t lBits = seedTab[charOut] >> 33;
126 uint64_t rBits = seedTab[charOut] & 0x1FFFFFFFF;
127 uint64_t sOut = (rol31(lBits,k) << 33) | (rol33(rBits,k));
128 hVal ^= sOut;
129 return hVal;
130 }
131
132 // reverse-complement ntHash for sliding k-mers
133 inline uint64_t NTR64(const uint64_t rhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) {
134 uint64_t lBits = seedTab[charIn&cpOff] >> 33;
135 uint64_t rBits = seedTab[charIn&cpOff] & 0x1FFFFFFFF;
136 uint64_t sIn = (rol31(lBits,k) << 33) | (rol33(rBits,k));
137 uint64_t hVal = rhVal ^ sIn;
138 hVal ^= seedTab[charOut&cpOff];
139 hVal = ror1(hVal);
140 hVal = swapbits3263(hVal);
141 return hVal;
142 }
143
144 // canonical ntBase
145 inline uint64_t NTC64(const char * kmerSeq, const unsigned k) {
146 uint64_t fhVal=0, rhVal=0;
147 fhVal=NTF64(kmerSeq, k);
148 rhVal=NTR64(kmerSeq, k);
149 return (rhVal<fhVal)? rhVal : fhVal;
150 }
151
152 // canonical ntHash
153 inline uint64_t NTC64(const char * kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
154 fhVal = NTF64(kmerSeq, k);
155 rhVal = NTR64(kmerSeq, k);
156 return (rhVal<fhVal)? rhVal : fhVal;
157 }
158
159 // canonical ntHash for sliding k-mers
160 inline uint64_t NTC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
161 fhVal = NTF64(fhVal, k, charOut, charIn);
162 rhVal = NTR64(rhVal, k, charOut, charIn);
163 return (rhVal<fhVal)? rhVal : fhVal;
164 }
165
166 // forward-strand ntHash for sliding k-mers to the left
167 inline uint64_t NTF64L(const uint64_t rhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) {
168 uint64_t lBits = seedTab[charIn] >> 33;
169 uint64_t rBits = seedTab[charIn] & 0x1FFFFFFFF;
170 uint64_t sIn = (rol31(lBits,k) << 33) | (rol33(rBits,k));
171 uint64_t hVal = rhVal ^ sIn;
172 hVal ^= seedTab[charOut];
173 hVal = ror1(hVal);
174 hVal = swapbits3263(hVal);
175 return hVal;
176 }
177
178 // reverse-complement ntHash for sliding k-mers to the left
179 inline uint64_t NTR64L(const uint64_t fhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) {
180 uint64_t hVal = rol1(fhVal);
181 hVal = swapbits033(hVal);
182 hVal ^= seedTab[charIn&cpOff];
183 uint64_t lBits = seedTab[charOut&cpOff] >> 33;
184 uint64_t rBits = seedTab[charOut&cpOff] & 0x1FFFFFFFF;
185 uint64_t sOut = (rol31(lBits,k) << 33) | (rol33(rBits,k));
186 hVal ^= sOut;
187 return hVal;
188 }
189
190 // canonical ntHash for sliding k-mers to the left
191 inline uint64_t NTC64L(const unsigned char charOut, const unsigned char charIn, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
192 fhVal = NTF64L(fhVal, k, charOut, charIn);
193 rhVal = NTR64L(rhVal, k, charOut, charIn);
194 return (rhVal<fhVal)? rhVal : fhVal;
195 }
196
197 // ntBase with seeding option
198 inline uint64_t NTF64(const char * kmerSeq, const unsigned k, const unsigned seed) {
199 uint64_t hVal=NTF64(kmerSeq, k);
200 if(seed==0) return hVal;
201 hVal *= seed ^ k * multiSeed;
202 hVal ^= hVal >> multiShift;
203 return hVal;
204 }
205
206 // canonical ntBase with seeding option
207 inline uint64_t NTC64(const char * kmerSeq, const unsigned k, const unsigned seed) {
208 uint64_t hVal = NTC64(kmerSeq,k);
209 if(seed==0) return hVal;
210 hVal *= seed ^ k * multiSeed;
211 hVal ^= hVal >> multiShift;
212 return hVal;
213 }
214
215 // multihash ntHash, ntBase
216 inline void NTM64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t *hVal) {
217 uint64_t bVal=0, tVal=0;
218 bVal = NTF64(kmerSeq, k);
219 hVal[0] = bVal;
220 for(unsigned i=1; i<m; i++) {
221 tVal = bVal * (i ^ k * multiSeed);
222 tVal ^= tVal >> multiShift;
223 hVal[i] = tVal;
224 }
225 }
226
227 // one extra hash for given base hash
228 inline uint64_t NTE64(const uint64_t hVal, const unsigned k, const unsigned i) {
229 uint64_t tVal = hVal;
230 tVal *= (i ^ k * multiSeed);
231 tVal ^= tVal >> multiShift;
232 return tVal;
233 }
234
235 // multihash ntHash for sliding k-mers
236 inline void NTM64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t *hVal) {
237 uint64_t bVal=0, tVal=0;
238 bVal = NTF64(hVal[0], k, charOut, charIn);
239 hVal[0] = bVal;
240 for(unsigned i=1; i<m; i++) {
241 tVal = bVal * (i ^ k * multiSeed);
242 tVal ^= tVal >> multiShift;
243 hVal[i] = tVal;
244 }
245 }
246
247 // canonical multihash ntBase
248 inline void NTMC64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t *hVal) {
249 uint64_t bVal=0, tVal=0;
250 bVal = NTC64(kmerSeq, k);
251 hVal[0] = bVal;
252 for(unsigned i=1; i<m; i++) {
253 tVal = bVal * (i ^ k * multiSeed);
254 tVal ^= tVal >> multiShift;
255 hVal[i] = tVal;
256 }
257 }
258
259 // canonical multihash ntHash
260 inline void NTMC64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal) {
261 uint64_t bVal=0, tVal=0;
262 bVal = NTC64(kmerSeq, k, fhVal, rhVal);
263 hVal[0] = bVal;
264 for(unsigned i=1; i<m; i++) {
265 tVal = bVal * (i ^ k * multiSeed);
266 tVal ^= tVal >> multiShift;
267 hVal[i] = tVal;
268 }
269 }
270
271 // canonical multihash ntHash for sliding k-mers
272 inline void NTMC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal) {
273 uint64_t bVal=0, tVal=0;
274 bVal = NTC64(charOut, charIn, k, fhVal, rhVal);
275 hVal[0] = bVal;
276 for(unsigned i=1; i<m; i++) {
277 tVal = bVal * (i ^ k * multiSeed);
278 tVal ^= tVal >> multiShift;
279 hVal[i] = tVal;
280 }
281 }
282
283 /*
284 * ignoring k-mers containing nonACGT using ntHash function
285 */
286
287 // canonical ntBase
288 inline bool NTC4(const char *kmerSeq, const unsigned k, uint64_t& hVal, unsigned& locN) {
289 hVal=0;
290 locN=0;
291 uint64_t fhVal=0,rhVal=0;
292 for(int i=k-1; i>=0; i--) {
293 if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
294 locN=i;
295 return false;
296 }
297 fhVal = rol1(fhVal);
298 fhVal = swapbits033(fhVal);
299 fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
300
301 rhVal = rol1(rhVal);
302 rhVal = swapbits033(rhVal);
303 rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
304 }
305 hVal = (rhVal<fhVal)? rhVal : fhVal;
306 return true;
307 }
308
309 // canonical multihash ntBase
310 inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, unsigned& locN, uint64_t* hVal) {
311 uint64_t bVal=0, tVal=0, fhVal=0, rhVal=0;
312 locN=0;
313 for(int i=k-1; i>=0; i--) {
314 if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
315 locN=i;
316 return false;
317 }
318 fhVal = rol1(fhVal);
319 fhVal = swapbits033(fhVal);
320 fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
321
322 rhVal = rol1(rhVal);
323 rhVal = swapbits033(rhVal);
324 rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
325 }
326 bVal = (rhVal<fhVal)? rhVal : fhVal;
327 hVal[0] = bVal;
328 for(unsigned i=1; i<m; i++) {
329 tVal = bVal * (i ^ k * multiSeed);
330 tVal ^= tVal >> multiShift;
331 hVal[i] = tVal;
332 }
333 return true;
334 }
335
336 // canonical ntHash
337 inline bool NTC64(const char *kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal, uint64_t& hVal, unsigned& locN) {
338 hVal=fhVal=rhVal=0;
339 locN=0;
340 for(int i=k-1; i>=0; i--) {
341 if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
342 locN=i;
343 return false;
344 }
345 fhVal = rol1(fhVal);
346 fhVal = swapbits033(fhVal);
347 fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
348
349 rhVal = rol1(rhVal);
350 rhVal = swapbits033(rhVal);
351 rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
352 }
353 hVal = (rhVal<fhVal)? rhVal : fhVal;
354 return true;
355 }
356
357 // canonical multihash ntHash
358 inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, unsigned& locN, uint64_t* hVal) {
359 fhVal=rhVal=0;
360 uint64_t bVal=0, tVal=0;
361 locN=0;
362 for(int i=k-1; i>=0; i--) {
363 if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
364 locN=i;
365 return false;
366 }
367 fhVal = rol1(fhVal);
368 fhVal = swapbits033(fhVal);
369 fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
370
371 rhVal = rol1(rhVal);
372 rhVal = swapbits033(rhVal);
373 rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
374 }
375 bVal = (rhVal<fhVal)? rhVal : fhVal;
376 hVal[0] = bVal;
377 for(unsigned i=1; i<m; i++) {
378 tVal = bVal * (i ^ k * multiSeed);
379 tVal ^= tVal >> multiShift;
380 hVal[i] = tVal;
381 }
382 return true;
383 }
384
385 // strand-aware canonical multihash ntHash
386 inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, unsigned& locN, uint64_t* hVal, bool& hStn) {
387 fhVal=rhVal=0;
388 uint64_t bVal=0, tVal=0;
389 locN=0;
390 for(int i=k-1; i>=0; i--) {
391 if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
392 locN=i;
393 return false;
394 }
395 fhVal = rol1(fhVal);
396 fhVal = swapbits033(fhVal);
397 fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];
398
399 rhVal = rol1(rhVal);
400 rhVal = swapbits033(rhVal);
401 rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
402 }
403 hStn = rhVal<fhVal;
404 bVal = hStn? rhVal : fhVal;
405 hVal[0] = bVal;
406 for(unsigned i=1; i<m; i++) {
407 tVal = bVal * (i ^ k * multiSeed);
408 tVal ^= tVal >> multiShift;
409 hVal[i] = tVal;
410 }
411 return true;
412 }
413
414 // starnd-aware canonical multihash ntHash for sliding k-mers
415 inline void NTMC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal, bool &hStn) {
416 uint64_t bVal=0, tVal=0;
417 bVal = NTC64(charOut, charIn, k, fhVal, rhVal);
418 hStn = rhVal<fhVal;
419 hVal[0] = bVal;
420 for(unsigned i=1; i<m; i++) {
421 tVal = bVal * (i ^ k * multiSeed);
422 tVal ^= tVal >> multiShift;
423 hVal[i] = tVal;
424 }
425 }
426
427 // masking canonical ntHash using spaced seed pattern
428 inline uint64_t maskHash(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
429 uint64_t fsVal=fkVal, rsVal=rkVal;
430 for(unsigned i=0; i<k; i++) {
431 if(seedSeq[i]!='1') {
432 uint64_t lfBits = seedTab[(unsigned char)kmerSeq[i]] >> 33;
433 uint64_t rfBits = seedTab[(unsigned char)kmerSeq[i]] & 0x1FFFFFFFF;
434 uint64_t sfMask = (rol31(lfBits,k-1-i) << 33) | (rol33(rfBits,k-1-i));
435 fsVal ^= sfMask;
436
437 uint64_t lrBits = seedTab[(unsigned char)kmerSeq[i]&cpOff] >> 33;
438 uint64_t rrBits = seedTab[(unsigned char)kmerSeq[i]&cpOff] & 0x1FFFFFFFF;
439 uint64_t srMask = (rol31(lrBits,i) << 33) | (rol33(rrBits,i));
440 rsVal ^= srMask;
441 }
442 }
443 return (rsVal<fsVal)? rsVal : fsVal;
444 }
445
446 #endif
+0
-1
lib/rolling-hash/Makefile.am less more
0 EXTRA_DIST = README.md
+0
-2
lib/rolling-hash/README.md less more
0 * source repo: https://github.com/bcgsc/ntHash
1 * git commit: 9f107de
+0
-316
lib/rolling-hash/rolling.h less more
0 #ifndef ROLLING_HASH_H
1 #define ROLLING_HASH_H
2
3 #include <stdint.h>
4
5 // offset for the complement base in the random seeds table
6 const int cpOff = -20;
7
8 // shift for gerenerating multiple hash values
9 const int varShift = 27;
10
11 // seed for gerenerating multiple hash values
12 const uint64_t varSeed = 10427061540882326010ul;
13
14 // 64-bit random seed table corresponding to bases and their complements
15 static const uint64_t seedTab[256] = {
16 0, 0, 0, 0, 0, 0, 0, 0, // 0..7
17 0, 0, 0, 0, 0, 0, 0, 0, // 8..15
18 0, 0, 0, 0, 0, 0, 0, 0, // 16..23
19 0, 0, 0, 0, 0, 0, 0, 0, // 24..31
20 0, 0, 0, 0, 0, 0, 0, 0, // 32..39
21 0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 40..47
22 0, 0, 0, 3572411708064410444ul, 0, 0, 0, 0, // 48..55
23 0, 0, 0, 0, 0, 0, 0, 0, // 56..63
24 4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 64..71
25 0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 72..79
26 0, 0, 0, 3572411708064410444ul, 2978368046464386134ul, 0, 0, 0, // 80..87
27 0, 0, 0, 0, 0, 0, 0, 0, // 88..95
28 4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 96..103
29 0, 0, 0, 0, 0, 0, 0, 0, // 104..111
30 0, 0, 0, 0, 2978368046464386134ul, 0, 0, 0, // 112..119
31 0, 0, 0, 0, 0, 0, 0, 0, // 120..127
32 0, 0, 0, 0, 0, 0, 0, 0, // 128..135
33 0, 0, 0, 0, 0, 0, 0, 0, // 136..143
34 0, 0, 0, 0, 0, 0, 0, 0, // 144..151
35 0, 0, 0, 0, 0, 0, 0, 0, // 152..159
36 0, 0, 0, 0, 0, 0, 0, 0, // 160..167
37 0, 0, 0, 0, 0, 0, 0, 0, // 168..175
38 0, 0, 0, 0, 0, 0, 0, 0, // 176..183
39 0, 0, 0, 0, 0, 0, 0, 0, // 184..191
40 0, 0, 0, 0, 0, 0, 0, 0, // 192..199
41 0, 0, 0, 0, 0, 0, 0, 0, // 200..207
42 0, 0, 0, 0, 0, 0, 0, 0, // 208..215
43 0, 0, 0, 0, 0, 0, 0, 0, // 216..223
44 0, 0, 0, 0, 0, 0, 0, 0, // 224..231
45 0, 0, 0, 0, 0, 0, 0, 0, // 232..239
46 0, 0, 0, 0, 0, 0, 0, 0, // 240..247
47 0, 0, 0, 0, 0, 0, 0, 0 // 248..255
48 };
49
50 // rotate "v" to the left by "s" positions
51 inline uint64_t rol(const uint64_t v, const int s) {
52 return (v << s) | (v >> (64 - s));
53 }
54
55 // rotate "v" to the right by "s" positions
56 inline uint64_t ror(const uint64_t v, const int s) {
57 return (v >> s) | (v << (64 - s));
58 }
59
60 // forward-strand hash value of the base kmer, i.e. fhval(kmer_0)
61 inline uint64_t getFhval(const char * kmerSeq, const unsigned k) {
62 uint64_t hVal=0;
63 for(unsigned i=0; i<k; i++)
64 hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
65 return hVal;
66 }
67
68 // reverse-strand hash value of the base kmer, i.e. rhval(kmer_0)
69 inline uint64_t getRhval(const char * kmerSeq, const unsigned k) {
70 uint64_t hVal=0;
71 for(unsigned i=0; i<k; i++)
72 hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
73 return hVal;
74 }
75
76 // cannonical hash value of the base kmer, i.e. rhval(kmer_0)
77 inline uint64_t getChval(const char * kmerSeq, const unsigned k) {
78 uint64_t fhVal = getFhval(kmerSeq, k);
79 uint64_t rhVal = getRhval(kmerSeq, k);
80 return (rhVal<fhVal)? rhVal : fhVal;
81 }
82
83 // initialize forward-strand hash value of the first kmer, i.e. fhval(kmer_0)
84 inline uint64_t initHashes(const char * kmerSeq, const unsigned k) {
85 return getFhval(kmerSeq, k);
86 }
87
88 // initialize cannonical hash value of the first kmer, i.e. chval(kmer_0)
89 inline uint64_t initHashes(const char * kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
90 fhVal = getFhval(kmerSeq, k);
91 rhVal = getRhval(kmerSeq, k);
92 return (rhVal<fhVal)? rhVal : fhVal;
93 }
94
95 // recursive forward-strand hash value for next k-mer
96 inline uint64_t rollHashesRight(const uint64_t fhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) {
97 return(rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn]);
98 }
99
100 // recursive cannonical hash value for next k-mer
101 inline uint64_t rollHashesRight(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) {
102 fhVal = rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
103 rhVal = ror(rhVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1);
104 return (rhVal<fhVal)? rhVal : fhVal;
105 }
106
107 // recursive forward-strand hash value for prev k-mer
108 inline uint64_t rollHashesLeft(const uint64_t fhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) {
109 return(ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1));
110 }
111
112 // recursive canonical hash value for prev k-mer
113 inline uint64_t rollHashesLeft(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) {
114 fhVal = ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
115 rhVal = rol(rhVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff];
116 return (rhVal<fhVal)? rhVal : fhVal;
117 }
118
119 // change a single base and update forward-strand hash value accordingly
120 inline uint64_t setBase(uint64_t fhVal, char* kmerSeq, unsigned pos, char base, unsigned k)
121 {
122 fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
123 kmerSeq[pos] = base;
124 fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
125 return fhVal;
126 }
127
128 // change a single base and update hash values accordingly
129 inline uint64_t setBase(uint64_t& fhVal, uint64_t& rhVal, char* kmerSeq, unsigned pos, char base, unsigned k)
130 {
131 fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
132 rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos);
133 kmerSeq[pos] = base;
134 fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
135 rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos);
136 return (rhVal<fhVal)? rhVal : fhVal;
137 }
138
139 /**
140 * Compute multiple pseudo-independent hash values from a seed hash value.
141 *
142 * @param hashes array for storing computed hash values
143 * @param seedVal seed value for multi-hash calculation
144 * @param numHashes number of hash values to compute
145 * @param k-mer size
146 */
147 inline void multiHash(uint64_t hashes[], uint64_t seedVal, unsigned numHashes, unsigned k)
148 {
149 for (unsigned i = 0; i < numHashes; i++) {
150 hashes[i] = seedVal * (i ^ k * varSeed);
151 hashes[i] ^= hashes[i] >> varShift;
152 }
153 }
154
155 // spaced-seed hash values
156
157 /**
158 * Calculate forward-strand spaced seed hash value of the base kmer, i.e. fhval(kmer_0)
159 *
160 * @param kVal set to forward-strand hash value for unmasked k-mer
161 * @param seedSeq bitmask indicating "don't care" positions for hashing
162 * @param kmerSeq k-mer to be hashed
163 * @param k k-mer size
164 * @return hash value for masked forward-strand k-mer
165 */
166 inline uint64_t getFhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
167 kVal=0;
168 uint64_t sVal=0;
169 for(unsigned i=0; i<k; i++) {
170 kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
171 if(seedSeq[i]=='1')
172 sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
173 }
174 return sVal;
175 }
176
177 /**
178 * Calculate reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0)
179 *
180 * @param kVal set to reverse-strand hash value for unmasked k-mer
181 * @param seedSeq bitmask indicating "don't care" positions for hashing
182 * @param kmerSeq k-mer to be hashed
183 * @param k k-mer size
184 * @return hash for masked reverse-strand k-mer
185 */
186 // reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0)
187 inline uint64_t getRhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
188 kVal=0;
189 uint64_t sVal=0;
190 for(unsigned i=0; i<k; i++) {
191 kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
192 if(seedSeq[i]=='1')
193 sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
194 }
195 return sVal;
196 }
197
198 /**
199 * Recursive forward-strand spaced seed hash value for next k-mer
200 *
201 * @param kVal hash value for current k-mer unmasked and in forward orientation
202 * @param seedSeq bitmask indicating "don't care" positions for hashing
203 * @param kmerSeq sequence for *current* k-mer (not the k-mer we are rolling into)
204 * @param charIn new base we are rolling in from the right
205 * @param k k-mer size
206 * @return hash for masked k-mer in forward orientation
207 */
208 inline uint64_t rollHashesRight(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
209 const unsigned charOut = kmerSeq[0];
210 kVal = rol(kVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
211 uint64_t sVal=kVal;
212 for(unsigned i=1; i<k-1; i++) {
213 if(seedSeq[i]!='1')
214 sVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i);
215 }
216 return sVal;
217 }
218
219 /**
220 * Recursive forward-strand spaced seed hash value for prev k-mer
221 *
222 * @param kVal hash value for current k-mer unmasked and in forward orientation
223 * @param seedSeq bitmask indicating "don't care" positions for hashing
224 * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
225 * @param charIn new base we are rolling in from the left
226 * @param k k-mer size
227 * @return hash for masked k-mer in forward orientation
228 */
229 inline uint64_t rollHashesLeft(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
230 const unsigned charOut = kmerSeq[k-1];
231 kVal = ror(kVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
232 uint64_t sVal=kVal;
233 for(unsigned i=1; i<k-1; i++) {
234 if(seedSeq[i]!='1')
235 sVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]], k-1-i);
236 }
237 return sVal;
238 }
239
240 /**
241 * Recursive canonical spaced seed hash value for next k-mer
242 *
243 * @param fkVal hash value for current k-mer unmasked and in forward orientation
244 * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
245 * @param seedSeq bitmask indicating "don't care" positions for hashing
246 * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
247 * @param charIn new base we are rolling in from the right
248 * @param k k-mer size
249 * @return canonical hash value for masked k-mer
250 */
251 inline uint64_t rollHashesRight(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
252 const unsigned charOut = kmerSeq[0];
253 fkVal = rol(fkVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
254 rkVal = ror(rkVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1);
255 uint64_t fsVal=fkVal, rsVal=rkVal;
256 for(unsigned i=1; i<k-1; i++) {
257 if(seedSeq[i]!='1') {
258 fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i);
259 rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]+cpOff], i);
260 }
261 }
262 return (rsVal<fsVal)? rsVal : fsVal;
263 }
264
265 /**
266 * Recursive canonical spaced seed hash value for prev k-mer
267 *
268 * @param fkVal hash value for current k-mer unmasked and in forward orientation
269 * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
270 * @param seedSeq bitmask indicating "don't care" positions for hashing
271 * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
272 * @param charIn new base we are rolling in from the left
273 * @param k k-mer size
274 * @return canonical hash value for masked k-mer
275 */
276 inline uint64_t rollHashesLeft(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
277 const unsigned charOut = kmerSeq[k-1];
278 fkVal = ror(fkVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
279 rkVal = rol(rkVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff];
280 uint64_t fsVal=fkVal, rsVal=rkVal;
281 for(unsigned i=1; i<k-1; i++) {
282 if(seedSeq[i]!='1') {
283 fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]], k-1-i);
284 rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]+cpOff], i);
285 }
286 }
287 return (rsVal<fsVal)? rsVal : fsVal;
288 }
289
290 /**
291 * Change a single base and recompute spaced seed hash values
292 *
293 * @param fkVal hash value for current k-mer unmasked and in forward orientation
294 * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
295 * @param seedSeq bitmask indicating "don't care" positions for hashing
296 * @param kmerSeq sequence for current k-mer
297 * @param pos position of base to change
298 * @param base new base value
299 * @param k k-mer size
300 * @return updated canonical hash value for masked k-mer
301 */
302 inline uint64_t setBase(uint64_t& fkVal, uint64_t& rkVal, const char * seedSeq, char * kmerSeq, unsigned pos, char base, unsigned k)
303 {
304 setBase(fkVal, rkVal, kmerSeq, pos, base, k);
305 uint64_t fsVal=fkVal, rsVal=rkVal;
306 for(unsigned i=0; i<k; i++) {
307 if(seedSeq[i]!='1') {
308 fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
309 rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
310 }
311 }
312 return (rsVal<fsVal)? rsVal : fsVal;
313 }
314
315 #endif