New upstream version 2.1.1
Andreas Tille
5 years ago
1 | 1 | jobs: |
2 | 2 | build: |
3 | 3 | docker: |
4 | - image: ubuntu:xenial | |
4 | - image: ubuntu:bionic | |
5 | 5 | steps: |
6 | 6 | - run: | |
7 | 7 | apt-get update -qq |
0 | # Please report | |
1 | ||
2 | - [ ] version of ABySS with `abyss-pe version` | |
3 | - [ ] distribution of Linux with `lsb_release -d` | |
4 | ||
5 | # Assembly error | |
6 | ||
7 | - [ ] complete `abyss-pe` command line | |
8 | - [ ] last 20 lines of the output of `abyss-pe` | |
9 | - [ ] number of sequenced bases | |
10 | - [ ] estimated genome size and ploidy | |
11 | - [ ] estimated sequencing depth of coverage | |
12 | ||
13 | # Build error | |
14 | ||
15 | Consider installing ABySS using [Linuxbrew](https://linuxbrew.sh) on Linux or [Homebrew](https://brew.sh) on macOS with `brew install abyss`, or using [Bioconda](https://bioconda.github.io) with `conda install abyss`. | |
16 | ||
17 | - [ ] Have you tried installing ABySS using Brew or Bioconda? | |
18 | - [ ] version of GCC or compiler with `gcc --version` | |
19 | - [ ] complete `./configure` command line | |
20 | - [ ] last 20 lines of the output of `./configure` | |
21 | - [ ] last 20 lines of the output of `make` |
71 | 71 | |
72 | 72 | AssemblyAlgorithms::setCoverageParameters( |
73 | 73 | AssemblyAlgorithms::coverageHistogram(g)); |
74 | ||
75 | if (opt::kc > 0) { | |
76 | cout << "Minimum k-mer multiplicity kc is " << opt::kc << endl; | |
77 | cout << "Removing low-multiplicity k-mers" << endl; | |
78 | size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(g, opt::kc); | |
79 | cout << "Removed " << removed | |
80 | << " low-multiplicity k-mers, " << g.size() | |
81 | << " k-mers remaining" << std::endl; | |
82 | } | |
74 | 83 | |
75 | 84 | cout << "Generating adjacency" << endl; |
76 | 85 | AssemblyAlgorithms::generateAdjacency(&g); |
1 | 1 | #define BRANCHGROUP_H 1 |
2 | 2 | |
3 | 3 | #include "Common/Algorithms.h" |
4 | #include "Common/Exception.h" | |
4 | 5 | #include <algorithm> // for swap |
5 | 6 | #include <map> |
6 | 7 | #include <utility> |
210 | 211 | |
211 | 212 | namespace std { |
212 | 213 | template <> |
213 | inline void swap(BranchGroup&, BranchGroup&) { assert(false); } | |
214 | inline void swap(BranchGroup&, BranchGroup&) NOEXCEPT { assert(false); } | |
214 | 215 | } |
215 | 216 | |
216 | 217 | #endif |
0 | 0 | #ifndef ASSEMBLY_BRANCHRECORDBASE_H |
1 | 1 | #define ASSEMBLY_BRANCHRECORDBASE_H 1 |
2 | ||
3 | #include "Common/Exception.h" | |
2 | 4 | |
3 | 5 | #include <algorithm> |
4 | 6 | #include <cassert> |
171 | 173 | |
172 | 174 | namespace std { |
173 | 175 | template <> |
174 | inline void swap(BranchRecord& a, BranchRecord& b) | |
176 | inline void swap(BranchRecord& a, BranchRecord& b) NOEXCEPT | |
175 | 177 | { |
176 | 178 | a.swap(b); |
177 | 179 | } |
111 | 111 | } |
112 | 112 | } |
113 | 113 | |
114 | /** Remove all k-mers with multiplicity lower than the given threshold */ | |
115 | static inline | |
116 | size_t applyKmerCoverageThreshold(SequenceCollectionHash& c, unsigned kc) | |
117 | { | |
118 | if (kc == 0) | |
119 | return 0; | |
120 | ||
121 | for (SequenceCollectionHash::iterator it = c.begin(); | |
122 | it != c.end(); ++it) { | |
123 | if (it->second.getMultiplicity() < kc) | |
124 | it->second.setFlag(SF_DELETE); | |
125 | } | |
126 | ||
127 | return c.cleanup(); | |
128 | } | |
129 | ||
114 | 130 | } // namespace AssemblyAlgorithms |
115 | 131 | |
116 | 132 | #endif |
53 | 53 | " -t, --trim-length=N maximum length of blunt contigs to trim [k]\n" |
54 | 54 | " -c, --coverage=FLOAT remove contigs with mean k-mer coverage\n" |
55 | 55 | " less than this threshold\n" |
56 | " --kc=N remove all k-mers with multiplicity < N [0]\n" | |
56 | 57 | " -b, --bubbles=N pop bubbles shorter than N bp [3*k]\n" |
57 | 58 | " -b0, --no-bubbles do not pop bubbles\n" |
58 | 59 | " -e, --erode=N erode bases at the ends of blunt contigs with coverage\n" |
106 | 107 | /** Coverage cutoff. */ |
107 | 108 | float coverage = -1; |
108 | 109 | |
110 | /** Minimum k-mer multiplicity cutoff. */ | |
111 | unsigned kc = 0; | |
112 | ||
109 | 113 | /** Pop bubbles shorter than N bp. */ |
110 | 114 | int bubbleLen = -1; |
111 | 115 | |
146 | 150 | |
147 | 151 | static const char shortopts[] = "b:c:e:E:g:k:K:mo:Q:q:s:t:v"; |
148 | 152 | |
149 | enum { OPT_HELP = 1, OPT_VERSION, COVERAGE_HIST, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; | |
153 | enum { OPT_HELP = 1, OPT_VERSION, COVERAGE_HIST, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES, OPT_KC }; | |
150 | 154 | |
151 | 155 | static const struct option longopts[] = { |
152 | 156 | { "out", required_argument, NULL, 'o' }, |
163 | 167 | { "SS", no_argument, &opt::ss, 1 }, |
164 | 168 | { "no-SS", no_argument, &opt::ss, 0 }, |
165 | 169 | { "coverage", required_argument, NULL, 'c' }, |
170 | { "kc", required_argument, NULL, OPT_KC }, | |
166 | 171 | { "coverage-hist", required_argument, NULL, COVERAGE_HIST }, |
167 | 172 | { "bubble-length", required_argument, NULL, 'b' }, |
168 | 173 | { "no-bubbles", no_argument, &opt::bubbleLen, 0 }, |
301 | 306 | case OPT_SPECIES: |
302 | 307 | arg >> opt::metaVars[2]; |
303 | 308 | break; |
309 | case OPT_KC: | |
310 | arg >> opt::kc; | |
311 | break; | |
304 | 312 | } |
305 | 313 | if (optarg != NULL && !arg.eof()) { |
306 | 314 | cerr << PROGRAM ": invalid option: `-" |
13 | 13 | extern unsigned erodeStrand; |
14 | 14 | extern unsigned trimLen; |
15 | 15 | extern float coverage; |
16 | extern unsigned kc; | |
16 | 17 | extern unsigned bubbleLen; |
17 | 18 | extern unsigned ss; |
18 | 19 | extern bool maskCov; |
12 | 12 | #include <vector> |
13 | 13 | #include <iostream> |
14 | 14 | #include <boost/dynamic_bitset.hpp> |
15 | ||
16 | /* | |
17 | * Put `BloomFilter` class in `Konnector` namespace to avoid collision with BTL | |
18 | * `BloomFilter` class of the same name. | |
19 | */ | |
20 | namespace Konnector { | |
15 | 21 | |
16 | 22 | /** A Bloom filter. */ |
17 | 23 | class BloomFilter |
164 | 170 | char* m_array; |
165 | 171 | }; |
166 | 172 | |
173 | } // end Konnector namespace | |
174 | ||
167 | 175 | #endif |
13 | 13 | * A bloom filter that represents a window |
14 | 14 | * within a larger bloom filter. |
15 | 15 | */ |
16 | class BloomFilterWindow : public BloomFilter | |
16 | class BloomFilterWindow : public Konnector::BloomFilter | |
17 | 17 | { |
18 | 18 | public: |
19 | 19 | |
20 | 20 | /** Constructor. */ |
21 | BloomFilterWindow() : BloomFilter() { }; | |
21 | BloomFilterWindow() : Konnector::BloomFilter() { }; | |
22 | 22 | |
23 | 23 | /** Constructor. |
24 | 24 | * |
28 | 28 | */ |
29 | 29 | BloomFilterWindow(size_t fullBloomSize, size_t startBitPos, |
30 | 30 | size_t endBitPos, size_t hashSeed=0) : |
31 | BloomFilter(endBitPos - startBitPos + 1, hashSeed), | |
31 | Konnector::BloomFilter(endBitPos - startBitPos + 1, hashSeed), | |
32 | 32 | m_fullBloomSize(fullBloomSize), |
33 | 33 | m_startBitPos(startBitPos), |
34 | 34 | m_endBitPos(endBitPos) |
62 | 62 | /** Return the size of the bit array. */ |
63 | 63 | size_t size() const |
64 | 64 | { |
65 | return BloomFilter::size(); | |
65 | return Konnector::BloomFilter::size(); | |
66 | 66 | } |
67 | 67 | |
68 | 68 | /** Return the number of elements with count >= max_count. */ |
69 | 69 | size_t popcount() const |
70 | 70 | { |
71 | return BloomFilter::popcount(); | |
71 | return Konnector::BloomFilter::popcount(); | |
72 | 72 | } |
73 | 73 | |
74 | 74 | /** Return the estimated false positive rate */ |
75 | 75 | double FPR() const |
76 | 76 | { |
77 | return BloomFilter::FPR(); | |
77 | return Konnector::BloomFilter::FPR(); | |
78 | 78 | } |
79 | 79 | |
80 | 80 | /** Return whether the specified bit is set. */ |
81 | 81 | bool operator[](size_t i) const |
82 | 82 | { |
83 | 83 | if (i >= m_startBitPos && i <= m_endBitPos) |
84 | return BloomFilter::operator[](i - m_startBitPos); | |
84 | return Konnector::BloomFilter::operator[](i - m_startBitPos); | |
85 | 85 | return false; |
86 | 86 | } |
87 | 87 | |
95 | 95 | void insert(size_t i) |
96 | 96 | { |
97 | 97 | if (i >= m_startBitPos && i <= m_endBitPos) |
98 | BloomFilter::insert(i - m_startBitPos); | |
98 | Konnector::BloomFilter::insert(i - m_startBitPos); | |
99 | 99 | } |
100 | 100 | |
101 | 101 | /** Add the object to this set. */ |
142 | 142 | |
143 | 143 | if (m_size != bits) { |
144 | 144 | if (readOp == BITWISE_OVERWRITE) { |
145 | BloomFilter::resize(bits); | |
145 | Konnector::BloomFilter::resize(bits); | |
146 | 146 | } else { |
147 | 147 | std::cerr << "error: can't union/intersect bloom filters with " |
148 | 148 | << "different sizes\n"; |
21 | 21 | { |
22 | 22 | m_data.reserve(max_count); |
23 | 23 | for (unsigned i = 0; i < max_count; i++) |
24 | m_data.push_back(new BloomFilter(n, hashSeed)); | |
24 | m_data.push_back(new Konnector::BloomFilter(n, hashSeed)); | |
25 | 25 | } |
26 | 26 | |
27 | 27 | /** Destructor */ |
28 | 28 | ~CascadingBloomFilter() |
29 | 29 | { |
30 | typedef std::vector<BloomFilter*>::iterator Iterator; | |
30 | typedef std::vector<Konnector::BloomFilter*>::iterator Iterator; | |
31 | 31 | for (Iterator i = m_data.begin(); i != m_data.end(); i++) { |
32 | 32 | assert(*i != NULL); |
33 | 33 | delete *i; |
90 | 90 | } |
91 | 91 | |
92 | 92 | /** Get the Bloom filter for a given level */ |
93 | BloomFilter& getBloomFilter(unsigned level) | |
93 | Konnector::BloomFilter& getBloomFilter(unsigned level) | |
94 | 94 | { |
95 | 95 | assert(m_data.at(level) != NULL); |
96 | 96 | return *m_data.at(level); |
111 | 111 | |
112 | 112 | private: |
113 | 113 | size_t m_hashSeed; |
114 | std::vector<BloomFilter*> m_data; | |
114 | std::vector<Konnector::BloomFilter*> m_data; | |
115 | 115 | |
116 | 116 | }; |
117 | 117 |
204 | 204 | { NULL, 0, NULL, 0 } |
205 | 205 | }; |
206 | 206 | |
207 | void dieWithUsageError() | |
207 | __attribute__((noreturn)) | |
208 | static void dieWithUsageError() | |
208 | 209 | { |
209 | 210 | cerr << "Try `" << PROGRAM |
210 | 211 | << " --help' for more information.\n"; |
417 | 418 | if (opt::windows == 0) { |
418 | 419 | |
419 | 420 | if (opt::levels == 1) { |
420 | BloomFilter bloom(bits, opt::hashSeed); | |
421 | Konnector::BloomFilter bloom(bits, opt::hashSeed); | |
421 | 422 | #ifdef _OPENMP |
422 | ConcurrentBloomFilter<BloomFilter> | |
423 | ConcurrentBloomFilter<Konnector::BloomFilter> | |
423 | 424 | cbf(bloom, opt::numLocks, opt::hashSeed); |
424 | 425 | loadFilters(cbf, argc, argv); |
425 | 426 | #else |
639 | 640 | string outputPath(argv[optind]); |
640 | 641 | optind++; |
641 | 642 | |
642 | BloomFilter bloom; | |
643 | Konnector::BloomFilter bloom; | |
643 | 644 | |
644 | 645 | for (int i = optind; i < argc; i++) { |
645 | 646 | string path(argv[i]); |
694 | 695 | dieWithUsageError(); |
695 | 696 | } |
696 | 697 | |
697 | BloomFilter bloom; | |
698 | Konnector::BloomFilter bloom; | |
698 | 699 | string path = argv[optind]; |
699 | 700 | |
700 | 701 | if (opt::verbose) |
743 | 744 | std::cerr << "Computing distance for 2" |
744 | 745 | << " samples...\n"; |
745 | 746 | // Get both paths and open istreams |
746 | BloomFilter bloomA; | |
747 | Konnector::BloomFilter bloomA; | |
747 | 748 | string pathA(argv[optind]); |
748 | BloomFilter bloomB; | |
749 | Konnector::BloomFilter bloomB; | |
749 | 750 | string pathB(argv[optind+1]); |
750 | 751 | if (opt::verbose) |
751 | 752 | std::cerr << "Loading bloom filters from " |
844 | 845 | |
845 | 846 | int memberOf(int argc, char ** argv){ |
846 | 847 | // Initalise bloom and get globals |
847 | BloomFilter bloom; | |
848 | Konnector::BloomFilter bloom; | |
848 | 849 | parseGlobalOpts(argc, argv); |
849 | 850 | // Arg parser to get `m' option in case set |
850 | 851 | for (int c; (c = getopt_long(argc, argv, |
928 | 929 | /** |
929 | 930 | * Calculate number of bases to trim from left end of sequence. |
930 | 931 | */ |
931 | int calcLeftTrim(const Sequence& seq, unsigned k, const BloomFilter& bloom, | |
932 | int calcLeftTrim(const Sequence& seq, unsigned k, const Konnector::BloomFilter& bloom, | |
932 | 933 | size_t minBranchLen) |
933 | 934 | { |
934 | 935 | // Boost graph interface for Bloom filter |
935 | DBGBloom<BloomFilter> g(bloom); | |
936 | DBGBloom<Konnector::BloomFilter> g(bloom); | |
936 | 937 | |
937 | 938 | // if this is the first k-mer we have found in |
938 | 939 | // Bloom filter, starting from the left end |
999 | 1000 | cerr << "Loading bloom filter from `" |
1000 | 1001 | << bloomPath << "'...\n"; |
1001 | 1002 | |
1002 | BloomFilter bloom; | |
1003 | Konnector::BloomFilter bloom; | |
1003 | 1004 | istream *in = openInputStream(bloomPath); |
1004 | 1005 | assert_good(*in, bloomPath); |
1005 | 1006 | bloom.read(*in); |
41 | 41 | { |
42 | 42 | m_data.reserve(levels); |
43 | 43 | for (unsigned i = 0; i < levels; i++) |
44 | m_data.push_back(new BTL::BloomFilter(size, hashes, k)); | |
44 | m_data.push_back(new BloomFilter(size, hashes, k)); | |
45 | 45 | } |
46 | 46 | |
47 | 47 | /** |
48 | * Constructor to load a single-level BTL::BloomFilter from | |
49 | * files. This is used to make BTL::BloomFilter support the | |
48 | * Constructor to load a single-level BloomFilter from | |
49 | * files. This is used to make BloomFilter support the | |
50 | 50 | * same interface as HashAgnosticCascadingBloom. |
51 | 51 | */ |
52 | 52 | HashAgnosticCascadingBloom(const string& bloomPath) |
137 | 137 | } |
138 | 138 | |
139 | 139 | /** Get the Bloom filter for a given level */ |
140 | BTL::BloomFilter& getBloomFilter(unsigned level) | |
140 | BloomFilter& getBloomFilter(unsigned level) | |
141 | 141 | { |
142 | 142 | assert(m_data.at(level) != NULL); |
143 | 143 | return *m_data.at(level); |
158 | 158 | void loadFilter(const string& bloomPath) |
159 | 159 | { |
160 | 160 | clear(); |
161 | BTL::BloomFilter* bloom = new BTL::BloomFilter(bloomPath); | |
161 | BloomFilter* bloom = new BloomFilter(bloomPath); | |
162 | 162 | m_k = bloom->getKmerSize(); |
163 | 163 | m_hashes = bloom->getHashNum(); |
164 | 164 | m_data.push_back(bloom); |
171 | 171 | { |
172 | 172 | m_k = 0; |
173 | 173 | m_hashes = 0; |
174 | typedef std::vector<BTL::BloomFilter*>::iterator Iterator; | |
174 | typedef std::vector<BloomFilter*>::iterator Iterator; | |
175 | 175 | for (Iterator i = m_data.begin(); i != m_data.end(); i++) { |
176 | 176 | assert(*i != NULL); |
177 | 177 | delete *i; |
184 | 184 | /** number of hash functions */ |
185 | 185 | unsigned m_hashes; |
186 | 186 | /** the array of Bloom filters */ |
187 | std::vector<BTL::BloomFilter*> m_data; | |
187 | std::vector<BloomFilter*> m_data; | |
188 | 188 | }; |
189 | 189 | |
190 | 190 | #endif |
0 | 0 | #ifndef LIGHTWEIGHT_KMER_H |
1 | 1 | #define LIGHTWEIGHT_KMER_H 1 |
2 | ||
3 | #include "BloomDBG/MaskedKmer.h" | |
2 | 4 | |
3 | 5 | #include <algorithm> |
4 | 6 | #include <cstring> |
25 | 25 | RollingHashIterator.h \ |
26 | 26 | SpacedSeed.h \ |
27 | 27 | $(top_srcdir)/lib/bloomfilter/BloomFilter.hpp \ |
28 | $(top_srcdir)/lib/rolling-hash/rolling.h | |
28 | $(top_srcdir)/lib/nthash/nthash.hpp |
44 | 44 | : m_kmer(kmer), m_rollingHash(rollingHash) {} |
45 | 45 | |
46 | 46 | const LightweightKmer& kmer() const { return m_kmer; }; |
47 | LightweightKmer& kmer() { return m_kmer; }; | |
48 | ||
47 | 49 | const RollingHash& rollingHash() const { return m_rollingHash; } |
48 | 50 | |
49 | 51 | RollingBloomDBGVertex clone() const { |
62 | 64 | |
63 | 65 | void setLastBase(extDirection dir, char base) |
64 | 66 | { |
65 | const unsigned k = Kmer::length(); | |
66 | if (dir == SENSE) { | |
67 | m_rollingHash.setBase(m_kmer.c_str(), k-1, base); | |
68 | } else { | |
69 | m_rollingHash.setBase(m_kmer.c_str(), 0, base); | |
70 | } | |
67 | m_rollingHash.setLastBase(kmer().c_str(), dir, base); | |
68 | kmer().setLastBase(dir, base); | |
71 | 69 | } |
72 | 70 | |
73 | 71 | /** |
1 | 1 | #define ABYSS_ROLLING_HASH_H 1 |
2 | 2 | |
3 | 3 | #include "config.h" |
4 | #include "lib/rolling-hash/rolling.h" | |
4 | ||
5 | #include "BloomDBG/LightweightKmer.h" | |
5 | 6 | #include "BloomDBG/MaskedKmer.h" |
7 | #include "Common/Sense.h" | |
8 | #include "lib/nthash/nthash.hpp" | |
9 | ||
10 | #include <algorithm> | |
6 | 11 | #include <string> |
7 | 12 | #include <vector> |
8 | 13 | #include <cassert> |
60 | 65 | */ |
61 | 66 | void reset(const std::string& kmer) |
62 | 67 | { |
68 | /* compute initial hash values for forward and reverse-complement k-mer */ | |
69 | NTC64(kmer.c_str(), m_k, m_hash1, m_rcHash1); | |
70 | ||
71 | /* get canonical hash value from forward/reverse hash values */ | |
72 | m_hash = canonicalHash(m_hash1, m_rcHash1); | |
73 | ||
63 | 74 | if (!MaskedKmer::mask().empty()) |
64 | resetMasked(kmer.c_str()); | |
65 | else | |
66 | resetUnmasked(kmer); | |
67 | } | |
68 | ||
69 | /** | |
70 | * Initialize hash values from current k-mer. When computing the hash | |
71 | * value, mask out "don't care" positions as per the active | |
72 | * k-mer mask. | |
73 | */ | |
74 | void resetMasked(const char* kmer) | |
75 | { | |
76 | const std::string& spacedSeed = MaskedKmer::mask(); | |
77 | assert(spacedSeed.length() == m_k); | |
78 | ||
79 | /* compute first hash function for k-mer */ | |
80 | uint64_t hash1 = getFhval(m_hash1, spacedSeed.c_str(), kmer, m_k); | |
81 | ||
82 | /* compute first hash function for reverse complement of k-mer */ | |
83 | uint64_t rcHash1 = getRhval(m_rcHash1, spacedSeed.c_str(), kmer, m_k); | |
84 | ||
85 | m_hash = canonicalHash(hash1, rcHash1); | |
86 | } | |
87 | ||
88 | /** | |
89 | * Initialize hash values from sequence. | |
90 | * @param kmer k-mer used to initialize hash state | |
91 | */ | |
92 | void resetUnmasked(const std::string& kmer) | |
93 | { | |
94 | /* compute first hash function for k-mer */ | |
95 | m_hash1 = getFhval(kmer.c_str(), m_k); | |
96 | ||
97 | /* compute first hash function for reverse complement | |
98 | * of k-mer */ | |
99 | m_rcHash1 = getRhval(kmer.c_str(), m_k); | |
100 | ||
101 | m_hash = canonicalHash(m_hash1, m_rcHash1); | |
75 | m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(), | |
76 | kmer.c_str(), m_k); | |
102 | 77 | } |
103 | 78 | |
104 | 79 | /** |
109 | 84 | */ |
110 | 85 | void rollRight(const char* kmer, char charIn) |
111 | 86 | { |
112 | if (!MaskedKmer::mask().empty()) | |
113 | rollRightMasked(kmer, charIn); | |
114 | else | |
115 | rollRightUnmasked(kmer, charIn); | |
116 | } | |
117 | ||
118 | /** | |
119 | * Compute hash values for next k-mer to the right and | |
120 | * update internal state. When computing the new hash, mask | |
121 | * out "don't care" positions according to the active | |
122 | * k-mer mask. | |
123 | * @param kmer current k-mer | |
124 | * @param nextKmer k-mer we are rolling into | |
125 | */ | |
126 | void rollRightMasked(const char* kmer, char charIn) | |
127 | { | |
128 | const std::string& spacedSeed = MaskedKmer::mask(); | |
129 | m_hash = rollHashesRight(m_hash1, m_rcHash1, spacedSeed.c_str(), | |
130 | kmer, charIn, m_k); | |
131 | } | |
132 | ||
133 | /** | |
134 | * Compute hash values for next k-mer to the right and | |
135 | * update internal state. | |
136 | * @param kmer current k-mer | |
137 | * @param nextKmer k-mer we are rolling into | |
138 | */ | |
139 | void rollRightUnmasked(const char* kmer, char charIn) | |
140 | { | |
141 | /* update first hash function */ | |
142 | rollHashesRight(m_hash1, m_rcHash1, kmer[0], charIn, m_k); | |
143 | m_hash = canonicalHash(m_hash1, m_rcHash1); | |
87 | NTC64(kmer[0], charIn, m_k, m_hash1, m_rcHash1); | |
88 | m_hash = canonicalHash(m_hash1, m_rcHash1); | |
89 | ||
90 | if (!MaskedKmer::mask().empty()) { | |
91 | // TODO: copying the k-mer and shifting is very inefficient; | |
92 | // we need a specialized nthash function that rolls and masks | |
93 | // simultaneously | |
94 | LightweightKmer next(kmer); | |
95 | next.shift(SENSE, charIn); | |
96 | m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(), | |
97 | next.c_str(), m_k); | |
98 | } | |
144 | 99 | } |
145 | 100 | |
146 | 101 | /** |
151 | 106 | */ |
152 | 107 | void rollLeft(char charIn, const char* kmer) |
153 | 108 | { |
154 | if (!MaskedKmer::mask().empty()) | |
155 | rollLeftMasked(charIn, kmer); | |
156 | else | |
157 | rollLeftUnmasked(charIn, kmer); | |
158 | } | |
159 | ||
160 | /** | |
161 | * Compute hash values for next k-mer to the left and | |
162 | * update internal state. When computing the new hash, mask | |
163 | * out "don't care" positions according to the active | |
164 | * k-mer mask. | |
165 | * @param prevKmer k-mer we are rolling into | |
166 | * @param kmer current k-mer | |
167 | */ | |
168 | void rollLeftMasked(char charIn, const char* kmer) | |
169 | { | |
170 | const std::string& spacedSeed = MaskedKmer::mask(); | |
171 | m_hash = rollHashesLeft(m_hash1, m_rcHash1, spacedSeed.c_str(), | |
172 | kmer, charIn, m_k); | |
173 | } | |
174 | ||
175 | /** | |
176 | * Compute hash values for next k-mer to the left and | |
177 | * update internal state. | |
178 | * @param prevKmer k-mer we are rolling into | |
179 | * @param kmer current k-mer | |
180 | */ | |
181 | void rollLeftUnmasked(char charIn, const char* kmer) | |
182 | { | |
183 | /* update first hash function */ | |
184 | rollHashesLeft(m_hash1, m_rcHash1, charIn, kmer[m_k-1], m_k); | |
185 | m_hash = canonicalHash(m_hash1, m_rcHash1); | |
109 | NTC64L(kmer[m_k-1], charIn, m_k, m_hash1, m_rcHash1); | |
110 | m_hash = canonicalHash(m_hash1, m_rcHash1); | |
111 | ||
112 | if (!MaskedKmer::mask().empty()) { | |
113 | // TODO: copying the k-mer and shifting is very inefficient; | |
114 | // we need a specialized nthash function that rolls and masks | |
115 | // simultaneously | |
116 | LightweightKmer next(kmer); | |
117 | next.shift(ANTISENSE, charIn); | |
118 | m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(), | |
119 | next.c_str(), m_k); | |
120 | } | |
186 | 121 | } |
187 | 122 | |
188 | 123 | /** |
202 | 137 | */ |
203 | 138 | void getHashes(size_t hashes[]) const |
204 | 139 | { |
205 | uint64_t tmpHashes[MAX_HASHES]; | |
206 | multiHash(tmpHashes, m_hash, m_numHashes, m_k); | |
207 | for (unsigned i = 0; i < m_numHashes; ++i) { | |
208 | hashes[i] = (size_t)tmpHashes[i]; | |
209 | } | |
140 | for (unsigned i = 0; i < m_numHashes; ++i) | |
141 | hashes[i] = NTE64(m_hash, m_k, i); | |
210 | 142 | } |
211 | 143 | |
212 | 144 | /** Equality operator */ |
229 | 161 | } |
230 | 162 | |
231 | 163 | /** |
232 | * Set the base at a given position in the k-mer and update the hash | |
233 | * value accordingly. | |
164 | * Change the hash value to reflect a change in the first/last base of | |
165 | * the k-mer. | |
234 | 166 | * @param kmer point to the k-mer char array |
235 | * @param pos position of the base to be changed | |
167 | * @param dir if SENSE, change last base; if ANTISENSE, | |
168 | * change first base | |
236 | 169 | * @param base new value for the base |
237 | 170 | */ |
238 | void setBase(char* kmer, unsigned pos, char base) | |
239 | { | |
171 | void setLastBase(char* kmer, extDirection dir, char base) | |
172 | { | |
173 | if (dir == SENSE) { | |
174 | /* roll left to remove old last char */ | |
175 | NTC64L(kmer[m_k-1], 'A', m_k, m_hash1, m_rcHash1); | |
176 | /* roll right to add new last char */ | |
177 | NTC64('A', base, m_k, m_hash1, m_rcHash1); | |
178 | } else { | |
179 | /* roll right to remove old first char */ | |
180 | NTC64(kmer[0], 'A', m_k, m_hash1, m_rcHash1); | |
181 | /* roll left to add new first char */ | |
182 | NTC64L('A', base, m_k, m_hash1, m_rcHash1); | |
183 | } | |
184 | m_hash = canonicalHash(m_hash1, m_rcHash1); | |
185 | ||
240 | 186 | if (!MaskedKmer::mask().empty()) |
241 | setBaseMasked(kmer, pos, base); | |
242 | else | |
243 | setBaseUnmasked(kmer, pos, base); | |
244 | } | |
245 | ||
246 | /** | |
247 | * Set the base at a given position in the k-mer and update the hash | |
248 | * value accordingly. | |
249 | * @param kmer point to the k-mer char array | |
250 | * @param pos position of the base to be changed | |
251 | * @param base new value for the base | |
252 | */ | |
253 | void setBaseMasked(char* kmer, unsigned pos, char base) | |
254 | { | |
255 | const std::string& spacedSeed = MaskedKmer::mask(); | |
256 | assert(spacedSeed.length() == m_k); | |
257 | m_hash = ::setBase(m_hash1, m_rcHash1, spacedSeed.c_str(), kmer, | |
258 | pos, base, m_k); | |
259 | } | |
260 | ||
261 | /** | |
262 | * Set the base at a given position in the k-mer and update the hash | |
263 | * value accordingly. | |
264 | * @param kmer point to the k-mer char array | |
265 | * @param pos position of the base to be changed | |
266 | * @param base new value for the base | |
267 | */ | |
268 | void setBaseUnmasked(char* kmer, unsigned pos, char base) | |
269 | { | |
270 | m_hash = ::setBase(m_hash1, m_rcHash1, kmer, pos, base, m_k); | |
187 | m_hash = maskHash(m_hash1, m_rcHash1, MaskedKmer::mask().c_str(), | |
188 | kmer, m_k); | |
271 | 189 | } |
272 | 190 | |
273 | 191 | private: |
227 | 227 | HashAgnosticCascadingBloom solidKmerSet; |
228 | 228 | |
229 | 229 | /* empty visited k-mers Bloom filter */ |
230 | BTL::BloomFilter visitedKmerSet; | |
230 | BloomFilter visitedKmerSet; | |
231 | 231 | |
232 | 232 | /* counters for progress messages */ |
233 | 233 | BloomDBG::AssemblyCounters counters; |
573 | 573 | if (!assembledKmerSet.contains(*fwd)) |
574 | 574 | break; |
575 | 575 | } |
576 | if (fwd.pos() > 0) | |
576 | if (fwd != RollingHashIterator::end() && fwd.pos() > 0) | |
577 | 577 | seq.erase(0, fwd.pos()); |
578 | 578 | |
579 | 579 | /* trim previously assembled k-mers from end of sequence */ |
583 | 583 | if (!assembledKmerSet.contains(*rev)) |
584 | 584 | break; |
585 | 585 | } |
586 | if (rev.pos() > 0) | |
586 | if (rev != RollingHashIterator::end() && rev.pos() > 0) | |
587 | 587 | rcSeq.erase(0, rev.pos()); |
588 | 588 | |
589 | 589 | /* flip seq back to original orientation */ |
833 | 833 | std::ostream& out) |
834 | 834 | { |
835 | 835 | /* k-mers in previously assembled contigs */ |
836 | BTL::BloomFilter visitedKmerSet(solidKmerSet.size(), | |
836 | BloomFilter visitedKmerSet(solidKmerSet.size(), | |
837 | 837 | solidKmerSet.getHashNum(), solidKmerSet.getKmerSize()); |
838 | 838 | |
839 | 839 | /* counters for progress messages */ |
0 | 2018-09-11 Ben Vandervalk <benv@bcgsc.ca> | |
1 | ||
2 | * Release version 2.1.1 | |
3 | ||
4 | abyss-bloom-dbg: | |
5 | * upgrade to most recent version of ntHash to reduce | |
6 | some assembly/hashing artifacts. On a human assembly, this | |
7 | reduced QUAST major misassemblies by 5% and increased | |
8 | scaffold contiguity by 10% | |
9 | * `kc` parameter now also applies to MPI assemblies (see below) | |
10 | ||
11 | abyss-fac: | |
12 | * change N20 and N80 to N25 and N75, respectively | |
13 | ||
14 | ABYSS-P: | |
15 | * add `--kc` option, with implements a hard minimum k-mer | |
16 | multiplicity cutoff | |
17 | ||
18 | abyss-pe: | |
19 | * fix `zsh: no such option: pipefail` error with | |
20 | old versions of `zsh` (fallback to `bash` instead) | |
21 | * adding `time=1` now times *all* assembly commands | |
22 | ||
23 | abyss-sealer: | |
24 | * parallelize gap sealing with OpenMP (thanks to | |
25 | @schutzekatze!) | |
26 | * add `--gap-file` option (thanks to @schutzekatze!) | |
27 | ||
28 | DistanceEst: | |
29 | * add support for GFA output | |
30 | ||
0 | 31 | 2018-04-13 Ben Vandervalk <benv@bcgsc.ca> |
1 | 32 | |
2 | 33 | * Release version 2.1.0 |
1 | 1 | #define ESTIMATE_H 1 |
2 | 2 | |
3 | 3 | #include "Common/ContigProperties.h" // for Distance |
4 | #include "Common/Exception.h" | |
4 | 5 | #include "ContigID.h" |
5 | 6 | #include "ContigNode.h" |
6 | 7 | #include "Graph/Options.h" // for opt::k |
197 | 198 | |
198 | 199 | namespace std { |
199 | 200 | template<> |
200 | inline void swap(EstimateRecord&, EstimateRecord&) | |
201 | inline void swap(EstimateRecord&, EstimateRecord&) NOEXCEPT | |
201 | 202 | { |
202 | 203 | assert(false); |
203 | 204 | } |
0 | #ifndef _EXCEPTION_H_ | |
1 | #define _EXCEPTION_H_ 1 | |
2 | ||
3 | /* `noexcept` is only recognized in C++11 or later. See https://stackoverflow.com/questions/24567173/backwards-compatible-noexceptfalse-for-destructors */ | |
4 | #if __cplusplus >= 201103L | |
5 | #define NOEXCEPT noexcept | |
6 | #else | |
7 | #define NOEXCEPT | |
8 | #endif | |
9 | ||
10 | #endif |
0 | 0 | #ifndef HISTOGRAM_H |
1 | 1 | #define HISTOGRAM_H 1 |
2 | 2 | |
3 | #include "Common/Exception.h" | |
3 | 4 | #include "StringUtil.h" // for toEng |
4 | 5 | #include "VectorUtil.h" // for make_vector |
5 | 6 | #include <cassert> |
311 | 312 | |
312 | 313 | namespace std { |
313 | 314 | template<> |
314 | inline void swap(Histogram&, Histogram&) { assert(false); } | |
315 | inline void swap(Histogram&, Histogram&) NOEXCEPT { assert(false); } | |
315 | 316 | } |
316 | 317 | |
317 | 318 | /** Print assembly contiguity statistics header. */ |
328 | 329 | out << "LG50" << sep |
329 | 330 | << "NG50" << sep; |
330 | 331 | return out << "min" << sep |
331 | << "N80" << sep | |
332 | << "N75" << sep | |
332 | 333 | << "N50" << sep |
333 | << "N20" << sep | |
334 | << "N25" << sep | |
334 | 335 | << "E-size" << sep |
335 | 336 | << "max" << sep |
336 | 337 | << "sum" << sep |
363 | 364 | } |
364 | 365 | return out |
365 | 366 | << toEng(h.minimum()) << sep |
366 | << toEng(h.weightedPercentile(1 - 0.8)) << sep | |
367 | << toEng(h.weightedPercentile(1 - 0.75)) << sep | |
367 | 368 | << toEng(n50) << sep |
368 | << toEng(h.weightedPercentile(1 - 0.2)) << sep | |
369 | << toEng(h.weightedPercentile(1 - 0.25)) << sep | |
369 | 370 | << toEng((unsigned)h.expectedValue()) << sep |
370 | 371 | << toEng(h.maximum()) << sep |
371 | 372 | << toEng(sum); |
385 | 386 | << h.size() |
386 | 387 | << h.count(n50, INT_MAX) |
387 | 388 | << h.minimum() |
388 | << h.weightedPercentile(1 - 0.8) | |
389 | << h.weightedPercentile(1 - 0.75) | |
389 | 390 | << n50 |
390 | << h.weightedPercentile(1 - 0.2) | |
391 | << h.weightedPercentile(1 - 0.25) | |
391 | 392 | << (unsigned)h.expectedValue() |
392 | 393 | << h.maximum() |
393 | 394 | << sum; |
12 | 12 | ContigProperties.h \ |
13 | 13 | Dictionary.h \ |
14 | 14 | Estimate.h \ |
15 | Exception.h \ | |
15 | 16 | Fcontrol.cpp Fcontrol.h \ |
16 | 17 | Functional.h \ |
17 | 18 | Hash.h \ |
0 | 0 | #ifndef PMF_H |
1 | 1 | #define PMF_H 1 |
2 | 2 | |
3 | #include "Common/Exception.h" | |
3 | 4 | #include "Histogram.h" |
4 | 5 | #include <cassert> |
5 | 6 | #include <cmath> |
62 | 63 | |
63 | 64 | namespace std { |
64 | 65 | template<> |
65 | inline void swap(PMF&, PMF&) { assert(false); } | |
66 | inline void swap(PMF&, PMF&) NOEXCEPT { assert(false); } | |
66 | 67 | } |
67 | 68 | |
68 | 69 | #endif |
171 | 171 | case 'I': case 'X': case '=': |
172 | 172 | qlen += len; |
173 | 173 | clip1 += len; |
174 | // fallthrough | |
174 | 175 | case 'D': case 'N': case 'P': |
175 | 176 | if (a.align_length == 0) { |
176 | 177 | // Ignore a malformatted CIGAR string whose first |
187 | 188 | clip1 = 0; |
188 | 189 | break; |
189 | 190 | } |
191 | // fallthrough | |
190 | 192 | case 'H': case 'S': |
191 | 193 | qlen += len; |
192 | 194 | clip1 += len; |
321 | 323 | #if SAM_SEQ_QUAL |
322 | 324 | out << '\t' << o.seq |
323 | 325 | << '\t' << o.qual; |
326 | // note: leading tab ('\t') is already embedded in o.tags | |
324 | 327 | if (!o.tags.empty()) |
325 | out << '\t' << o.tags; | |
328 | out << o.tags; | |
326 | 329 | #else |
327 | 330 | out << "\t*\t*"; |
328 | 331 | #endif |
130 | 130 | /** The sequence */ |
131 | 131 | Sequence seq; |
132 | 132 | |
133 | FastaRecord() { } | |
133 | FastaRecord() : anchor(0) { } | |
134 | 134 | FastaRecord(const std::string& id, const std::string& comment, |
135 | 135 | const Sequence& seq) |
136 | 136 | : id(id), comment(comment), anchor(0), seq(seq) { } |
112 | 112 | cout << "n:NG50" << sep |
113 | 113 | << "NG50" << sep; |
114 | 114 | cout << "min" << sep |
115 | << "N80" << sep | |
115 | << "N75" << sep | |
116 | 116 | << "N50" << sep |
117 | << "N20" << sep | |
117 | << "N25" << sep | |
118 | 118 | << "E-size" << sep |
119 | 119 | << "max" << sep |
120 | 120 | << "sum" << sep |
129 | 129 | cout << "n:NG50" << sep |
130 | 130 | << "NG50" << sep; |
131 | 131 | cout << "min" << sep |
132 | << "N80" << sep | |
132 | << "N75" << sep | |
133 | 133 | << "N50" << sep |
134 | << "N20" << sep | |
134 | << "N25" << sep | |
135 | 135 | << "E-size" << sep |
136 | 136 | << "max" << sep |
137 | 137 | << "sum" << sep |
60 | 60 | " (maximum likelihood estimator)\n" |
61 | 61 | " --mean use the difference of the population mean\n" |
62 | 62 | " and the sample mean\n" |
63 | " --dist output graph in dist format [default]\n" | |
64 | " --dot output graph in dot format\n" | |
63 | " --dist output the graph in dist format [default]\n" | |
64 | " --dot output the graph in GraphViz format\n" | |
65 | " --gv output the graph in GraphViz format\n" | |
66 | " --gfa output the graph in GFA2 format\n" | |
67 | " --gfa2 output the graph in GFA2 format\n" | |
65 | 68 | " -j, --threads=N use N parallel threads [1]\n" |
66 | 69 | " -v, --verbose display verbose output\n" |
67 | 70 | " --help display this help and exit\n" |
118 | 121 | static const struct option longopts[] = { |
119 | 122 | { "dist", no_argument, &opt::format, DIST, }, |
120 | 123 | { "dot", no_argument, &opt::format, DOT, }, |
124 | { "gv", no_argument, &opt::format, DOT, }, | |
125 | { "gfa", no_argument, &opt::format, GFA2, }, | |
126 | { "gfa2", no_argument, &opt::format, GFA2, }, | |
121 | 127 | { "fr", no_argument, &opt::rf, false }, |
122 | 128 | { "rf", no_argument, &opt::rf, true }, |
123 | 129 | { "min-align", required_argument, NULL, 'l' }, |
274 | 280 | if (opt::format == DOT) { |
275 | 281 | #pragma omp critical(out) |
276 | 282 | out << get(g_contigNames, e) << " [" << est << "]\n"; |
283 | } else if (opt::format == GFA2) { | |
284 | // Output only one of the two complementary edges. | |
285 | if (len1 < opt::seedLen || e.first < e.second || e.first == e.second) | |
286 | #pragma omp critical(out) | |
287 | out << "G\t*" | |
288 | << '\t' << get(g_contigNames, e.first) | |
289 | << '\t' << get(g_contigNames, e.second) | |
290 | << '\t' << est.distance | |
291 | << '\t' << (int)ceilf(est.stdDev) | |
292 | << "\tFC:i:" << est.numPairs | |
293 | << '\n'; | |
277 | 294 | } else |
278 | 295 | out << ' ' << get(g_contigNames, id1) << ',' << est; |
279 | 296 | } else if (opt::verbose > 1) { |
316 | 333 | const PairsMap& x = dataMap[sense0 ^ opt::rf]; |
317 | 334 | for (PairsMap::const_iterator it = x.begin(); |
318 | 335 | it != x.end(); ++it) |
319 | writeEstimate(opt::format == DOT ? out : ss, | |
336 | writeEstimate(opt::format == DIST ? ss : out, | |
320 | 337 | ContigNode(id0, sense0), it->first, |
321 | 338 | len0, lengthVec[it->first.id()], |
322 | 339 | it->second, pmf); |
501 | 518 | "k=" << opt::k << " " |
502 | 519 | "s=" << opt::seedLen << " " |
503 | 520 | "n=" << opt::npairs << "]\n"; |
521 | else if (opt::format == GFA2) | |
522 | out << "H\tVN:Z:2.0\n"; | |
504 | 523 | |
505 | 524 | vector<int> vals = make_vector<int>() |
506 | 525 | << opt::k |
216 | 216 | } |
217 | 217 | |
218 | 218 | /** Read a graph in GFA format. */ |
219 | template <typename Graph> | |
220 | std::istream& read_gfa(std::istream& in, Graph& g) | |
219 | template <typename Graph, typename BetterEP> | |
220 | std::istream& read_gfa(std::istream& in, Graph& g, BetterEP betterEP) | |
221 | 221 | { |
222 | 222 | assert(in); |
223 | 223 | |
224 | 224 | typedef typename graph_traits<Graph>::vertex_descriptor V; |
225 | 225 | typedef typename vertex_property<Graph>::type VP; |
226 | typedef typename graph_traits<Graph>::edge_descriptor E; | |
226 | 227 | typedef typename edge_property<Graph>::type EP; |
227 | 228 | |
228 | 229 | // Add vertices if this graph is empty. |
350 | 351 | assert(in); |
351 | 352 | V u = find_vertex(uname, g); |
352 | 353 | V v = find_vertex(vname, g); |
353 | add_edge(u, v, ep, g); | |
354 | E e; | |
355 | bool found; | |
356 | boost::tie(e, found) = edge(u, v, g); | |
357 | if (found) { | |
358 | // Parallel edge | |
359 | EP& ref = g[e]; | |
360 | ref = betterEP(ref, ep); | |
361 | } else | |
362 | add_edge(u, v, ep, g); | |
354 | 363 | break; |
355 | 364 | } |
356 | 365 |
66 | 66 | case 'T': // HT: ASQG format |
67 | 67 | return read_asqg(in, g); |
68 | 68 | case '\t': // H: GAF format |
69 | return read_gfa(in, g); | |
69 | return read_gfa(in, g, betterEP); | |
70 | 70 | default: |
71 | 71 | std::cerr << "Unknown file format: `H" << c << "'\n"; |
72 | 72 | exit(EXIT_FAILURE); |
32 | 32 | #endif |
33 | 33 | |
34 | 34 | using namespace std; |
35 | using Konnector::BloomFilter; | |
35 | 36 | |
36 | 37 | #define PROGRAM "konnector" |
37 | 38 |
814 | 814 | result = ES_EXTENDED_TO_CYCLE; |
815 | 815 | else |
816 | 816 | result = ES_INTERNAL_CYCLE; |
817 | break; | |
817 | 818 | case DEAD_END: |
818 | 819 | result = ES_DEAD_END; |
819 | 820 | break; |
56 | 56 | Sealer \ |
57 | 57 | AdjList \ |
58 | 58 | lib/bloomfilter \ |
59 | lib/rolling-hash \ | |
59 | lib/nthash \ | |
60 | 60 | $(GTest) \ |
61 | 61 | $(UnitTest) |
62 | 62 |
640 | 640 | printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH) |
641 | 641 | << '\t' << opt::out << '\n'; |
642 | 642 | } |
643 | #if 0 | |
644 | // assembly contiguity statistics | |
645 | vector<int> vals = passContiguityStatsVal(lengthHistogram,200); | |
646 | vector<string> keys = make_vector<string>() | |
647 | << "n" | |
648 | << "n200" | |
649 | << "nN50" | |
650 | << "min" | |
651 | << "N80" | |
652 | << "N50" | |
653 | << "N20" | |
654 | << "Esize" | |
655 | << "max" | |
656 | << "sum" | |
657 | << "nNG50" | |
658 | << "NG50"; | |
659 | ||
660 | if (!opt::db.empty()) { | |
661 | for (unsigned a=0; a<vals.size(); a++) | |
662 | addToDb(db, keys[a], vals[a]); | |
663 | } | |
664 | #endif | |
665 | 643 | return 0; |
666 | 644 | } |
278 | 278 | where |
279 | 279 | help = putStr (usageInfo usage options) >> exitSuccess |
280 | 280 | tryHelp = "Try 'abyss-samtobreak --help' for more information." |
281 | version = "abyss-samtobreak (ABySS) 2.1.0\n" | |
281 | version = "abyss-samtobreak (ABySS) 2.1.1\n" | |
282 | 282 | usage = "Usage: samtobreak [OPTION]... [FILE]...\n\ |
283 | 283 | \Calculate contig and scaffold contiguity and correctness metrics.\n" |
284 | 284 |
97 | 97 | = AssemblyAlgorithms::coverageHistogram(m_data); |
98 | 98 | Histogram h(m_comm.reduce(myh.toVector())); |
99 | 99 | AssemblyAlgorithms::setCoverageParameters(h); |
100 | ||
101 | /* | |
102 | * If we have loaded the k-mer hash table from disk, then | |
103 | * the adjacency data for each k-mer has already been computed | |
104 | * in the hash table. | |
105 | * | |
106 | * `applyKmerCoverageThreshold` would not work correctly | |
107 | * in this case because it removes k-mer records from | |
108 | * the hash table without attempting to update the adjacency | |
109 | * info of neighbouring k-mers. | |
110 | */ | |
111 | if (!m_data.isAdjacencyLoaded() && opt::kc > 0) { | |
112 | m_comm.barrier(); | |
113 | size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(m_data, opt::kc); | |
114 | logger(1) << "Removed " << removed | |
115 | << " low multiplicity k-mers" << std::endl; | |
116 | m_comm.reduce(removed); | |
117 | m_comm.reduce(m_data.size()); | |
118 | } | |
119 | ||
100 | 120 | EndState(); |
101 | 121 | SetState(NAS_WAITING); |
102 | 122 | break; |
103 | 123 | } |
104 | 124 | case NAS_GEN_ADJ: |
105 | 125 | m_comm.barrier(); |
126 | ||
106 | 127 | m_numBasesAdjSet = 0; |
107 | 128 | AssemblyAlgorithms::generateAdjacency(this); |
108 | 129 | EndState(); |
473 | 494 | = AssemblyAlgorithms::coverageHistogram(m_data); |
474 | 495 | Histogram h(m_comm.reduce(myh.toVector())); |
475 | 496 | AssemblyAlgorithms::setCoverageParameters(h); |
476 | EndState(); | |
477 | ||
497 | ||
498 | /* | |
499 | * If we have loaded the k-mer hash table from disk, then | |
500 | * the adjacency data for each k-mer has already been computed | |
501 | * in the hash table. | |
502 | * | |
503 | * `applyKmerCoverageThreshold` would not work correctly | |
504 | * in this case because it removes k-mer records from | |
505 | * the hash table without attempting to update the adjacency | |
506 | * info of neighbouring k-mers. | |
507 | */ | |
508 | if (!m_data.isAdjacencyLoaded() && opt::kc > 0) { | |
509 | cout << "Minimum k-mer multiplicity kc is " | |
510 | << opt::kc << '\n'; | |
511 | cout << "Removing low multiplicity k-mers..." << std::endl; | |
512 | m_comm.barrier(); | |
513 | size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(m_data, opt::kc); | |
514 | logger(1) << "Removed " << removed | |
515 | << " low multiplicity k-mers" << std::endl; | |
516 | size_t sumRemoved = m_comm.reduce(removed); | |
517 | size_t remaining = m_comm.reduce(m_data.size()); | |
518 | cout << "Removed " << sumRemoved | |
519 | << " low-multiplicity k-mers, " << remaining | |
520 | << " k-mers remaining" << std::endl; | |
521 | } | |
522 | ||
523 | EndState(); | |
478 | 524 | SetState(m_data.isAdjacencyLoaded() |
479 | 525 | ? NAS_ERODE : NAS_GEN_ADJ); |
480 | 526 | break; |
41 | 41 | |
42 | 42 | Install [Linuxbrew](http://linuxbrew.sh/), and run the command |
43 | 43 | |
44 | brew install brewsci/bio/abyss | |
44 | brew install abyss | |
45 | 45 | |
46 | 46 | ## Install ABySS on macOS |
47 | 47 | |
48 | 48 | Install [Homebrew](https://brew.sh/), and run the command |
49 | 49 | |
50 | brew install brewsci/bio/abyss | |
50 | brew install abyss | |
51 | 51 | |
52 | 52 | ## Install ABySS on Windows |
53 | 53 | |
54 | 54 | Install [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/) and [Linuxbrew](http://linuxbrew.sh/), and run the command |
55 | 55 | |
56 | brew install brewsci/bio/abyss | |
56 | brew install abyss | |
57 | 57 | |
58 | 58 | ## Install ABySS on Debian or Ubuntu |
59 | 59 | |
96 | 96 | - [ARCS](https://github.com/bcgsc/arcs) to scaffold |
97 | 97 | - [Tigmint](https://github.com/bcgsc/tigmint) to correct assembly errors |
98 | 98 | |
99 | brew install brewsci/bio/arcs brewsci/bio/links | |
99 | brew install brewsci/bio/arcs brewsci/bio/links-scaffolder | |
100 | 100 | |
101 | 101 | ## Optional dependencies |
102 | 102 | |
187 | 187 | installed ABySS in `/opt/abyss`, add `/opt/abyss/bin` to your `PATH`: |
188 | 188 | |
189 | 189 | PATH=/opt/abyss/bin:$PATH |
190 | ||
191 | Before starting an assembly | |
192 | =========================== | |
193 | ||
194 | ABySS stores temporary files in `TMPDIR`, which is `/tmp` by default on most systems. If your default temporary disk volume is too small, set `TMPDIR` to a larger volume, such as `/var/tmp` or your home directory. | |
195 | ||
196 | export TMPDIR=/var/tmp | |
190 | 197 | |
191 | 198 | Assembling a paired-end library |
192 | 199 | =============================== |
653 | 653 | << "n200" |
654 | 654 | << "nN50" |
655 | 655 | << "min" |
656 | << "N80" | |
656 | << "N75" | |
657 | 657 | << "N50" |
658 | << "N20" | |
658 | << "N25" | |
659 | 659 | << "Esize" |
660 | 660 | << "max" |
661 | 661 | << "sum" |
45 | 45 | #endif |
46 | 46 | |
47 | 47 | using namespace std; |
48 | using Konnector::BloomFilter; | |
48 | 49 | #if USESEQAN |
49 | 50 | using namespace seqan; |
50 | 51 | #endif |
109 | 110 | " -s, --search-mem=N mem limit for graph searches; multiply by the\n" |
110 | 111 | " number of threads (-j) to get the total mem used\n" |
111 | 112 | " for graph traversal [500M]\n" |
113 | " -g, --gap-file=FILE write sealed gaps to FILE\n" | |
112 | 114 | " -t, --trace-file=FILE write graph search stats to FILE\n" |
113 | 115 | " -v, --verbose display verbose output\n" |
114 | 116 | " --help display this help and exit\n" |
203 | 205 | /** Output file for graph search stats */ |
204 | 206 | static string tracefilePath; |
205 | 207 | |
208 | /** Output file for sealed gaps */ | |
209 | static string gapfilePath; | |
210 | ||
206 | 211 | /** Mask bases not in flanks */ |
207 | 212 | static int mask = 0; |
208 | 213 | |
235 | 240 | size_t skipped; |
236 | 241 | }; |
237 | 242 | |
238 | static const char shortopts[] = "S:L:b:B:d:ef:F:G:i:Ij:k:lm:M:no:P:q:r:s:t:v"; | |
243 | static const char shortopts[] = "S:L:b:B:d:ef:F:G:g:i:Ij:k:lm:M:no:P:q:r:s:t:v"; | |
239 | 244 | |
240 | 245 | enum { OPT_HELP = 1, OPT_VERSION }; |
241 | 246 | |
272 | 277 | { "read-name", required_argument, NULL, 'r' }, |
273 | 278 | { "search-mem", required_argument, NULL, 's' }, |
274 | 279 | { "trace-file", required_argument, NULL, 't' }, |
280 | { "gap-file", required_argument, NULL, 'g' }, | |
275 | 281 | { "verbose", no_argument, NULL, 'v' }, |
276 | 282 | { "help", no_argument, NULL, OPT_HELP }, |
277 | 283 | { "version", no_argument, NULL, OPT_VERSION }, |
414 | 420 | string merge(const Graph& g, |
415 | 421 | unsigned k, |
416 | 422 | const Gap& gap, |
417 | FastaRecord &read1, | |
418 | FastaRecord &read2, | |
423 | const FastaRecord &read1, | |
424 | const FastaRecord &read2, | |
419 | 425 | const ConnectPairsParams& params, |
420 | 426 | Counters& g_count, |
421 | 427 | ofstream& traceStream) |
571 | 577 | map<FastaRecord, map<FastaRecord, Gap> > &flanks, |
572 | 578 | unsigned &gapsclosed, |
573 | 579 | ofstream &logStream, |
574 | ofstream &traceStream) | |
580 | ofstream &traceStream, | |
581 | ofstream &gapStream) | |
575 | 582 | { |
576 | 583 | map<FastaRecord, map<FastaRecord, Gap> >::iterator read1_it; |
577 | 584 | map<FastaRecord, Gap>::iterator read2_it; |
578 | 585 | unsigned uniqueGapsClosed = 0; |
579 | bool success; | |
580 | 586 | |
581 | 587 | Counters g_count; |
582 | 588 | g_count.noStartOrGoalKmer = 0; |
596 | 602 | |
597 | 603 | printLog(logStream, "Flanks inserted into k run = " + IntToString(flanks.size()) + "\n"); |
598 | 604 | |
599 | for (read1_it = flanks.begin(); read1_it != flanks.end();) { | |
600 | success = false; | |
605 | int counter = 0; | |
606 | vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator> flanks_closed; | |
607 | #pragma omp parallel private(read1_it, read2_it) firstprivate(counter) | |
608 | for (read1_it = flanks.begin(); read1_it != flanks.end(); ++read1_it) { | |
601 | 609 | FastaRecord read1 = read1_it->first; |
602 | for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); read2_it++) { | |
610 | bool success = false; | |
611 | for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); ++read2_it, ++counter) { | |
612 | #if _OPENMP | |
613 | if (counter % omp_get_num_threads() != omp_get_thread_num()) | |
614 | continue; | |
615 | #endif | |
603 | 616 | FastaRecord read2 = read2_it->first; |
604 | 617 | |
605 | 618 | int startposition = read2_it->second.gapStart(); |
606 | 619 | string tempSeq = merge(g, k, read2_it->second, read1, read2, params, g_count, traceStream); |
607 | 620 | if (!tempSeq.empty()) { |
608 | 621 | success = true; |
622 | #pragma omp critical (allmerged) | |
609 | 623 | allmerged[read1.id.substr(0,read1.id.length()-2)][startposition] |
610 | 624 | = ClosedGap(read2_it->second, tempSeq); |
611 | //#pragma omp atomic | |
612 | gapsclosed++; | |
613 | //#pragma omp atomic | |
614 | uniqueGapsClosed++; | |
615 | if (gapsclosed % 100 == 0) | |
625 | #pragma omp atomic | |
626 | ++uniqueGapsClosed; | |
627 | #pragma omp critical (gapsclosed) | |
628 | if (++gapsclosed % 100 == 0) | |
616 | 629 | printLog(logStream, IntToString(gapsclosed) + " gaps closed so far\n"); |
630 | ||
631 | if (!opt::gapfilePath.empty()) | |
632 | #pragma omp critical (gapStream) | |
633 | gapStream << ">" << read1.id.substr(0,read1.id.length()-2) | |
634 | << "_" << read2_it->second.gapStart() << "-" << read2_it->second.gapEnd() | |
635 | << " LN:i:" << tempSeq.length() << '\n' | |
636 | << tempSeq << '\n'; | |
617 | 637 | } |
618 | 638 | } |
619 | 639 | if (success) { |
620 | flanks.erase(read1_it++); | |
640 | #pragma omp critical (flanks_closed) | |
641 | flanks_closed.push_back(read1_it); | |
621 | 642 | } |
622 | else | |
623 | read1_it++; | |
643 | } | |
644 | ||
645 | for (vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator>::iterator it = flanks_closed.begin(); | |
646 | it != flanks_closed.end(); ++it) { | |
647 | if (flanks.count((*it)->first) > 0) | |
648 | flanks.erase(*it); | |
624 | 649 | } |
625 | 650 | |
626 | 651 | printLog(logStream, IntToString(uniqueGapsClosed) + " unique gaps closed for k" + IntToString(k) + "\n"); |
770 | 795 | opt::searchMem = SIToBytes(arg); break; |
771 | 796 | case 't': |
772 | 797 | arg >> opt::tracefilePath; break; |
798 | case 'g': | |
799 | arg >> opt::gapfilePath; break; | |
773 | 800 | case 'v': |
774 | 801 | opt::verbose++; break; |
775 | 802 | case OPT_HELP: |
871 | 898 | assert(traceStream.is_open()); |
872 | 899 | ConnectPairsResult::printHeaders(traceStream); |
873 | 900 | assert_good(traceStream, opt::tracefilePath); |
901 | } | |
902 | ||
903 | ofstream gapStream; | |
904 | if (!opt::gapfilePath.empty()) { | |
905 | gapStream.open(opt::gapfilePath.c_str()); | |
906 | assert(gapStream.is_open()); | |
907 | assert_good(gapStream, opt::gapfilePath); | |
874 | 908 | } |
875 | 909 | |
876 | 910 | string logOutputPath(opt::outputPrefix); |
1019 | 1053 | temp = "Starting K run with k = " + IntToString(opt::k) + "\n"; |
1020 | 1054 | printLog(logStream, temp); |
1021 | 1055 | |
1022 | kRun(params, opt::k, g, allmerged, flanks, gapsclosed, logStream, traceStream); | |
1056 | kRun(params, opt::k, g, allmerged, flanks, gapsclosed, logStream, traceStream, gapStream); | |
1023 | 1057 | |
1024 | 1058 | temp = "k" + IntToString(opt::k) + " run complete\n" |
1025 | 1059 | + "Total gaps closed so far = " + IntToString(gapsclosed) + "\n\n"; |
1072 | 1106 | traceStream.close(); |
1073 | 1107 | } |
1074 | 1108 | |
1109 | if (!opt::gapfilePath.empty()) { | |
1110 | assert_good(gapStream, opt::gapfilePath); | |
1111 | gapStream.close(); | |
1112 | } | |
1113 | ||
1075 | 1114 | return 0; |
1076 | 1115 | } |
8 | 8 | #include <iostream> |
9 | 9 | |
10 | 10 | using namespace std; |
11 | typedef RollingBloomDBG<BTL::BloomFilter> Graph; | |
11 | typedef RollingBloomDBG<BloomFilter> Graph; | |
12 | 12 | typedef graph_traits<Graph> GraphTraits; |
13 | 13 | |
14 | 14 | /* each vertex is represented by |
59 | 59 | * GACTCGG |
60 | 60 | */ |
61 | 61 | |
62 | BTL::BloomFilter bloom1(bloomSize, numHashes, k); | |
62 | BloomFilter bloom1(bloomSize, numHashes, k); | |
63 | 63 | |
64 | 64 | RollingHash("GACTC", numHashes, k).getHashes(hashes); |
65 | 65 | bloom1.insert(hashes); |
94 | 94 | * GACTCGG |
95 | 95 | */ |
96 | 96 | |
97 | BTL::BloomFilter bloom2(bloomSize, numHashes, k); | |
97 | BloomFilter bloom2(bloomSize, numHashes, k); | |
98 | 98 | |
99 | 99 | RollingHash("GACTC", numHashes, k).getHashes(hashes); |
100 | 100 | bloom2.insert(hashes); |
130 | 130 | * ACTCG |
131 | 131 | */ |
132 | 132 | |
133 | BTL::BloomFilter bloom3(bloomSize, numHashes, k); | |
133 | BloomFilter bloom3(bloomSize, numHashes, k); | |
134 | 134 | |
135 | 135 | RollingHash("TACTC", numHashes, k).getHashes(hashes); |
136 | 136 | bloom2.insert(hashes); |
7 | 7 | using namespace std; |
8 | 8 | using namespace boost; |
9 | 9 | |
10 | typedef RollingBloomDBG<BTL::BloomFilter> Graph; | |
10 | typedef RollingBloomDBG<BloomFilter> Graph; | |
11 | 11 | typedef graph_traits<Graph> GraphTraits; |
12 | 12 | typedef graph_traits<Graph>::vertex_descriptor V; |
13 | 13 | |
19 | 19 | const unsigned m_k; |
20 | 20 | const unsigned m_bloomSize; |
21 | 21 | const unsigned m_numHashes; |
22 | BTL::BloomFilter m_bloom; | |
22 | BloomFilter m_bloom; | |
23 | 23 | Graph m_graph; |
24 | 24 | |
25 | 25 | RollingBloomDBGTest() : m_k(5), m_bloomSize(100000), m_numHashes(2), |
148 | 148 | * CGACT-GACTC-ACTCG |
149 | 149 | */ |
150 | 150 | |
151 | BTL::BloomFilter bloom(m_bloomSize, m_numHashes, m_k); | |
151 | BloomFilter bloom(m_bloomSize, m_numHashes, m_k); | |
152 | 152 | Graph graph(bloom); |
153 | 153 | |
154 | 154 | const V CGACT("CGACT", RollingHash("CGACT", m_numHashes, m_k)); |
196 | 196 | const unsigned m_k; |
197 | 197 | const unsigned m_bloomSize; |
198 | 198 | const unsigned m_numHashes; |
199 | BTL::BloomFilter m_bloom; | |
199 | BloomFilter m_bloom; | |
200 | 200 | Graph m_graph; |
201 | 201 | const std::string m_spacedSeed; |
202 | 202 |
152 | 152 | ASSERT_EQ(rightKmerHash, middleKmerHash); |
153 | 153 | } |
154 | 154 | |
155 | TEST_F(RollingHashTest, setBase) | |
155 | TEST_F(RollingHashTest, setLastBase) | |
156 | 156 | { |
157 | 157 | MaskedKmer::mask().clear(); |
158 | 158 | |
159 | 159 | char kmer1[] = "ACGT"; |
160 | char kmer2[] = "ACCT"; | |
160 | char kmer2[] = "ACGA"; | |
161 | char kmer3[] = "GCGT"; | |
161 | 162 | |
162 | 163 | RollingHash hash1(kmer1, m_numHashes, m_k); |
163 | 164 | RollingHash hash2(kmer2, m_numHashes, m_k); |
165 | RollingHash hash3(kmer3, m_numHashes, m_k); | |
164 | 166 | |
165 | 167 | ASSERT_NE(hash2, hash1); |
166 | hash1.setBase(kmer1, 2, 'C'); | |
167 | ASSERT_EQ(0, strcmp(kmer1, kmer2)); | |
168 | hash1.setLastBase(kmer1, SENSE, 'A'); | |
168 | 169 | ASSERT_EQ(hash2, hash1); |
170 | ||
171 | hash1.reset(kmer1); | |
172 | ASSERT_NE(hash3, hash1); | |
173 | hash1.setLastBase(kmer1, ANTISENSE, 'G'); | |
174 | ASSERT_EQ(hash3, hash1); | |
169 | 175 | } |
170 | ||
171 | TEST_F(RollingHashTest, setBaseMasked) | |
172 | { | |
173 | MaskedKmer::setMask("1101"); | |
174 | ||
175 | char kmer1[] = "ACGT"; | |
176 | char kmer2[] = "ACCT"; | |
177 | ||
178 | RollingHash hash1(kmer1, m_numHashes, m_k); | |
179 | RollingHash hash2(kmer2, m_numHashes, m_k); | |
180 | ||
181 | /* hashes should agree since mismatch is in masked position */ | |
182 | ASSERT_EQ(hash2, hash1); | |
183 | ASSERT_NE(0, strcmp(kmer1, kmer2)); | |
184 | ||
185 | /* fix mismatch in masked position (hash values shouldn't change) */ | |
186 | hash1.setBase(kmer1, 2, 'C'); | |
187 | ASSERT_EQ(hash2, hash1); | |
188 | ASSERT_EQ(0, strcmp(kmer1, kmer2)); | |
189 | ||
190 | /* create mismatch in unmasked position (hash value should now differ) */ | |
191 | hash1.setBase(kmer1, 1, 'G'); | |
192 | ASSERT_NE(hash2, hash1); | |
193 | ASSERT_NE(0, strcmp(kmer1, kmer2)); | |
194 | } |
8 | 8 | #include <string> |
9 | 9 | |
10 | 10 | using namespace std; |
11 | using Konnector::BloomFilter; | |
11 | 12 | |
12 | 13 | TEST(BloomFilter, base) |
13 | 14 | { |
5 | 5 | #include <string> |
6 | 6 | |
7 | 7 | using namespace std; |
8 | using Konnector::BloomFilter; | |
8 | 9 | |
9 | 10 | /* |
10 | 11 | * Tests for getStartKmerPos() function, which does |
3 | 3 | |
4 | 4 | #include <gtest/gtest.h> |
5 | 5 | #include <string> |
6 | ||
7 | using Konnector::BloomFilter; | |
6 | 8 | |
7 | 9 | TEST(DBGBloom, BloomFilterPolymorphism) |
8 | 10 | { |
3 | 3 | #include <gtest/gtest.h> |
4 | 4 | |
5 | 5 | using namespace std; |
6 | using Konnector::BloomFilter; | |
6 | 7 | |
7 | 8 | // workaround: opt::k must be defined because |
8 | 9 | // it is used by write_dot(..) |
212 | 212 | BloomDBG/HashAgnosticCascadingBloomTest.cpp |
213 | 213 | BloomDBG_HashAgnosticCascadingBloom_CXXFLAGS = $(AM_CXXFLAGS) \ |
214 | 214 | $(OPENMP_CXXFLAGS) |
215 | BloomDBG_HashAgnosticCascadingBloom_LDADD = \ | |
216 | $(top_builddir)/DataLayer/libdatalayer.a \ | |
217 | $(top_builddir)/Common/libcommon.a \ | |
218 | $(LDADD) | |
215 | 219 | |
216 | 220 | check_PROGRAMS += BloomDBG_RollingBloomDBG |
217 | 221 | BloomDBG_RollingBloomDBG_SOURCES = BloomDBG/RollingBloomDBGTest.cpp |
3 | 3 | # Anthony Raymond <traymond@bcgsc.ca>. |
4 | 4 | |
5 | 5 | SHELL=bash -e -o pipefail |
6 | ifneq ($(shell command -v zsh),) | |
6 | ifeq ($(shell zsh -e -o pipefail -c 'true' 2>/dev/null; echo $$?), 0) | |
7 | 7 | # Set pipefail to ensure that all commands of a pipe succeed. |
8 | 8 | SHELL=zsh -e -o pipefail |
9 | 9 | # Report run time and memory usage with zsh. |
109 | 109 | endif |
110 | 110 | endif |
111 | 111 | |
112 | ifndef preserve_path | |
112 | 113 | # Determine the path to the ABySS executables |
113 | 114 | path?=$(shell dirname `command -v $(MAKEFILE_LIST)`) |
114 | 115 | ifdef path |
115 | 116 | PATH:=$(path):$(PATH) |
117 | endif | |
116 | 118 | endif |
117 | 119 | |
118 | 120 | ifdef db |
195 | 197 | ifdef c |
196 | 198 | abyssopt += -c$c |
197 | 199 | endif |
200 | ifdef kc | |
201 | abyssopt += --kc=$(kc) | |
202 | endif | |
198 | 203 | ifdef b |
199 | 204 | abyssopt += -b$b |
200 | 205 | pbopt += -b$b |
215 | 220 | endif |
216 | 221 | ifdef j |
217 | 222 | abyssopt += -j$j |
218 | endif | |
219 | ifdef kc | |
220 | abyssopt += --kc=$(kc) | |
221 | 223 | endif |
222 | 224 | ifdef x |
223 | 225 | abyssopt += -s$x |
392 | 394 | @echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users@bcgsc.ca.' |
393 | 395 | |
394 | 396 | version: |
395 | @echo "abyss-pe (ABySS) 2.1.0" | |
397 | @echo "abyss-pe (ABySS) 2.1.1" | |
396 | 398 | @echo "Written by Shaun Jackman and Anthony Raymond." |
397 | 399 | @echo |
398 | 400 | @echo "Copyright 2012 Canada's Michael Smith Genome Science Centre" |
529 | 531 | |
530 | 532 | ifdef B |
531 | 533 | %-1.fa: |
532 | abyss-bloom-dbg $(abyssopt) $(ABYSS_OPTIONS) $(in) $(se) > $@ | |
534 | $(gtime) abyss-bloom-dbg $(abyssopt) $(ABYSS_OPTIONS) $(in) $(se) > $@ | |
533 | 535 | else ifdef K |
534 | 536 | |
535 | 537 | ifdef np |
536 | 538 | %-1.fa: |
537 | $(mpirun) -np $(np) abyss-paired-dbg-mpi $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa $(in) $(se) | |
539 | $(gtime) $(mpirun) -np $(np) abyss-paired-dbg-mpi $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa $(in) $(se) | |
538 | 540 | else |
539 | 541 | %-1.fa %-1.$g: |
540 | abyss-paired-dbg $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa -g $*-1.$g $(in) $(se) | |
542 | $(gtime) abyss-paired-dbg $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa -g $*-1.$g $(in) $(se) | |
541 | 543 | endif |
542 | 544 | |
543 | 545 | else ifdef np |
544 | 546 | %-1.fa: |
545 | $(mpirun) -np $(np) ABYSS-P $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se) | |
547 | $(gtime) $(mpirun) -np $(np) ABYSS-P $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se) | |
546 | 548 | else |
547 | 549 | %-1.fa: |
548 | ABYSS $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se) | |
550 | $(gtime) ABYSS $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se) | |
549 | 551 | endif |
550 | 552 | |
551 | 553 | # Find overlapping contigs |
552 | 554 | |
553 | 555 | %-1.$g: %-1.fa |
554 | AdjList $(alopt) --$g $< >$@ | |
556 | $(gtime) AdjList $(alopt) --$g $< >$@ | |
555 | 557 | |
556 | 558 | # Remove shim contigs |
557 | 559 | |
558 | 560 | %-2.$g1 %-1.path: %-1.$g %-1.fa |
559 | abyss-filtergraph $v --$g $(fgopt) $(FILTERGRAPH_OPTIONS) -k$k -g $*-2.$g1 $^ >$*-1.path | |
561 | $(gtime) abyss-filtergraph $v --$g $(fgopt) $(FILTERGRAPH_OPTIONS) -k$k -g $*-2.$g1 $^ >$*-1.path | |
560 | 562 | |
561 | 563 | %-2.fa %-2.$g: %-1.fa %-2.$g1 %-1.path |
562 | MergeContigs --$g $(mcopt) -g $*-2.$g -o $*-2.fa $^ | |
564 | $(gtime) MergeContigs --$g $(mcopt) -g $*-2.$g -o $*-2.fa $^ | |
563 | 565 | |
564 | 566 | # Pop bubbles |
565 | 567 | |
566 | 568 | %-2.path %-3.$g: %-2.fa %-2.$g |
567 | PopBubbles $v --$g -j$j -k$k $(SS) $(pbopt) $(POPBUBBLES_OPTIONS) -g $*-3.$g $^ >$*-2.path | |
569 | $(gtime) PopBubbles $v --$g -j$j -k$k $(SS) $(pbopt) $(POPBUBBLES_OPTIONS) -g $*-3.$g $^ >$*-2.path | |
568 | 570 | |
569 | 571 | %-3.fa: %-2.fa %-2.$g %-2.path |
570 | MergeContigs $(mcopt) -o $@ $^ | |
572 | $(gtime) MergeContigs $(mcopt) -o $@ $^ | |
571 | 573 | awk '!/^>/ {x[">" $$1]=1; next} {getline s} $$1 in x {print $$0 "\n" s}' \ |
572 | 574 | $*-2.path $*-1.fa >$*-indel.fa |
573 | 575 | |
580 | 582 | # Estimate distances between unitigs |
581 | 583 | |
582 | 584 | %-3.sam.gz %-3.hist: $(name)-3.fa |
583 | $(align) $(mapopt) $(strip $($*)) $< \ | |
585 | $(gtime) $(align) $(mapopt) $(strip $($*)) $< \ | |
584 | 586 | |$(fixmate) $(fmopt) -h $*-3.hist \ |
585 | 587 | |sort -snk3 -k4 \ |
586 | 588 | |$(gzip) >$*-3.sam.gz |
587 | 589 | |
588 | 590 | %-3.bam %-3.hist: $(name)-3.fa |
589 | $(align) $(mapopt) $(strip $($*)) $< \ | |
591 | $(gtime) $(align) $(mapopt) $(strip $($*)) $< \ | |
590 | 592 | |$(fixmate) $(fmopt) -h $*-3.hist \ |
591 | 593 | |sort -snk3 -k4 \ |
592 | 594 | |samtools view -Sb - -o $*-3.bam |
593 | 595 | |
594 | 596 | %-3.dist: %-3.sam.gz %-3.hist |
595 | 597 | gunzip -c $< \ |
598 | |$(gtime) $(DistanceEst) $(deopt) -o $@ $*-3.hist | |
599 | ||
600 | %-3.dist: %-3.bam %-3.hist | |
601 | $(gtime) samtools view -h $< \ | |
596 | 602 | |$(DistanceEst) $(deopt) -o $@ $*-3.hist |
597 | 603 | |
598 | %-3.dist: %-3.bam %-3.hist | |
599 | samtools view -h $< \ | |
600 | |$(DistanceEst) $(deopt) -o $@ $*-3.hist | |
601 | ||
602 | 604 | %-3.dist: $(name)-3.fa |
603 | $(align) $(mapopt) $(strip $($*)) $< \ | |
605 | $(gtime) $(align) $(mapopt) $(strip $($*)) $< \ | |
604 | 606 | |$(fixmate) $(fmopt) -h $*-3.hist \ |
605 | 607 | |sort -snk3 -k4 \ |
606 | 608 | |$(DistanceEst) $(deopt) -o $@ $*-3.hist |
609 | 611 | |
610 | 612 | ifneq ($(name)-3.dist, $(dist)) |
611 | 613 | $(name)-3.dist: $(name)-3.fa $(dist) |
612 | abyss-todot $v --dist -e $^ >$@ | |
614 | $(gtime) abyss-todot $v --dist -e $^ >$@ | |
613 | 615 | |
614 | 616 | $(name)-3.bam: $(addsuffix -3.bam, $(pe)) |
615 | samtools merge -r $@ $^ | |
617 | $(gtime) samtools merge -r $@ $^ | |
616 | 618 | endif |
617 | 619 | |
618 | 620 | # Find overlaps between contigs |
619 | 621 | |
620 | 622 | %-4.fa %-4.$g: %-3.fa %-3.$g %-3.dist |
621 | Overlap $v --$g $(SS) $(OVERLAP_OPTIONS) -k$k -g $*-4.$g -o $*-4.fa $^ | |
623 | $(gtime) Overlap $v --$g $(SS) $(OVERLAP_OPTIONS) -k$k -g $*-4.$g -o $*-4.fa $^ | |
622 | 624 | |
623 | 625 | # Assemble contigs |
624 | 626 | |
625 | 627 | %-4.path1: %-4.$g %-3.dist |
626 | SimpleGraph $v $(sgopt) $(SIMPLEGRAPH_OPTIONS) -j$j -k$k -o $@ $^ | |
628 | $(gtime) SimpleGraph $v $(sgopt) $(SIMPLEGRAPH_OPTIONS) -j$j -k$k -o $@ $^ | |
627 | 629 | |
628 | 630 | %-4.path2: %-4.path1 %-3.fa.fai %-4.fa.fai |
629 | 631 | cat $*-3.fa.fai $*-4.fa.fai \ |
630 | |MergePaths $(mpopt) $(MERGEPATHS_OPTIONS) -o $@ - $< | |
632 | |$(gtime) MergePaths $(mpopt) $(MERGEPATHS_OPTIONS) -o $@ - $< | |
631 | 633 | |
632 | 634 | %-4.path3: %-4.$g %-4.path2 |
633 | 635 | PathOverlap --assemble $(poopt) $(SS) $^ >$@ |
636 | 638 | |
637 | 639 | %-5.path %-5.fa %-5.$g: %-3.fa %-4.fa %-4.$g %-4.path3 |
638 | 640 | cat $(wordlist 1, 2, $^) \ |
639 | |PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -o $*-5.path -s $*-5.fa -g $*-5.$g - $(wordlist 3, 4, $^) | |
641 | |$(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -o $*-5.path -s $*-5.fa -g $*-5.$g - $(wordlist 3, 4, $^) | |
640 | 642 | |
641 | 643 | %-6.fa: %-3.fa %-4.fa %-5.fa %-5.$g %-5.path |
642 | cat $(wordlist 1, 3, $^) |MergeContigs $(mcopt) -o $@ - $(wordlist 4, 5, $^) | |
644 | cat $(wordlist 1, 3, $^) |$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 4, 5, $^) | |
643 | 645 | |
644 | 646 | else |
645 | 647 | |
648 | 650 | ln -sf $*-4.path3 $*-5.path |
649 | 651 | |
650 | 652 | %-cs.fa: %-3.fa %-4.fa %-4.$g %-4.path3 |
651 | cat $(wordlist 1, 2, $^) |MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^) | |
653 | cat $(wordlist 1, 2, $^) |$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^) | |
652 | 654 | |
653 | 655 | # Convert colour-space sequence to nucleotides |
654 | 656 | |
655 | 657 | %-6.fa: %-cs.fa |
656 | KAligner $v --seq -m -j$j -l$l $(in) $(se) $< \ | |
658 | $(gtime) KAligner $v --seq -m -j$j -l$l $(in) $(se) $< \ | |
657 | 659 | |Consensus $v -o $@ $< |
658 | 660 | |
659 | 661 | endif |
660 | 662 | |
661 | 663 | %-6.$g: %-5.$g %-5.path |
662 | PathOverlap --overlap $(poopt) --$g $^ >$@ | |
664 | $(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@ | |
663 | 665 | |
664 | 666 | %-contigs.fa: %-6.fa |
665 | 667 | ln -sf $< $@ |
670 | 672 | # Estimate distances between contigs |
671 | 673 | |
672 | 674 | %-6.sam.gz %-6.hist: $(name)-6.fa |
673 | $(align) $(mapopt) $(strip $($*)) $< \ | |
675 | $(gtime) $(align) $(mapopt) $(strip $($*)) $< \ | |
674 | 676 | |$(fixmate) $(fmopt) -h $*-6.hist \ |
675 | 677 | |sort -snk3 -k4 \ |
676 | 678 | |$(gzip) >$*-6.sam.gz |
677 | 679 | |
678 | 680 | %-6.bam %-6.hist: $(name)-6.fa |
679 | $(align) $(mapopt) $(strip $($*)) $< \ | |
681 | $(gtime) $(align) $(mapopt) $(strip $($*)) $< \ | |
680 | 682 | |$(fixmate) $(fmopt) -h $*-6.hist \ |
681 | 683 | |sort -snk3 -k4 \ |
682 | 684 | |samtools view -Sb - -o $*-6.bam |
683 | 685 | |
684 | 686 | %-6.dist.dot: %-6.sam.gz %-6.hist |
685 | 687 | gunzip -c $< \ |
686 | |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist | |
688 | |$(gtime) $(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist | |
687 | 689 | |
688 | 690 | %-6.dist.dot: %-6.bam %-6.hist |
689 | 691 | samtools view -h $< \ |
690 | |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist | |
692 | |$(gtime) $(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist | |
691 | 693 | |
692 | 694 | %-6.dist.dot: $(name)-6.fa |
693 | $(align) $(mapopt) $(strip $($*)) $< \ | |
695 | $(gtime) $(align) $(mapopt) $(strip $($*)) $< \ | |
694 | 696 | |$(fixmate) $(fmopt) -h $*-6.hist \ |
695 | 697 | |sort -snk3 -k4 \ |
696 | 698 | |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist |
698 | 700 | # Scaffold |
699 | 701 | |
700 | 702 | %-6.path: $(name)-6.$g $(addsuffix -6.dist.dot, $(mp)) |
701 | abyss-scaffold $(scopt) -s$S -n$N -g $@.dot $(SCAFFOLD_OPTIONS) $^ >$@ | |
703 | $(gtime) abyss-scaffold $(scopt) -s$S -n$N -g $@.dot $(SCAFFOLD_OPTIONS) $^ >$@ | |
702 | 704 | |
703 | 705 | %-7.path %-7.$g %-7.fa: %-6.fa %-6.$g %-6.path |
704 | PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-7.fa -g $*-7.$g -o $*-7.path $^ | |
706 | $(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-7.fa -g $*-7.$g -o $*-7.path $^ | |
705 | 707 | |
706 | 708 | %-8.fa: %-6.fa %-7.fa %-7.$g %-7.path |
707 | 709 | cat $(wordlist 1, 2, $^) \ |
708 | |MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^) | |
710 | |$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^) | |
709 | 711 | |
710 | 712 | %-8.$g: %-7.$g %-7.path |
711 | PathOverlap --overlap $(poopt) --$g $^ >$@ | |
713 | $(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@ | |
712 | 714 | |
713 | 715 | # Scaffold using linked reads |
714 | 716 | ifdef lr |
716 | 718 | # Tigmint |
717 | 719 | |
718 | 720 | # Options for mapping the reads to the draft assembly. |
719 | lr_l=$l | |
721 | lr_l?=$l | |
720 | 722 | override lrmapopt=$v -j$j -l$(lr_l) $(LR_MAP_OPTIONS) |
721 | 723 | |
722 | 724 | # Options for abyss-scaffold |
723 | lr_s=1000-100000 | |
724 | lr_n=5-20 | |
725 | lr_s?=1000-100000 | |
726 | lr_n?=5-20 | |
725 | 727 | |
726 | 728 | # Minimum AS/Read length ratio |
727 | tigmint_as=0.65 | |
729 | tigmint_as?=0.65 | |
728 | 730 | |
729 | 731 | # Maximum number of mismatches |
730 | tigmint_nm=5 | |
732 | tigmint_nm?=5 | |
731 | 733 | |
732 | 734 | # Minimum mapping quality threshold |
733 | tigmint_mapq=0 | |
735 | tigmint_mapq?=0 | |
734 | 736 | |
735 | 737 | # Maximum distance between reads to be considered the same molecule |
736 | tigmint_d=50000 | |
738 | tigmint_d?=50000 | |
737 | 739 | |
738 | 740 | # Minimum number of spanning molecules |
739 | tigmint_n=10 | |
741 | tigmint_n?=10 | |
740 | 742 | |
741 | 743 | # Size of the window that must be spanned by moecules |
742 | tigmint_w=1000 | |
744 | tigmint_w?=1000 | |
743 | 745 | |
744 | 746 | # Align paired-end reads to the draft genome, sort by BX tag, |
745 | 747 | # and create molecule extents BED. |
746 | %.lr.bed: %.fa | |
747 | $(gtime) $(align) $(lrmapopt) $(lr_reads) $< \ | |
748 | %.lr.bed: %.fa.fai | |
749 | $(gtime) $(align) $(lrmapopt) $(lr_reads) $*.fa \ | |
748 | 750 | | samtools sort -@$j -tBX -l0 -T$$(mktemp -u -t $@.XXXXXX) \ |
749 | 751 | | tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) - \ |
750 | 752 | | sort -k1,1 -k2,2n -k3,3n >$@ |
751 | 753 | |
752 | 754 | # Align paired-end reads to the draft genome and sort by BX tag. |
753 | %.lr.sortbx.bam: %.fa | |
754 | $(gtime) $(align) $(lrmapopt) $(lr_reads) $< \ | |
755 | %.lr.sortbx.bam: %.fa.fai | |
756 | $(gtime) $(align) $(lrmapopt) $(lr_reads) $*.fa \ | |
755 | 757 | | samtools sort -@$j -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@ |
756 | 758 | |
757 | 759 | # Filter the BAM file, create molecule extents BED. |
764 | 766 | $(gtime) tigmint-cut -p$j -n$(tigmint_n) -w$(tigmint_w) -o $@ $*.fa $< |
765 | 767 | |
766 | 768 | # ARCS |
767 | arcs_c=2 | |
768 | arcs_d=0 | |
769 | arcs_e=30000 | |
770 | arcs_l=0 | |
771 | arcs_m=4-20000 | |
772 | arcs_r=0.05 | |
773 | arcs_s=98 | |
774 | arcs_z=500 | |
769 | arcs_c?=2 | |
770 | arcs_d?=0 | |
771 | arcs_e?=30000 | |
772 | arcs_l?=0 | |
773 | arcs_m?=4-20000 | |
774 | arcs_r?=0.05 | |
775 | arcs_s?=98 | |
776 | arcs_z?=500 | |
775 | 777 | |
776 | 778 | # Align reads and create a graph of linked contigs using ARCS. |
777 | 779 | %.arcs.dist.gv: %.fa |
789 | 791 | # Create a graph of linked contigs using ARCS. |
790 | 792 | %.arcs.dist.gv: %.lr.sortn.sam.gz |
791 | 793 | gunzip -c $< \ |
792 | | arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \ | |
794 | |$(gtime) arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \ | |
793 | 795 | -g $*.arcs.dist.gv --tsv=$*.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin |
794 | 796 | |
795 | 797 | # Scaffold using ARCS and abyss-scaffold. |
796 | 798 | %.arcs.path: %.arcs.dist.gv |
797 | abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@ | |
799 | $(gtime) abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@ | |
798 | 800 | |
799 | 801 | # Create the FASTA file of ARCS scaffolds. |
800 | 802 | %.arcs.fa: %.fa %.arcs.path |
801 | MergeContigs $(mcopt) -o $@ $^ | |
803 | $(gtime) MergeContigs $(mcopt) -o $@ $^ | |
802 | 804 | |
803 | 805 | %-scaffolds.fa: %-8.tigmint.arcs.fa |
804 | 806 | ln -sf $< $@ |
817 | 819 | sealer_ks?=-k90 -k80 -k70 -k60 -k50 -k40 -k30 |
818 | 820 | |
819 | 821 | %-8_scaffold.fa: %-8.fa |
820 | abyss-sealer -v -j$j --print-flanks -o$*-8 -S$< $(sealer_ks) $(SEALER_OPTIONS) $(in) $(se) | |
822 | $(gtime) abyss-sealer -v -j$j --print-flanks -o$*-8 -S$< $(sealer_ks) $(SEALER_OPTIONS) $(in) $(se) | |
821 | 823 | |
822 | 824 | %-scaffolds-sealed.fa: %-8_scaffold.fa |
823 | 825 | ln -s $< $@ |
833 | 835 | # Transcriptome assisted scaffolding |
834 | 836 | |
835 | 837 | %.fa.bwt: %.fa |
836 | bwa index $< | |
838 | $(gtime) bwa index $< | |
837 | 839 | |
838 | 840 | %-8.sam.gz: $(name)-8.fa.bwt |
839 | bwa mem -a -t$j -S -P -k$l $(name)-8.fa $(strip $($*)) \ | |
841 | $(gtime) bwa mem -a -t$j -S -P -k$l $(name)-8.fa $(strip $($*)) \ | |
840 | 842 | |$(gzip) >$@ |
841 | 843 | |
842 | 844 | %-8.dist.dot: %-8.sam.gz |
843 | abyss-longseqdist -k$k $(LONGSEQDIST_OPTIONS) $< \ | |
845 | $(gtime) abyss-longseqdist -k$k $(LONGSEQDIST_OPTIONS) $< \ | |
844 | 846 | |grep -v "l=" >$@ |
845 | 847 | |
846 | 848 | %-8.path: $(name)-8.$g $(addsuffix -8.dist.dot, $(long)) |
847 | abyss-scaffold $(scopt) -s$S -n1 -g $@.$g $(SCAFFOLD_OPTIONS) $^ >$@ | |
849 | $(gtime) abyss-scaffold $(scopt) -s$S -n1 -g $@.$g $(SCAFFOLD_OPTIONS) $^ >$@ | |
848 | 850 | |
849 | 851 | %-9.path %-9.$g %-9.fa: %-8.fa %-8.$g %-8.path |
850 | PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-9.fa -g $*-9.$g -o $*-9.path $^ | |
852 | $(gtime) PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-9.fa -g $*-9.$g -o $*-9.path $^ | |
851 | 853 | |
852 | 854 | %-10.fa: %-8.fa %-9.fa %-9.$g %-9.path |
853 | 855 | cat $(wordlist 1, 2, $^) \ |
854 | |MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^) | |
856 | |$(gtime) MergeContigs $(mcopt) -o $@ - $(wordlist 3, 4, $^) | |
855 | 857 | |
856 | 858 | %-10.$g: %-9.$g %-9.path |
857 | PathOverlap --overlap $(poopt) --$g $^ >$@ | |
859 | $(gtime) PathOverlap --overlap $(poopt) --$g $^ >$@ | |
858 | 860 | |
859 | 861 | %-long-scaffs.fa: %-10.fa |
860 | 862 | ln -sf $< $@ |
880 | 882 | endif |
881 | 883 | |
882 | 884 | $(name)-unitigs.bam: %.bam: %.fa |
883 | $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) $(se) $< \ | |
885 | $(gtime) $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) $(se) $< \ | |
884 | 886 | |samtools view -Su - |samtools sort -o - - >$@ |
885 | 887 | |
886 | 888 | $(name)-contigs.bam $(name)-scaffolds.bam: %.bam: %.fa |
887 | $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) \ | |
889 | $(gtime) $(align) $v -j$j -l$l $(ALIGNER_OPTIONS) \ | |
888 | 890 | $(call map, deref, $(sort $(lib) $(pe) $(mp))) $< \ |
889 | 891 | |$(fixmate) $v $(FIXMATE_OPTIONS) \ |
890 | 892 | |sort -snk3 -k4 \ |
893 | 895 | # Align the variants to the assembly |
894 | 896 | |
895 | 897 | %-variants.bam: %.fa.bwt |
896 | bwa bwasw -t$j $*.fa <(cat $(name)-bubbles.fa $(name)-indel.fa) \ | |
898 | $(gtime) bwa bwasw -t$j $*.fa <(cat $(name)-bubbles.fa $(name)-indel.fa) \ | |
897 | 899 | |samtools view -Su - |samtools sort -o - - >$@ |
898 | 900 | |
899 | 901 | %-variants.vcf.gz: %.fa %-variants.bam |
900 | samtools mpileup -Buf $^ |bcftools view -vp1 - |bgzip >$@ | |
902 | $(gtime) samtools mpileup -Buf $^ |bcftools view -vp1 - |bgzip >$@ | |
901 | 903 | |
902 | 904 | %.gz.tbi: %.gz |
903 | tabix -pvcf $< | |
905 | $(gtime) tabix -pvcf $< | |
904 | 906 | |
905 | 907 | # Calculate assembly contiguity statistics |
906 | 908 | |
937 | 939 | # Create an AGP file and FASTA file of scaftigs from scaffolds |
938 | 940 | |
939 | 941 | %.agp %-agp.fa: %.fa |
940 | abyss-fatoagp $(FATOAGP_OPTIONS) -f $*-agp.fa $< >$*.agp | |
942 | $(gtime) abyss-fatoagp $(FATOAGP_OPTIONS) -f $*-agp.fa $< >$*.agp | |
941 | 943 | |
942 | 944 | # Align the contigs to the reference |
943 | 945 | |
944 | 946 | %-$(ref).sam.gz: %.fa |
945 | bwa bwasw $(bwaswopt) $(BWASW_OPTIONS) $($(ref)) $< |$(gzip) >$@ | |
947 | $(gtime) bwa bwasw $(bwaswopt) $(BWASW_OPTIONS) $($(ref)) $< |$(gzip) >$@ | |
946 | 948 | |
947 | 949 | # Find breakpoints in the alignments |
948 | 950 | |
949 | 951 | %.break: %.sam.gz |
950 | abyss-samtobreak $(SAMTOBREAK_OPTIONS) $< >$@ | |
952 | $(gtime) abyss-samtobreak $(SAMTOBREAK_OPTIONS) $< >$@ | |
951 | 953 | |
952 | 954 | # Report ABySS configuration variable(s) and value(s) currently set. |
953 | 955 |
0 | 0 | AC_PREREQ(2.62) |
1 | AC_INIT(ABySS, 2.1.0, abyss-users@bcgsc.ca, abyss, | |
1 | AC_INIT(ABySS, 2.1.1, abyss-users@bcgsc.ca, abyss, | |
2 | 2 | http://www.bcgsc.ca/platform/bioinfo/software/abyss) |
3 | 3 | m4_include(m4/m4_ax_pthread.m4) |
4 | 4 | AM_INIT_AUTOMAKE(1.9.6 foreign subdir-objects) |
309 | 309 | BloomDBG/Makefile |
310 | 310 | DataBase/Makefile |
311 | 311 | lib/bloomfilter/Makefile |
312 | lib/rolling-hash/Makefile | |
312 | lib/nthash/Makefile | |
313 | 313 | ]) |
314 | 314 | |
315 | 315 | if test "$with_sparsehash" != "no" -a "$ac_cv_header_google_sparse_hash_map" != "yes"; then |
0 | .TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.0" "User Commands" | |
0 | .TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.1" "User Commands" | |
1 | 1 | .SH NAME |
2 | 2 | ABYSS \- assemble short reads into contigs |
3 | 3 | .SH SYNOPSIS |
0 | .TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.0" "User Commands" | |
0 | .TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.1" "User Commands" | |
1 | 1 | .SH NAME |
2 | 2 | abyss-pe - assemble reads into contigs |
3 | 3 | .SH SYNOPSIS |
0 | .TH abyss-tofastq "1" "2015-May" "ABySS 2.1.0" "User Commands" | |
0 | .TH abyss-tofastq "1" "2015-May" "ABySS 2.1.1" "User Commands" | |
1 | 1 | .SH NAME |
2 | 2 | abyss-tofastq \- convert various file formats to FASTQ format |
3 | 3 | .br |
19 | 19 | #include <cstdlib> |
20 | 20 | #include <stdio.h> |
21 | 21 | #include <cstring> |
22 | #include "lib/rolling-hash/rolling.h" | |
23 | 22 | |
24 | 23 | using namespace std; |
25 | 24 | |
32 | 31 | >> (((0x4332322132212110 >> ((x & 0xF) << 2)) & 0xF) << 2)) |
33 | 32 | >> ((0x4332322132212110 >> (((x & 0xF0) >> 2)) & 0xF) << 2)) & 0xf; |
34 | 33 | } |
35 | ||
36 | /* To avoid name collision with konnector `BloomFilter` class */ | |
37 | ||
38 | namespace BTL { | |
39 | 34 | |
40 | 35 | class BloomFilter { |
41 | 36 | public: |
98 | 93 | loadFilter(filterFilePath); |
99 | 94 | } |
100 | 95 | |
101 | void loadHeader(FILE *file) { | |
102 | ||
103 | FileHeader header; | |
104 | if (fread(&header, sizeof(struct FileHeader), 1, file) != 1) { | |
105 | cerr << "Failed to read Bloom filter file header" << endl; | |
106 | } | |
107 | char magic[9]; | |
108 | strncpy(magic, header.magic, 8); | |
109 | magic[8] = '\0'; | |
110 | ||
111 | // cerr << "Loading header... magic: " << | |
112 | // magic << " hlen: " << | |
113 | // header.hlen << " size: " << | |
114 | // header.size << " nhash: " << | |
115 | // header.nhash << " kmer: " << | |
116 | // header.kmer << " dFPR: " << | |
117 | // header.dFPR << " aFPR: " << | |
118 | // header.aFPR << " rFPR: " << | |
119 | // header.rFPR << " nEntry: " << | |
120 | // header.nEntry << " tEntry: " << | |
121 | // header.tEntry << endl; | |
122 | ||
123 | m_size = header.size; | |
124 | initSize(m_size); | |
125 | m_hashNum = header.nhash; | |
126 | m_kmerSize = header.kmer; | |
127 | } | |
128 | ||
129 | 96 | void loadFilter(const string &filterFilePath) |
130 | 97 | { |
131 | 98 | FILE *file = fopen(filterFilePath.c_str(), "rb"); |
132 | 99 | if (file == NULL) { |
133 | 100 | cerr << "file \"" << filterFilePath << "\" could not be read." |
134 | << endl; | |
101 | << endl; | |
135 | 102 | exit(1); |
136 | 103 | } |
137 | 104 | |
143 | 110 | fseek(file, lCurPos, 0); |
144 | 111 | if (fileSize != m_sizeInBytes) { |
145 | 112 | cerr << "Error: " << filterFilePath |
146 | << " does not match size given by its information file. Size: " | |
147 | << fileSize << " vs " << m_sizeInBytes << " bytes." << endl; | |
113 | << " does not match size given by its header. Size: " | |
114 | << fileSize << " vs " << m_sizeInBytes << " bytes." << endl; | |
148 | 115 | exit(1); |
149 | 116 | } |
150 | 117 | |
151 | 118 | size_t countRead = fread(m_filter, fileSize, 1, file); |
152 | 119 | if (countRead != 1 && fclose(file) != 0) { |
153 | 120 | cerr << "file \"" << filterFilePath << "\" could not be read." |
154 | << endl; | |
121 | << endl; | |
155 | 122 | exit(1); |
156 | 123 | } |
124 | } | |
125 | ||
126 | void loadHeader(FILE *file) { | |
127 | ||
128 | FileHeader header; | |
129 | if (fread(&header, sizeof(struct FileHeader), 1, file) != 1) { | |
130 | cerr << "Failed to header" << endl; | |
131 | } | |
132 | char magic[9]; | |
133 | strncpy(magic, header.magic, 8); | |
134 | magic[8] = '\0'; | |
135 | ||
136 | m_size = header.size; | |
137 | initSize(m_size); | |
138 | m_hashNum = header.nhash; | |
139 | m_kmerSize = header.kmer; | |
157 | 140 | } |
158 | 141 | |
159 | 142 | /* |
182 | 165 | } |
183 | 166 | } |
184 | 167 | |
185 | void insert(const char* kmer) { | |
186 | uint64_t hVal = getChval(kmer, m_kmerSize); | |
187 | for (unsigned i = 0; i < m_hashNum; i++) { | |
188 | size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size; | |
189 | __sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar], | |
190 | bitMask[normalizedValue % bitsPerChar]); | |
191 | } | |
192 | } | |
193 | ||
194 | /* | |
168 | /* | |
169 | * Accepts a list of precomputed hash values. Faster than rehashing each time. | |
195 | 170 | * Returns if already inserted |
196 | 171 | */ |
197 | bool insertAndCheck(const char* kmer) { | |
198 | uint64_t hVal = getChval(kmer, m_kmerSize); | |
172 | bool insertAndCheck(const size_t precomputed[]) { | |
173 | //iterates through hashed values adding it to the filter | |
199 | 174 | bool found = true; |
200 | for (unsigned i = 0; i < m_hashNum; i++) { | |
201 | size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size; | |
202 | found &= __sync_or_and_fetch( | |
175 | for (size_t i = 0; i < m_hashNum; ++i) { | |
176 | size_t normalizedValue = precomputed[i] % m_size; | |
177 | found &= __sync_fetch_and_or( | |
203 | 178 | &m_filter[normalizedValue / bitsPerChar], |
204 | bitMask[normalizedValue % bitsPerChar]); | |
179 | bitMask[normalizedValue % bitsPerChar]) | |
180 | >> (normalizedValue % bitsPerChar) & 1; | |
205 | 181 | } |
206 | 182 | return found; |
207 | 183 | } |
215 | 191 | bool found = true; |
216 | 192 | for (size_t i = 0; i < m_hashNum; ++i) { |
217 | 193 | size_t normalizedValue = precomputed.at(i) % m_size; |
218 | found &= __sync_or_and_fetch( | |
194 | found &= __sync_fetch_and_or( | |
219 | 195 | &m_filter[normalizedValue / bitsPerChar], |
220 | bitMask[normalizedValue % bitsPerChar]); | |
196 | bitMask[normalizedValue % bitsPerChar]) | |
197 | >> (normalizedValue % bitsPerChar) & 1; | |
221 | 198 | } |
222 | 199 | return found; |
223 | 200 | } |
250 | 227 | return true; |
251 | 228 | } |
252 | 229 | |
253 | /* | |
254 | * Single pass filtering, computes hash values on the fly | |
255 | */ | |
256 | bool contains(const char* kmer) const { | |
257 | uint64_t hVal = getChval(kmer, m_kmerSize); | |
258 | for (unsigned i = 0; i < m_hashNum; i++) { | |
259 | size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size; | |
260 | unsigned char bit = bitMask[normalizedValue % bitsPerChar]; | |
261 | if ((m_filter[normalizedValue / bitsPerChar] & bit) == 0) | |
262 | return false; | |
263 | } | |
264 | return true; | |
265 | } | |
266 | ||
267 | void writeHeader(ostream &out) const { | |
230 | void writeHeader(std::ostream& out) const { | |
268 | 231 | FileHeader header; |
269 | 232 | strncpy(header.magic, "BlOOMFXX", 8); |
270 | 233 | char magic[9]; |
279 | 242 | header.nEntry = m_nEntry; |
280 | 243 | header.tEntry = m_tEntry; |
281 | 244 | |
282 | // cerr << "Writing header... magic: " | |
283 | // << magic << " hlen: " | |
284 | // << header.hlen << " size: " | |
285 | // << header.size << " nhash: " | |
286 | // << header.nhash << " kmer: " | |
287 | // << header.kmer << " dFPR: " | |
288 | // << header.dFPR << " aFPR: " | |
289 | // << header.aFPR << " rFPR: " | |
290 | // << header.rFPR << " nEntry: " | |
291 | // << header.nEntry << " tEntry: " | |
292 | // << header.tEntry << endl; | |
293 | ||
294 | 245 | out.write(reinterpret_cast<char*>(&header), sizeof(struct FileHeader)); |
295 | 246 | assert(out); |
296 | 247 | } |
297 | 248 | |
298 | 249 | /** Serialize the Bloom filter to a stream */ |
299 | // void storeFilter(std::ostream& out) const | |
300 | 250 | friend std::ostream& operator<<(std::ostream& out, const BloomFilter& o) |
301 | 251 | { |
302 | 252 | assert(out); |
318 | 268 | void storeFilter(string const &filterFilePath) const { |
319 | 269 | ofstream myFile(filterFilePath.c_str(), ios::out | ios::binary); |
320 | 270 | |
321 | cerr << "Storing filter. Filter is " << m_sizeInBytes << "bytes." | |
271 | cerr << "Storing filter. Filter is " << m_sizeInBytes << " bytes." | |
322 | 272 | << endl; |
323 | 273 | |
324 | 274 | myFile << *this; |
325 | 275 | myFile.close(); |
276 | assert(myFile); | |
326 | 277 | } |
327 | 278 | |
328 | 279 | size_t getPop() const { |
329 | 280 | size_t i, popBF = 0; |
330 | #pragma omp parallel for reduction(+:popBF) | |
281 | //#pragma omp parallel for reduction(+:popBF) | |
331 | 282 | for (i = 0; i < (m_size + 7) / 8; i++) |
332 | 283 | popBF = popBF + popCnt(m_filter[i]); |
333 | 284 | return popBF; |
340 | 291 | unsigned getKmerSize() const { |
341 | 292 | return m_kmerSize; |
342 | 293 | } |
343 | ||
344 | // void setdFPR(double value) { | |
345 | // m_dFPR = value; | |
346 | // } | |
347 | 294 | |
348 | 295 | /* |
349 | 296 | * Calculates that False positive rate that a redundant entry is actually |
362 | 309 | * Return FPR based on popcount |
363 | 310 | */ |
364 | 311 | double getFPR() const { |
365 | return pow(double(getPop())/double(m_size), m_hashNum); | |
312 | return pow(double(getPop())/double(m_size), double(m_hashNum)); | |
366 | 313 | } |
367 | 314 | |
368 | 315 | /* |
396 | 343 | ~BloomFilter() { |
397 | 344 | delete[] m_filter; |
398 | 345 | } |
399 | private: | |
346 | protected: | |
400 | 347 | BloomFilter(const BloomFilter& that); //to prevent copy construction |
401 | 348 | |
402 | 349 | /* |
450 | 397 | * Calculates the optimal FPR to use based on hash functions |
451 | 398 | */ |
452 | 399 | double calcFPR_hashNum(unsigned hashFunctNum) const { |
453 | return pow(2, -hashFunctNum); | |
400 | return pow(2, -double(hashFunctNum)); | |
454 | 401 | } |
455 | 402 | |
456 | 403 | uint8_t* m_filter; |
463 | 410 | uint64_t m_tEntry; |
464 | 411 | }; |
465 | 412 | |
466 | } // end namespace 'BTL' | |
467 | ||
468 | 413 | #endif /* BLOOMFILTER_H_ */ |
0 | 0 | These files come from: |
1 | 1 | |
2 | 2 | * https://github.com/bcgsc/bloomfilter |
3 | * commit f1232c2 | |
4 | * modifications were made to BloomFilter.h (TODO: merge back to source repo) | |
3 | * commit 2e5e9d4 |
0 | EXTRA_DIST = README.md |
0 | nthash.hpp is taken from https://github.com/bcgsc/ntHash.git, commit 07e3f4d |
0 | /* | |
1 | * | |
2 | * nthash.hpp | |
3 | * Author: Hamid Mohamadi | |
4 | * Genome Sciences Centre, | |
5 | * British Columbia Cancer Agency | |
6 | */ | |
7 | ||
8 | #ifndef NT_HASH_H | |
9 | #define NT_HASH_H | |
10 | ||
11 | #include <stdint.h> | |
12 | ||
13 | // offset for the complement base in the random seeds table | |
14 | const uint8_t cpOff = 0x07; | |
15 | ||
16 | // shift for gerenerating multiple hash values | |
17 | const int multiShift = 27; | |
18 | ||
19 | // seed for gerenerating multiple hash values | |
20 | static const uint64_t multiSeed = 0x90b45d39fb6da1fa; | |
21 | ||
22 | // 64-bit random seeds corresponding to bases and their complements | |
23 | static const uint64_t seedA = 0x3c8bfbb395c60474; | |
24 | static const uint64_t seedC = 0x3193c18562a02b4c; | |
25 | static const uint64_t seedG = 0x20323ed082572324; | |
26 | static const uint64_t seedT = 0x295549f54be24456; | |
27 | static const uint64_t seedN = 0x0000000000000000; | |
28 | ||
29 | static const uint64_t seedTab[256] = { | |
30 | seedN, seedT, seedN, seedG, seedA, seedN, seedN, seedC, // 0..7 | |
31 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 8..15 | |
32 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 16..23 | |
33 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 24..31 | |
34 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 32..39 | |
35 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 40..47 | |
36 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 48..55 | |
37 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 56..63 | |
38 | seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 64..71 | |
39 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 72..79 | |
40 | seedN, seedN, seedN, seedN, seedT, seedN, seedN, seedN, // 80..87 | |
41 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 88..95 | |
42 | seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 96..103 | |
43 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 104..111 | |
44 | seedN, seedN, seedN, seedN, seedT, seedN, seedN, seedN, // 112..119 | |
45 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 120..127 | |
46 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 128..135 | |
47 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 136..143 | |
48 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 144..151 | |
49 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 152..159 | |
50 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 160..167 | |
51 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 168..175 | |
52 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 176..183 | |
53 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 184..191 | |
54 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 192..199 | |
55 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 200..207 | |
56 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 208..215 | |
57 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 216..223 | |
58 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 224..231 | |
59 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 232..239 | |
60 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 240..247 | |
61 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN // 248..255 | |
62 | }; | |
63 | ||
64 | // rotate "v" to the left 1 position | |
65 | inline uint64_t rol1(const uint64_t v) { | |
66 | return (v << 1) | (v >> 63); | |
67 | } | |
68 | ||
69 | // rotate "v" to the right by 1 position | |
70 | inline uint64_t ror1(const uint64_t v) { | |
71 | return (v >> 1) | (v << 63); | |
72 | } | |
73 | ||
74 | // rotate 31-left bits of "v" to the left by "s" positions | |
75 | inline uint64_t rol31(const uint64_t v, unsigned s) { | |
76 | s%=31; | |
77 | return ((v << s) | (v >> (31 - s))) & 0x7FFFFFFF; | |
78 | } | |
79 | ||
80 | // rotate 33-right bits of "v" to the left by "s" positions | |
81 | inline uint64_t rol33(const uint64_t v, unsigned s) { | |
82 | s%=33; | |
83 | return ((v << s) | (v >> (33 - s))) & 0x1FFFFFFFF; | |
84 | } | |
85 | ||
86 | // swap bit 0 with bit 33 in "v" | |
87 | inline uint64_t swapbits033(const uint64_t v) { | |
88 | uint64_t x = (v ^ (v >> 33)) & 1; | |
89 | return v ^ (x | (x << 33)); | |
90 | } | |
91 | ||
92 | // swap bit 32 with bit 63 in "v" | |
93 | inline uint64_t swapbits3263(const uint64_t v) { | |
94 | uint64_t x = ((v >> 32) ^ (v >> 63)) & 1; | |
95 | return v ^ ((x << 32) | (x << 63)); | |
96 | } | |
97 | ||
98 | // forward-strand hash value of the base kmer, i.e. fhval(kmer_0) | |
99 | inline uint64_t NTF64(const char * kmerSeq, const unsigned k) { | |
100 | uint64_t hVal=0; | |
101 | for(unsigned i=0; i<k; i++) { | |
102 | hVal = rol1(hVal); | |
103 | hVal = swapbits033(hVal); | |
104 | hVal ^= seedTab[(unsigned char)kmerSeq[i]]; | |
105 | } | |
106 | return hVal; | |
107 | } | |
108 | ||
109 | // reverse-strand hash value of the base kmer, i.e. rhval(kmer_0) | |
110 | inline uint64_t NTR64(const char * kmerSeq, const unsigned k) { | |
111 | uint64_t hVal=0; | |
112 | for(unsigned i=0; i<k; i++) { | |
113 | hVal = rol1(hVal); | |
114 | hVal = swapbits033(hVal); | |
115 | hVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]&cpOff]; | |
116 | } | |
117 | return hVal; | |
118 | } | |
119 | ||
120 | // forward-strand ntHash for sliding k-mers | |
121 | inline uint64_t NTF64(const uint64_t fhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) { | |
122 | uint64_t hVal = rol1(fhVal); | |
123 | hVal = swapbits033(hVal); | |
124 | hVal ^= seedTab[charIn]; | |
125 | uint64_t lBits = seedTab[charOut] >> 33; | |
126 | uint64_t rBits = seedTab[charOut] & 0x1FFFFFFFF; | |
127 | uint64_t sOut = (rol31(lBits,k) << 33) | (rol33(rBits,k)); | |
128 | hVal ^= sOut; | |
129 | return hVal; | |
130 | } | |
131 | ||
132 | // reverse-complement ntHash for sliding k-mers | |
133 | inline uint64_t NTR64(const uint64_t rhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) { | |
134 | uint64_t lBits = seedTab[charIn&cpOff] >> 33; | |
135 | uint64_t rBits = seedTab[charIn&cpOff] & 0x1FFFFFFFF; | |
136 | uint64_t sIn = (rol31(lBits,k) << 33) | (rol33(rBits,k)); | |
137 | uint64_t hVal = rhVal ^ sIn; | |
138 | hVal ^= seedTab[charOut&cpOff]; | |
139 | hVal = ror1(hVal); | |
140 | hVal = swapbits3263(hVal); | |
141 | return hVal; | |
142 | } | |
143 | ||
144 | // canonical ntBase | |
145 | inline uint64_t NTC64(const char * kmerSeq, const unsigned k) { | |
146 | uint64_t fhVal=0, rhVal=0; | |
147 | fhVal=NTF64(kmerSeq, k); | |
148 | rhVal=NTR64(kmerSeq, k); | |
149 | return (rhVal<fhVal)? rhVal : fhVal; | |
150 | } | |
151 | ||
152 | // canonical ntHash | |
153 | inline uint64_t NTC64(const char * kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) { | |
154 | fhVal = NTF64(kmerSeq, k); | |
155 | rhVal = NTR64(kmerSeq, k); | |
156 | return (rhVal<fhVal)? rhVal : fhVal; | |
157 | } | |
158 | ||
159 | // canonical ntHash for sliding k-mers | |
160 | inline uint64_t NTC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) { | |
161 | fhVal = NTF64(fhVal, k, charOut, charIn); | |
162 | rhVal = NTR64(rhVal, k, charOut, charIn); | |
163 | return (rhVal<fhVal)? rhVal : fhVal; | |
164 | } | |
165 | ||
166 | // forward-strand ntHash for sliding k-mers to the left | |
167 | inline uint64_t NTF64L(const uint64_t rhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) { | |
168 | uint64_t lBits = seedTab[charIn] >> 33; | |
169 | uint64_t rBits = seedTab[charIn] & 0x1FFFFFFFF; | |
170 | uint64_t sIn = (rol31(lBits,k) << 33) | (rol33(rBits,k)); | |
171 | uint64_t hVal = rhVal ^ sIn; | |
172 | hVal ^= seedTab[charOut]; | |
173 | hVal = ror1(hVal); | |
174 | hVal = swapbits3263(hVal); | |
175 | return hVal; | |
176 | } | |
177 | ||
178 | // reverse-complement ntHash for sliding k-mers to the left | |
179 | inline uint64_t NTR64L(const uint64_t fhVal, const unsigned k, const unsigned char charOut, const unsigned char charIn) { | |
180 | uint64_t hVal = rol1(fhVal); | |
181 | hVal = swapbits033(hVal); | |
182 | hVal ^= seedTab[charIn&cpOff]; | |
183 | uint64_t lBits = seedTab[charOut&cpOff] >> 33; | |
184 | uint64_t rBits = seedTab[charOut&cpOff] & 0x1FFFFFFFF; | |
185 | uint64_t sOut = (rol31(lBits,k) << 33) | (rol33(rBits,k)); | |
186 | hVal ^= sOut; | |
187 | return hVal; | |
188 | } | |
189 | ||
190 | // canonical ntHash for sliding k-mers to the left | |
191 | inline uint64_t NTC64L(const unsigned char charOut, const unsigned char charIn, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) { | |
192 | fhVal = NTF64L(fhVal, k, charOut, charIn); | |
193 | rhVal = NTR64L(rhVal, k, charOut, charIn); | |
194 | return (rhVal<fhVal)? rhVal : fhVal; | |
195 | } | |
196 | ||
197 | // ntBase with seeding option | |
198 | inline uint64_t NTF64(const char * kmerSeq, const unsigned k, const unsigned seed) { | |
199 | uint64_t hVal=NTF64(kmerSeq, k); | |
200 | if(seed==0) return hVal; | |
201 | hVal *= seed ^ k * multiSeed; | |
202 | hVal ^= hVal >> multiShift; | |
203 | return hVal; | |
204 | } | |
205 | ||
206 | // canonical ntBase with seeding option | |
207 | inline uint64_t NTC64(const char * kmerSeq, const unsigned k, const unsigned seed) { | |
208 | uint64_t hVal = NTC64(kmerSeq,k); | |
209 | if(seed==0) return hVal; | |
210 | hVal *= seed ^ k * multiSeed; | |
211 | hVal ^= hVal >> multiShift; | |
212 | return hVal; | |
213 | } | |
214 | ||
215 | // multihash ntHash, ntBase | |
216 | inline void NTM64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t *hVal) { | |
217 | uint64_t bVal=0, tVal=0; | |
218 | bVal = NTF64(kmerSeq, k); | |
219 | hVal[0] = bVal; | |
220 | for(unsigned i=1; i<m; i++) { | |
221 | tVal = bVal * (i ^ k * multiSeed); | |
222 | tVal ^= tVal >> multiShift; | |
223 | hVal[i] = tVal; | |
224 | } | |
225 | } | |
226 | ||
227 | // one extra hash for given base hash | |
228 | inline uint64_t NTE64(const uint64_t hVal, const unsigned k, const unsigned i) { | |
229 | uint64_t tVal = hVal; | |
230 | tVal *= (i ^ k * multiSeed); | |
231 | tVal ^= tVal >> multiShift; | |
232 | return tVal; | |
233 | } | |
234 | ||
235 | // multihash ntHash for sliding k-mers | |
236 | inline void NTM64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t *hVal) { | |
237 | uint64_t bVal=0, tVal=0; | |
238 | bVal = NTF64(hVal[0], k, charOut, charIn); | |
239 | hVal[0] = bVal; | |
240 | for(unsigned i=1; i<m; i++) { | |
241 | tVal = bVal * (i ^ k * multiSeed); | |
242 | tVal ^= tVal >> multiShift; | |
243 | hVal[i] = tVal; | |
244 | } | |
245 | } | |
246 | ||
247 | // canonical multihash ntBase | |
248 | inline void NTMC64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t *hVal) { | |
249 | uint64_t bVal=0, tVal=0; | |
250 | bVal = NTC64(kmerSeq, k); | |
251 | hVal[0] = bVal; | |
252 | for(unsigned i=1; i<m; i++) { | |
253 | tVal = bVal * (i ^ k * multiSeed); | |
254 | tVal ^= tVal >> multiShift; | |
255 | hVal[i] = tVal; | |
256 | } | |
257 | } | |
258 | ||
259 | // canonical multihash ntHash | |
260 | inline void NTMC64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal) { | |
261 | uint64_t bVal=0, tVal=0; | |
262 | bVal = NTC64(kmerSeq, k, fhVal, rhVal); | |
263 | hVal[0] = bVal; | |
264 | for(unsigned i=1; i<m; i++) { | |
265 | tVal = bVal * (i ^ k * multiSeed); | |
266 | tVal ^= tVal >> multiShift; | |
267 | hVal[i] = tVal; | |
268 | } | |
269 | } | |
270 | ||
271 | // canonical multihash ntHash for sliding k-mers | |
272 | inline void NTMC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal) { | |
273 | uint64_t bVal=0, tVal=0; | |
274 | bVal = NTC64(charOut, charIn, k, fhVal, rhVal); | |
275 | hVal[0] = bVal; | |
276 | for(unsigned i=1; i<m; i++) { | |
277 | tVal = bVal * (i ^ k * multiSeed); | |
278 | tVal ^= tVal >> multiShift; | |
279 | hVal[i] = tVal; | |
280 | } | |
281 | } | |
282 | ||
283 | /* | |
284 | * ignoring k-mers containing nonACGT using ntHash function | |
285 | */ | |
286 | ||
287 | // canonical ntBase | |
288 | inline bool NTC4(const char *kmerSeq, const unsigned k, uint64_t& hVal, unsigned& locN) { | |
289 | hVal=0; | |
290 | locN=0; | |
291 | uint64_t fhVal=0,rhVal=0; | |
292 | for(int i=k-1; i>=0; i--) { | |
293 | if(seedTab[(unsigned char)kmerSeq[i]]==seedN) { | |
294 | locN=i; | |
295 | return false; | |
296 | } | |
297 | fhVal = rol1(fhVal); | |
298 | fhVal = swapbits033(fhVal); | |
299 | fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]]; | |
300 | ||
301 | rhVal = rol1(rhVal); | |
302 | rhVal = swapbits033(rhVal); | |
303 | rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff]; | |
304 | } | |
305 | hVal = (rhVal<fhVal)? rhVal : fhVal; | |
306 | return true; | |
307 | } | |
308 | ||
309 | // canonical multihash ntBase | |
310 | inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, unsigned& locN, uint64_t* hVal) { | |
311 | uint64_t bVal=0, tVal=0, fhVal=0, rhVal=0; | |
312 | locN=0; | |
313 | for(int i=k-1; i>=0; i--) { | |
314 | if(seedTab[(unsigned char)kmerSeq[i]]==seedN) { | |
315 | locN=i; | |
316 | return false; | |
317 | } | |
318 | fhVal = rol1(fhVal); | |
319 | fhVal = swapbits033(fhVal); | |
320 | fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]]; | |
321 | ||
322 | rhVal = rol1(rhVal); | |
323 | rhVal = swapbits033(rhVal); | |
324 | rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff]; | |
325 | } | |
326 | bVal = (rhVal<fhVal)? rhVal : fhVal; | |
327 | hVal[0] = bVal; | |
328 | for(unsigned i=1; i<m; i++) { | |
329 | tVal = bVal * (i ^ k * multiSeed); | |
330 | tVal ^= tVal >> multiShift; | |
331 | hVal[i] = tVal; | |
332 | } | |
333 | return true; | |
334 | } | |
335 | ||
336 | // canonical ntHash | |
337 | inline bool NTC64(const char *kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal, uint64_t& hVal, unsigned& locN) { | |
338 | hVal=fhVal=rhVal=0; | |
339 | locN=0; | |
340 | for(int i=k-1; i>=0; i--) { | |
341 | if(seedTab[(unsigned char)kmerSeq[i]]==seedN) { | |
342 | locN=i; | |
343 | return false; | |
344 | } | |
345 | fhVal = rol1(fhVal); | |
346 | fhVal = swapbits033(fhVal); | |
347 | fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]]; | |
348 | ||
349 | rhVal = rol1(rhVal); | |
350 | rhVal = swapbits033(rhVal); | |
351 | rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff]; | |
352 | } | |
353 | hVal = (rhVal<fhVal)? rhVal : fhVal; | |
354 | return true; | |
355 | } | |
356 | ||
357 | // canonical multihash ntHash | |
358 | inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, unsigned& locN, uint64_t* hVal) { | |
359 | fhVal=rhVal=0; | |
360 | uint64_t bVal=0, tVal=0; | |
361 | locN=0; | |
362 | for(int i=k-1; i>=0; i--) { | |
363 | if(seedTab[(unsigned char)kmerSeq[i]]==seedN) { | |
364 | locN=i; | |
365 | return false; | |
366 | } | |
367 | fhVal = rol1(fhVal); | |
368 | fhVal = swapbits033(fhVal); | |
369 | fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]]; | |
370 | ||
371 | rhVal = rol1(rhVal); | |
372 | rhVal = swapbits033(rhVal); | |
373 | rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff]; | |
374 | } | |
375 | bVal = (rhVal<fhVal)? rhVal : fhVal; | |
376 | hVal[0] = bVal; | |
377 | for(unsigned i=1; i<m; i++) { | |
378 | tVal = bVal * (i ^ k * multiSeed); | |
379 | tVal ^= tVal >> multiShift; | |
380 | hVal[i] = tVal; | |
381 | } | |
382 | return true; | |
383 | } | |
384 | ||
385 | // strand-aware canonical multihash ntHash | |
386 | inline bool NTMC64(const char *kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, unsigned& locN, uint64_t* hVal, bool& hStn) { | |
387 | fhVal=rhVal=0; | |
388 | uint64_t bVal=0, tVal=0; | |
389 | locN=0; | |
390 | for(int i=k-1; i>=0; i--) { | |
391 | if(seedTab[(unsigned char)kmerSeq[i]]==seedN) { | |
392 | locN=i; | |
393 | return false; | |
394 | } | |
395 | fhVal = rol1(fhVal); | |
396 | fhVal = swapbits033(fhVal); | |
397 | fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]]; | |
398 | ||
399 | rhVal = rol1(rhVal); | |
400 | rhVal = swapbits033(rhVal); | |
401 | rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff]; | |
402 | } | |
403 | hStn = rhVal<fhVal; | |
404 | bVal = hStn? rhVal : fhVal; | |
405 | hVal[0] = bVal; | |
406 | for(unsigned i=1; i<m; i++) { | |
407 | tVal = bVal * (i ^ k * multiSeed); | |
408 | tVal ^= tVal >> multiShift; | |
409 | hVal[i] = tVal; | |
410 | } | |
411 | return true; | |
412 | } | |
413 | ||
414 | // starnd-aware canonical multihash ntHash for sliding k-mers | |
415 | inline void NTMC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal, bool &hStn) { | |
416 | uint64_t bVal=0, tVal=0; | |
417 | bVal = NTC64(charOut, charIn, k, fhVal, rhVal); | |
418 | hStn = rhVal<fhVal; | |
419 | hVal[0] = bVal; | |
420 | for(unsigned i=1; i<m; i++) { | |
421 | tVal = bVal * (i ^ k * multiSeed); | |
422 | tVal ^= tVal >> multiShift; | |
423 | hVal[i] = tVal; | |
424 | } | |
425 | } | |
426 | ||
427 | // masking canonical ntHash using spaced seed pattern | |
428 | inline uint64_t maskHash(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned k) { | |
429 | uint64_t fsVal=fkVal, rsVal=rkVal; | |
430 | for(unsigned i=0; i<k; i++) { | |
431 | if(seedSeq[i]!='1') { | |
432 | uint64_t lfBits = seedTab[(unsigned char)kmerSeq[i]] >> 33; | |
433 | uint64_t rfBits = seedTab[(unsigned char)kmerSeq[i]] & 0x1FFFFFFFF; | |
434 | uint64_t sfMask = (rol31(lfBits,k-1-i) << 33) | (rol33(rfBits,k-1-i)); | |
435 | fsVal ^= sfMask; | |
436 | ||
437 | uint64_t lrBits = seedTab[(unsigned char)kmerSeq[i]&cpOff] >> 33; | |
438 | uint64_t rrBits = seedTab[(unsigned char)kmerSeq[i]&cpOff] & 0x1FFFFFFFF; | |
439 | uint64_t srMask = (rol31(lrBits,i) << 33) | (rol33(rrBits,i)); | |
440 | rsVal ^= srMask; | |
441 | } | |
442 | } | |
443 | return (rsVal<fsVal)? rsVal : fsVal; | |
444 | } | |
445 | ||
446 | #endif |
0 | #ifndef ROLLING_HASH_H | |
1 | #define ROLLING_HASH_H | |
2 | ||
3 | #include <stdint.h> | |
4 | ||
5 | // offset for the complement base in the random seeds table | |
6 | const int cpOff = -20; | |
7 | ||
8 | // shift for gerenerating multiple hash values | |
9 | const int varShift = 27; | |
10 | ||
11 | // seed for gerenerating multiple hash values | |
12 | const uint64_t varSeed = 10427061540882326010ul; | |
13 | ||
14 | // 64-bit random seed table corresponding to bases and their complements | |
15 | static const uint64_t seedTab[256] = { | |
16 | 0, 0, 0, 0, 0, 0, 0, 0, // 0..7 | |
17 | 0, 0, 0, 0, 0, 0, 0, 0, // 8..15 | |
18 | 0, 0, 0, 0, 0, 0, 0, 0, // 16..23 | |
19 | 0, 0, 0, 0, 0, 0, 0, 0, // 24..31 | |
20 | 0, 0, 0, 0, 0, 0, 0, 0, // 32..39 | |
21 | 0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 40..47 | |
22 | 0, 0, 0, 3572411708064410444ul, 0, 0, 0, 0, // 48..55 | |
23 | 0, 0, 0, 0, 0, 0, 0, 0, // 56..63 | |
24 | 4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 64..71 | |
25 | 0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 72..79 | |
26 | 0, 0, 0, 3572411708064410444ul, 2978368046464386134ul, 0, 0, 0, // 80..87 | |
27 | 0, 0, 0, 0, 0, 0, 0, 0, // 88..95 | |
28 | 4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 96..103 | |
29 | 0, 0, 0, 0, 0, 0, 0, 0, // 104..111 | |
30 | 0, 0, 0, 0, 2978368046464386134ul, 0, 0, 0, // 112..119 | |
31 | 0, 0, 0, 0, 0, 0, 0, 0, // 120..127 | |
32 | 0, 0, 0, 0, 0, 0, 0, 0, // 128..135 | |
33 | 0, 0, 0, 0, 0, 0, 0, 0, // 136..143 | |
34 | 0, 0, 0, 0, 0, 0, 0, 0, // 144..151 | |
35 | 0, 0, 0, 0, 0, 0, 0, 0, // 152..159 | |
36 | 0, 0, 0, 0, 0, 0, 0, 0, // 160..167 | |
37 | 0, 0, 0, 0, 0, 0, 0, 0, // 168..175 | |
38 | 0, 0, 0, 0, 0, 0, 0, 0, // 176..183 | |
39 | 0, 0, 0, 0, 0, 0, 0, 0, // 184..191 | |
40 | 0, 0, 0, 0, 0, 0, 0, 0, // 192..199 | |
41 | 0, 0, 0, 0, 0, 0, 0, 0, // 200..207 | |
42 | 0, 0, 0, 0, 0, 0, 0, 0, // 208..215 | |
43 | 0, 0, 0, 0, 0, 0, 0, 0, // 216..223 | |
44 | 0, 0, 0, 0, 0, 0, 0, 0, // 224..231 | |
45 | 0, 0, 0, 0, 0, 0, 0, 0, // 232..239 | |
46 | 0, 0, 0, 0, 0, 0, 0, 0, // 240..247 | |
47 | 0, 0, 0, 0, 0, 0, 0, 0 // 248..255 | |
48 | }; | |
49 | ||
50 | // rotate "v" to the left by "s" positions | |
51 | inline uint64_t rol(const uint64_t v, const int s) { | |
52 | return (v << s) | (v >> (64 - s)); | |
53 | } | |
54 | ||
55 | // rotate "v" to the right by "s" positions | |
56 | inline uint64_t ror(const uint64_t v, const int s) { | |
57 | return (v >> s) | (v << (64 - s)); | |
58 | } | |
59 | ||
60 | // forward-strand hash value of the base kmer, i.e. fhval(kmer_0) | |
61 | inline uint64_t getFhval(const char * kmerSeq, const unsigned k) { | |
62 | uint64_t hVal=0; | |
63 | for(unsigned i=0; i<k; i++) | |
64 | hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i); | |
65 | return hVal; | |
66 | } | |
67 | ||
68 | // reverse-strand hash value of the base kmer, i.e. rhval(kmer_0) | |
69 | inline uint64_t getRhval(const char * kmerSeq, const unsigned k) { | |
70 | uint64_t hVal=0; | |
71 | for(unsigned i=0; i<k; i++) | |
72 | hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i); | |
73 | return hVal; | |
74 | } | |
75 | ||
76 | // cannonical hash value of the base kmer, i.e. rhval(kmer_0) | |
77 | inline uint64_t getChval(const char * kmerSeq, const unsigned k) { | |
78 | uint64_t fhVal = getFhval(kmerSeq, k); | |
79 | uint64_t rhVal = getRhval(kmerSeq, k); | |
80 | return (rhVal<fhVal)? rhVal : fhVal; | |
81 | } | |
82 | ||
83 | // initialize forward-strand hash value of the first kmer, i.e. fhval(kmer_0) | |
84 | inline uint64_t initHashes(const char * kmerSeq, const unsigned k) { | |
85 | return getFhval(kmerSeq, k); | |
86 | } | |
87 | ||
88 | // initialize cannonical hash value of the first kmer, i.e. chval(kmer_0) | |
89 | inline uint64_t initHashes(const char * kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) { | |
90 | fhVal = getFhval(kmerSeq, k); | |
91 | rhVal = getRhval(kmerSeq, k); | |
92 | return (rhVal<fhVal)? rhVal : fhVal; | |
93 | } | |
94 | ||
95 | // recursive forward-strand hash value for next k-mer | |
96 | inline uint64_t rollHashesRight(const uint64_t fhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) { | |
97 | return(rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn]); | |
98 | } | |
99 | ||
100 | // recursive cannonical hash value for next k-mer | |
101 | inline uint64_t rollHashesRight(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) { | |
102 | fhVal = rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn]; | |
103 | rhVal = ror(rhVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1); | |
104 | return (rhVal<fhVal)? rhVal : fhVal; | |
105 | } | |
106 | ||
107 | // recursive forward-strand hash value for prev k-mer | |
108 | inline uint64_t rollHashesLeft(const uint64_t fhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) { | |
109 | return(ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1)); | |
110 | } | |
111 | ||
112 | // recursive canonical hash value for prev k-mer | |
113 | inline uint64_t rollHashesLeft(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) { | |
114 | fhVal = ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1); | |
115 | rhVal = rol(rhVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff]; | |
116 | return (rhVal<fhVal)? rhVal : fhVal; | |
117 | } | |
118 | ||
119 | // change a single base and update forward-strand hash value accordingly | |
120 | inline uint64_t setBase(uint64_t fhVal, char* kmerSeq, unsigned pos, char base, unsigned k) | |
121 | { | |
122 | fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos); | |
123 | kmerSeq[pos] = base; | |
124 | fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos); | |
125 | return fhVal; | |
126 | } | |
127 | ||
128 | // change a single base and update hash values accordingly | |
129 | inline uint64_t setBase(uint64_t& fhVal, uint64_t& rhVal, char* kmerSeq, unsigned pos, char base, unsigned k) | |
130 | { | |
131 | fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos); | |
132 | rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos); | |
133 | kmerSeq[pos] = base; | |
134 | fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos); | |
135 | rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos); | |
136 | return (rhVal<fhVal)? rhVal : fhVal; | |
137 | } | |
138 | ||
139 | /** | |
140 | * Compute multiple pseudo-independent hash values from a seed hash value. | |
141 | * | |
142 | * @param hashes array for storing computed hash values | |
143 | * @param seedVal seed value for multi-hash calculation | |
144 | * @param numHashes number of hash values to compute | |
145 | * @param k-mer size | |
146 | */ | |
147 | inline void multiHash(uint64_t hashes[], uint64_t seedVal, unsigned numHashes, unsigned k) | |
148 | { | |
149 | for (unsigned i = 0; i < numHashes; i++) { | |
150 | hashes[i] = seedVal * (i ^ k * varSeed); | |
151 | hashes[i] ^= hashes[i] >> varShift; | |
152 | } | |
153 | } | |
154 | ||
155 | // spaced-seed hash values | |
156 | ||
157 | /** | |
158 | * Calculate forward-strand spaced seed hash value of the base kmer, i.e. fhval(kmer_0) | |
159 | * | |
160 | * @param kVal set to forward-strand hash value for unmasked k-mer | |
161 | * @param seedSeq bitmask indicating "don't care" positions for hashing | |
162 | * @param kmerSeq k-mer to be hashed | |
163 | * @param k k-mer size | |
164 | * @return hash value for masked forward-strand k-mer | |
165 | */ | |
166 | inline uint64_t getFhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) { | |
167 | kVal=0; | |
168 | uint64_t sVal=0; | |
169 | for(unsigned i=0; i<k; i++) { | |
170 | kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i); | |
171 | if(seedSeq[i]=='1') | |
172 | sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i); | |
173 | } | |
174 | return sVal; | |
175 | } | |
176 | ||
177 | /** | |
178 | * Calculate reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0) | |
179 | * | |
180 | * @param kVal set to reverse-strand hash value for unmasked k-mer | |
181 | * @param seedSeq bitmask indicating "don't care" positions for hashing | |
182 | * @param kmerSeq k-mer to be hashed | |
183 | * @param k k-mer size | |
184 | * @return hash for masked reverse-strand k-mer | |
185 | */ | |
186 | // reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0) | |
187 | inline uint64_t getRhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) { | |
188 | kVal=0; | |
189 | uint64_t sVal=0; | |
190 | for(unsigned i=0; i<k; i++) { | |
191 | kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i); | |
192 | if(seedSeq[i]=='1') | |
193 | sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i); | |
194 | } | |
195 | return sVal; | |
196 | } | |
197 | ||
198 | /** | |
199 | * Recursive forward-strand spaced seed hash value for next k-mer | |
200 | * | |
201 | * @param kVal hash value for current k-mer unmasked and in forward orientation | |
202 | * @param seedSeq bitmask indicating "don't care" positions for hashing | |
203 | * @param kmerSeq sequence for *current* k-mer (not the k-mer we are rolling into) | |
204 | * @param charIn new base we are rolling in from the right | |
205 | * @param k k-mer size | |
206 | * @return hash for masked k-mer in forward orientation | |
207 | */ | |
208 | inline uint64_t rollHashesRight(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) { | |
209 | const unsigned charOut = kmerSeq[0]; | |
210 | kVal = rol(kVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn]; | |
211 | uint64_t sVal=kVal; | |
212 | for(unsigned i=1; i<k-1; i++) { | |
213 | if(seedSeq[i]!='1') | |
214 | sVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i); | |
215 | } | |
216 | return sVal; | |
217 | } | |
218 | ||
219 | /** | |
220 | * Recursive forward-strand spaced seed hash value for prev k-mer | |
221 | * | |
222 | * @param kVal hash value for current k-mer unmasked and in forward orientation | |
223 | * @param seedSeq bitmask indicating "don't care" positions for hashing | |
224 | * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into) | |
225 | * @param charIn new base we are rolling in from the left | |
226 | * @param k k-mer size | |
227 | * @return hash for masked k-mer in forward orientation | |
228 | */ | |
229 | inline uint64_t rollHashesLeft(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) { | |
230 | const unsigned charOut = kmerSeq[k-1]; | |
231 | kVal = ror(kVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1); | |
232 | uint64_t sVal=kVal; | |
233 | for(unsigned i=1; i<k-1; i++) { | |
234 | if(seedSeq[i]!='1') | |
235 | sVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]], k-1-i); | |
236 | } | |
237 | return sVal; | |
238 | } | |
239 | ||
240 | /** | |
241 | * Recursive canonical spaced seed hash value for next k-mer | |
242 | * | |
243 | * @param fkVal hash value for current k-mer unmasked and in forward orientation | |
244 | * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation | |
245 | * @param seedSeq bitmask indicating "don't care" positions for hashing | |
246 | * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into) | |
247 | * @param charIn new base we are rolling in from the right | |
248 | * @param k k-mer size | |
249 | * @return canonical hash value for masked k-mer | |
250 | */ | |
251 | inline uint64_t rollHashesRight(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) { | |
252 | const unsigned charOut = kmerSeq[0]; | |
253 | fkVal = rol(fkVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn]; | |
254 | rkVal = ror(rkVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1); | |
255 | uint64_t fsVal=fkVal, rsVal=rkVal; | |
256 | for(unsigned i=1; i<k-1; i++) { | |
257 | if(seedSeq[i]!='1') { | |
258 | fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i); | |
259 | rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]+cpOff], i); | |
260 | } | |
261 | } | |
262 | return (rsVal<fsVal)? rsVal : fsVal; | |
263 | } | |
264 | ||
265 | /** | |
266 | * Recursive canonical spaced seed hash value for prev k-mer | |
267 | * | |
268 | * @param fkVal hash value for current k-mer unmasked and in forward orientation | |
269 | * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation | |
270 | * @param seedSeq bitmask indicating "don't care" positions for hashing | |
271 | * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into) | |
272 | * @param charIn new base we are rolling in from the left | |
273 | * @param k k-mer size | |
274 | * @return canonical hash value for masked k-mer | |
275 | */ | |
276 | inline uint64_t rollHashesLeft(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) { | |
277 | const unsigned charOut = kmerSeq[k-1]; | |
278 | fkVal = ror(fkVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1); | |
279 | rkVal = rol(rkVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff]; | |
280 | uint64_t fsVal=fkVal, rsVal=rkVal; | |
281 | for(unsigned i=1; i<k-1; i++) { | |
282 | if(seedSeq[i]!='1') { | |
283 | fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]], k-1-i); | |
284 | rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]+cpOff], i); | |
285 | } | |
286 | } | |
287 | return (rsVal<fsVal)? rsVal : fsVal; | |
288 | } | |
289 | ||
290 | /** | |
291 | * Change a single base and recompute spaced seed hash values | |
292 | * | |
293 | * @param fkVal hash value for current k-mer unmasked and in forward orientation | |
294 | * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation | |
295 | * @param seedSeq bitmask indicating "don't care" positions for hashing | |
296 | * @param kmerSeq sequence for current k-mer | |
297 | * @param pos position of base to change | |
298 | * @param base new base value | |
299 | * @param k k-mer size | |
300 | * @return updated canonical hash value for masked k-mer | |
301 | */ | |
302 | inline uint64_t setBase(uint64_t& fkVal, uint64_t& rkVal, const char * seedSeq, char * kmerSeq, unsigned pos, char base, unsigned k) | |
303 | { | |
304 | setBase(fkVal, rkVal, kmerSeq, pos, base, k); | |
305 | uint64_t fsVal=fkVal, rsVal=rkVal; | |
306 | for(unsigned i=0; i<k; i++) { | |
307 | if(seedSeq[i]!='1') { | |
308 | fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i); | |
309 | rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i); | |
310 | } | |
311 | } | |
312 | return (rsVal<fsVal)? rsVal : fsVal; | |
313 | } | |
314 | ||
315 | #endif |