Codebase list abyss / upstream/2.2.4
New upstream version 2.2.4 Michael R. Crusoe 4 years ago
28 changed file(s) with 3017 addition(s) and 2824 deletion(s). Raw diff Collapse all Expand all
0 #include "Assembly/Options.h"
01 #if PAIRED_DBG
1 # include "PairedDBG/SequenceCollection.h"
2 # include "PairedDBG/PairedDBGAlgorithms.h"
2 #include "PairedDBG/PairedDBGAlgorithms.h"
3 #include "PairedDBG/SequenceCollection.h"
34 #else
4 # include "Assembly/SequenceCollection.h"
5 #include "Assembly/SequenceCollection.h"
56 #endif
67 #include "Assembly/AssemblyAlgorithms.h"
78 #include "Assembly/DotWriter.h"
9 #include "DataBase/DB.h"
10
811 #include <algorithm>
912 #include <cstdio> // for setvbuf
1013 #include <fstream>
1114 #include <iostream>
1215 #include <sstream>
13 #include "DataBase/DB.h"
1416
1517 using namespace std;
1618
1719 DB db;
1820
19 static void removeLowCoverageContigs(SequenceCollectionHash& g)
21 static void
22 removeLowCoverageContigs(SequenceCollectionHash& g)
2023 {
2124 AssemblyAlgorithms::markAmbiguous(&g);
2225
2326 cout << "Removing low-coverage contigs "
24 "(mean k-mer coverage < " << opt::coverage << ")\n";
27 "(mean k-mer coverage < "
28 << opt::coverage << ")\n";
2529 AssemblyAlgorithms::assemble(&g);
2630 AssemblyAlgorithms::splitAmbiguous(&g);
2731
2832 opt::coverage = 0;
2933 }
3034
31 static void popBubbles(SequenceCollectionHash& g)
35 static void
36 popBubbles(SequenceCollectionHash& g)
3237 {
3338 cout << "Popping bubbles" << endl;
3439 ofstream out;
3843 cout << "Removed " << numPopped << " bubbles\n";
3944 }
4045
41 static void write_graph(const string& path,
42 const SequenceCollectionHash& c)
46 static void
47 write_graph(const string& path, const SequenceCollectionHash& c)
4348 {
4449 if (path.empty())
4550 return;
4853 DotWriter::write(out, c);
4954 }
5055
51 static void assemble(const string& pathIn, const string& pathOut)
56 static void
57 assemble(const string& pathIn, const string& pathOut)
5258 {
5359 Timer timer(__func__);
5460 SequenceCollectionHash g;
5561
5662 if (!pathIn.empty())
5763 AssemblyAlgorithms::loadSequences(&g, pathIn.c_str());
58 for_each(opt::inFiles.begin(), opt::inFiles.end(), bind1st(
59 ptr_fun(AssemblyAlgorithms::loadSequences<SequenceCollectionHash>),
60 &g));
64 for_each(opt::inFiles.begin(), opt::inFiles.end(), [&g](std::string s) {
65 AssemblyAlgorithms::loadSequences(&g, s);
66 });
6167 size_t numLoaded = g.size();
6268 if (!opt::db.empty())
6369 addToDb(db, "loadedKmer", numLoaded);
6975 exit(EXIT_FAILURE);
7076 }
7177
72 AssemblyAlgorithms::setCoverageParameters(
73 AssemblyAlgorithms::coverageHistogram(g));
78 AssemblyAlgorithms::setCoverageParameters(AssemblyAlgorithms::coverageHistogram(g));
7479
7580 if (opt::kc > 0) {
7681 cout << "Minimum k-mer multiplicity kc is " << opt::kc << endl;
7782 cout << "Removing low-multiplicity k-mers" << endl;
7883 size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(g, opt::kc);
79 cout << "Removed " << removed
80 << " low-multiplicity k-mers, " << g.size()
81 << " k-mers remaining" << std::endl;
84 cout << "Removed " << removed << " low-multiplicity k-mers, " << g.size()
85 << " k-mers remaining" << std::endl;
8286 }
8387
8488 cout << "Generating adjacency" << endl;
121125
122126 size_t numAssembled = g.size();
123127 size_t numRemoved = numLoaded - numAssembled;
124 cout << "Removed " << numRemoved << " k-mer.\n"
125 "The signal-to-noise ratio (SNR) is "
126 << 10 * log10((double)numAssembled / numRemoved)
127 << " dB.\n";
128 cout << "Removed " << numRemoved
129 << " k-mer.\n"
130 "The signal-to-noise ratio (SNR) is "
131 << 10 * log10((double)numAssembled / numRemoved) << " dB.\n";
128132 }
129133
130 int main(int argc, char* const* argv)
134 int
135 main(int argc, char* const* argv)
131136 {
132137 Timer timer("Total");
133138
141146
142147 bool krange = opt::kMin != opt::kMax;
143148 if (krange)
144 cout << "Assembling k=" << opt::kMin << "-" << opt::kMax
145 << ":" << opt::kStep << endl;
149 cout << "Assembling k=" << opt::kMin << "-" << opt::kMax << ":" << opt::kStep << endl;
146150
147151 if (!opt::db.empty()) {
148 init(db,
149 opt::getUvalue(),
150 opt::getVvalue(),
151 "ABYSS",
152 opt::getCommand(),
153 opt::getMetaValue());
152 init(
153 db,
154 opt::getUvalue(),
155 opt::getVvalue(),
156 "ABYSS",
157 opt::getCommand(),
158 opt::getMetaValue());
154159 addToDb(db, "SS", opt::ss);
155160 addToDb(db, "k", opt::kmerSize);
156161 addToDb(db, "singleK", opt::singleKmerSize);
174179 opt::erodeStrand = (unsigned)-1;
175180 opt::coverage = -1;
176181 opt::trimLen = k;
177 opt::bubbleLen = 3*k;
182 opt::bubbleLen = 3 * k;
178183 }
179184
180185 ostringstream k0, k1;
1818 /** A container of BranchRecord. */
1919 class BranchGroup
2020 {
21 public:
22 typedef std::vector<BranchRecord> BranchGroupData;
23 typedef BranchGroupData::iterator iterator;
24 typedef BranchGroupData::const_iterator const_iterator;
25
26 BranchGroup()
27 : m_dir(SENSE), m_maxNumBranches(0),
28 m_noExt(false), m_status(BGS_ACTIVE)
29 { }
30
31 BranchGroup(extDirection dir, size_t maxNumBranches,
32 const BranchRecord::V &origin)
33 : m_dir(dir), m_origin(origin),
34 m_maxNumBranches(maxNumBranches), m_noExt(false),
35 m_status(BGS_ACTIVE)
36 {
37 m_branches.reserve(m_maxNumBranches);
38 }
39
40 BranchGroup(extDirection dir, size_t maxNumBranches,
41 const BranchRecord::V &origin, const BranchRecord& branch)
42 : m_dir(dir), m_origin(origin),
43 m_maxNumBranches(maxNumBranches), m_noExt(false),
44 m_status(BGS_ACTIVE)
45 {
46 m_branches.reserve(m_maxNumBranches);
47 m_branches.push_back(branch);
48 }
49
50 BranchGroup(const BranchGroup& o)
51 : m_branches(o.m_branches), m_dir(o.m_dir),
52 m_origin(o.m_origin),
53 m_maxNumBranches(o.m_maxNumBranches),
54 m_noExt(o.m_noExt), m_status(o.m_status)
55 {
56 m_branches.reserve(m_maxNumBranches);
57 }
58
59 /** Add a branch to this group. */
60 BranchRecord& addBranch(const BranchRecord& branch)
61 {
62 assert(m_branches.size() < m_maxNumBranches);
63 m_branches.push_back(branch);
64 return m_branches.back();
65 }
66
67 /** Add a branch to this group and extend the new branch with
68 * the given k-mer. */
69 void addBranch(const BranchRecord& branch,
70 const BranchRecord::V& kmer)
71 {
72 if (m_branches.size() < m_maxNumBranches)
73 addBranch(branch).push_back(
74 std::make_pair(kmer, BranchRecord::VP()));
75 else
76 m_status = BGS_TOOMANYBRANCHES;
77 }
78
79 /** Return the specified branch. */
80 BranchRecord& operator [](unsigned id)
81 {
82 return m_branches[id];
83 }
84
85 /** Return the number of branches in this group. */
86 size_t size() const { return m_branches.size(); }
87
88 /** Return whether a branch contains the specified k-mer at
89 * the index i. */
90 bool exists(unsigned i, const BranchRecord::V& kmer) const
91 {
92 for (BranchGroupData::const_iterator it
93 = m_branches.begin();
94 it != m_branches.end(); ++it)
95 if (it->exists(i, kmer))
96 return true;
97 return false;
98 }
99
100 // return the current status of the branch
101 BranchGroupStatus getStatus() const { return m_status; }
102
103 // set the no extension flag
104 void setNoExtension() { m_noExt = true; }
105
106 // is the no extension flag set?
107 bool isNoExt() const { return m_noExt; }
108
109 // return the direction of growth
110 extDirection getDirection() const { return m_dir; }
111
112 iterator begin() { return m_branches.begin(); }
113 iterator end() { return m_branches.end(); }
114 const_iterator begin() const { return m_branches.begin(); }
115 const_iterator end() const { return m_branches.end(); }
116
117 // Check the stop conditions for the bubble growth
118 BranchGroupStatus
119 updateStatus(unsigned maxLength)
120 {
121 assert(m_branches.size() <= m_maxNumBranches);
122
123 if (m_status != BGS_ACTIVE)
124 return m_status;
125
126 // Check if the no extension flag is set
127 if(m_noExt)
128 {
129 m_status = BGS_NOEXT;
130 return m_status;
131 }
132
133 // Check if any branches are too long or any sequence has a loop
134 for (BranchGroupData::const_iterator iter = m_branches.begin();
135 iter != m_branches.end(); ++iter) {
136 if (iter->isTooLong(maxLength)) {
137 m_status = BGS_TOOLONG;
21 public:
22 typedef std::vector<BranchRecord> BranchGroupData;
23 typedef BranchGroupData::iterator iterator;
24 typedef BranchGroupData::const_iterator const_iterator;
25
26 BranchGroup()
27 : m_dir(SENSE)
28 , m_maxNumBranches(0)
29 , m_noExt(false)
30 , m_status(BGS_ACTIVE)
31 {}
32
33 BranchGroup(extDirection dir, size_t maxNumBranches, const BranchRecord::V& origin)
34 : m_dir(dir)
35 , m_origin(origin)
36 , m_maxNumBranches(maxNumBranches)
37 , m_noExt(false)
38 , m_status(BGS_ACTIVE)
39 {
40 m_branches.reserve(m_maxNumBranches);
41 }
42
43 BranchGroup(
44 extDirection dir,
45 size_t maxNumBranches,
46 const BranchRecord::V& origin,
47 const BranchRecord& branch)
48 : m_dir(dir)
49 , m_origin(origin)
50 , m_maxNumBranches(maxNumBranches)
51 , m_noExt(false)
52 , m_status(BGS_ACTIVE)
53 {
54 m_branches.reserve(m_maxNumBranches);
55 m_branches.push_back(branch);
56 }
57
58 BranchGroup(const BranchGroup& o)
59 : m_branches(o.m_branches)
60 , m_dir(o.m_dir)
61 , m_origin(o.m_origin)
62 , m_maxNumBranches(o.m_maxNumBranches)
63 , m_noExt(o.m_noExt)
64 , m_status(o.m_status)
65 {
66 m_branches.reserve(m_maxNumBranches);
67 }
68
69 /** Add a branch to this group. */
70 BranchRecord& addBranch(const BranchRecord& branch)
71 {
72 assert(m_branches.size() < m_maxNumBranches);
73 m_branches.push_back(branch);
74 return m_branches.back();
75 }
76
77 /** Add a branch to this group and extend the new branch with
78 * the given k-mer. */
79 void addBranch(const BranchRecord& branch, const BranchRecord::V& kmer)
80 {
81 if (m_branches.size() < m_maxNumBranches)
82 addBranch(branch).push_back(std::make_pair(kmer, BranchRecord::VP()));
83 else
84 m_status = BGS_TOOMANYBRANCHES;
85 }
86
87 /** Return the specified branch. */
88 BranchRecord& operator[](unsigned id) { return m_branches[id]; }
89
90 /** Return the number of branches in this group. */
91 size_t size() const { return m_branches.size(); }
92
93 /** Return whether a branch contains the specified k-mer at
94 * the index i. */
95 bool exists(unsigned i, const BranchRecord::V& kmer) const
96 {
97 for (BranchGroupData::const_iterator it = m_branches.begin(); it != m_branches.end(); ++it)
98 if (it->exists(i, kmer))
99 return true;
100 return false;
101 }
102
103 // return the current status of the branch
104 BranchGroupStatus getStatus() const { return m_status; }
105
106 // set the no extension flag
107 void setNoExtension() { m_noExt = true; }
108
109 // is the no extension flag set?
110 bool isNoExt() const { return m_noExt; }
111
112 // return the direction of growth
113 extDirection getDirection() const { return m_dir; }
114
115 iterator begin() { return m_branches.begin(); }
116 iterator end() { return m_branches.end(); }
117 const_iterator begin() const { return m_branches.begin(); }
118 const_iterator end() const { return m_branches.end(); }
119
120 // Check the stop conditions for the bubble growth
121 BranchGroupStatus updateStatus(unsigned maxLength)
122 {
123 assert(m_branches.size() <= m_maxNumBranches);
124
125 if (m_status != BGS_ACTIVE)
126 return m_status;
127
128 // Check if the no extension flag is set
129 if (m_noExt) {
130 m_status = BGS_NOEXT;
138131 return m_status;
139132 }
140 }
141
142 BranchGroupData::const_iterator it = m_branches.begin();
143 const BranchRecord::V& lastSeq = it->back().first;
144 while (++it != m_branches.end())
145 if (it->back().first != lastSeq)
146 return m_status = BGS_ACTIVE;
147
148 // All the branches of the bubble have joined.
149 // Remove the last base, which is identical for every branch.
150 std::for_each(m_branches.begin(), m_branches.end(),
151 std::mem_fun_ref(&BranchRecord::pop_back));
152
153 // Sort the branches by coverage.
154 sort_by_transform(m_branches.begin(), m_branches.end(),
155 std::mem_fun_ref(&BranchRecord::calculateBranchMultiplicity));
156 reverse(m_branches.begin(), m_branches.end());
157
158 return m_status = BGS_JOINED;
159 }
160
161 /** Return whether any branches of this group are active. */
162 bool
163 isActive() const
164 {
165 for (BranchGroupData::const_iterator it = m_branches.begin();
166 it != m_branches.end(); ++it)
167 if (it->isActive())
168 return true;
169 return false;
170 }
171
172 /** Return whether this branch is extendable. */
173 bool
174 isExtendable()
175 {
176 if (m_noExt)
133
134 // Check if any branches are too long or any sequence has a loop
135 for (BranchGroupData::const_iterator iter = m_branches.begin(); iter != m_branches.end();
136 ++iter) {
137 if (iter->isTooLong(maxLength)) {
138 m_status = BGS_TOOLONG;
139 return m_status;
140 }
141 }
142
143 BranchGroupData::const_iterator it = m_branches.begin();
144 const BranchRecord::V& lastSeq = it->back().first;
145 while (++it != m_branches.end())
146 if (it->back().first != lastSeq)
147 return m_status = BGS_ACTIVE;
148
149 // All the branches of the bubble have joined.
150 // Remove the last base, which is identical for every branch.
151 std::for_each(
152 m_branches.begin(), m_branches.end(), [](BranchRecord& b) { return b.pop_back(); });
153
154 // Sort the branches by coverage.
155 std::function<int(const BranchRecord&)> lambda = [](const BranchRecord& b) {
156 return b.calculateBranchMultiplicity();
157 };
158 sort_by_transform(m_branches.begin(), m_branches.end(), lambda);
159 reverse(m_branches.begin(), m_branches.end());
160
161 return m_status = BGS_JOINED;
162 }
163
164 /** Return whether any branches of this group are active. */
165 bool isActive() const
166 {
167 for (BranchGroupData::const_iterator it = m_branches.begin(); it != m_branches.end(); ++it)
168 if (it->isActive())
169 return true;
177170 return false;
178
179 // A group is extendable when all the branches are the same
180 // length. All the branches are lockstepped for growth.
181 BranchGroupData::iterator it = m_branches.begin();
182 unsigned length = it++->size();
183 for (; it != m_branches.end(); ++it)
184 if (it->size() != length)
171 }
172
173 /** Return whether this branch is extendable. */
174 bool isExtendable()
175 {
176 if (m_noExt)
185177 return false;
186 return true;
187 }
188
189 /** Return whether this branch is ambiguous at its origin. Also
190 * returns false if the origin of the branch has since been deleted.
191 */
192 bool
193 isAmbiguous(const SequenceCollectionHash& g) const
194 {
195 // Get fresh data from the collection to check that this bubble
196 // does in fact still exist.
197 const BranchRecord::VP& data = g.getSeqAndData(m_origin).second;
198 return data.deleted() ? false : data.isAmbiguous(m_dir);
199 }
200
201 private:
202 BranchGroup& operator =(const BranchGroup& o);
203
204 BranchGroupData m_branches;
205 extDirection m_dir;
206 BranchRecord::V m_origin;
207 size_t m_maxNumBranches;
208 bool m_noExt;
209 BranchGroupStatus m_status;
178
179 // A group is extendable when all the branches are the same
180 // length. All the branches are lockstepped for growth.
181 BranchGroupData::iterator it = m_branches.begin();
182 unsigned length = it++->size();
183 for (; it != m_branches.end(); ++it)
184 if (it->size() != length)
185 return false;
186 return true;
187 }
188
189 /** Return whether this branch is ambiguous at its origin. Also
190 * returns false if the origin of the branch has since been deleted.
191 */
192 bool isAmbiguous(const SequenceCollectionHash& g) const
193 {
194 // Get fresh data from the collection to check that this bubble
195 // does in fact still exist.
196 const BranchRecord::VP& data = g.getSeqAndData(m_origin).second;
197 return data.deleted() ? false : data.isAmbiguous(m_dir);
198 }
199
200 private:
201 BranchGroup& operator=(const BranchGroup& o);
202
203 BranchGroupData m_branches;
204 extDirection m_dir;
205 BranchRecord::V m_origin;
206 size_t m_maxNumBranches;
207 bool m_noExt;
208 BranchGroupStatus m_status;
210209 };
211210
212211 #endif
9898 " -n, --num-locks=N number of write locks on bloom filter [1000]\n"
9999 " -q, --trim-quality=N trim bases from the ends of reads whose\n"
100100 " quality is less than the threshold\n"
101 " -t, --bloom-type=STR 'konnector' or 'rolling-hash' [konnector]\n"
101 " -t, --bloom-type=STR 'konnector', 'rolling-hash', or 'counting' [konnector]\n"
102102 " --standard-quality zero quality is `!' (33)\n"
103103 " default for FASTQ and SAM files\n"
104104 " --illumina-quality zero quality is `@' (64)\n"
9090 }
9191
9292 /**
93 * Returns the sum of all kmer multiplicities in `seq` by querying `bloom`
94 */
95 template<typename CountingBloomT>
96 inline static unsigned
97 getSeqAbsoluteKmerCoverage(const Sequence& seq, const CountingBloomT& bloom)
98 {
99 const unsigned k = bloom.getKmerSize();
100 const unsigned numHashes = bloom.getHashNum();
101 assert(seq.length() >= k);
102 unsigned coverage = 0;
103 for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end();
104 ++it) {
105 coverage += bloom.minCount(*it);
106 }
107 return coverage;
108 }
109
110 /**
93111 * Translate a DNA sequence to an equivalent path in the
94112 * de Bruijn graph.
95113 */
169187 size_t contigID;
170188 /** length of contig (bp) */
171189 unsigned length;
190 /** coverage of contig */
191 unsigned coverage;
172192 /** FASTA ID of seeding read */
173193 std::string readID;
174194 /** Type of sequence used to seed contig (branch k-mer or full read) */
432452 inline static void
433453 printContig(
434454 const Sequence& seq,
455 unsigned length,
456 unsigned coverage,
435457 size_t contigID,
436458 const std::string& readID,
437459 unsigned k,
447469
448470 /* add FASTA comment indicating extended read id */
449471 std::ostringstream comment;
472 comment << length << ' ' << coverage << ' ';
450473 comment << "read:" << readID;
451474 assert(id.good());
452475 contig.id = id.str();
509532 * Output a contig sequence if it is not redundant, i.e. it has not already
510533 * been generated from a different read / thread of execution.
511534 */
512 template<typename AssembledKmerSetT, typename AssemblyStreamsT>
535 template<typename SolidKmerSetT, typename AssembledKmerSetT, typename AssemblyStreamsT>
513536 inline static void
514537 outputContig(
515538 const Path<Vertex>& contigPath,
516539 ContigRecord& rec,
540 SolidKmerSetT& solidKmerSet,
517541 AssembledKmerSetT& assembledKmerSet,
518542 KmerHash& contigEndKmers,
519543 const AssemblyParams& params,
571595 if (!redundant) {
572596 #pragma omp critical(fasta)
573597 {
598 rec.length = seq.length();
599 rec.coverage = getSeqAbsoluteKmerCoverage(seq, solidKmerSet);
600
574601 /* add contig to output FASTA */
575 printContig(seq, counters.contigID, rec.readID, params.k, streams.out);
602 printContig(seq, rec.length, rec.coverage, counters.contigID, rec.readID, params.k, streams.out);
576603
577604 /* add contig to checkpoint FASTA file */
578605 if (params.checkpointsEnabled())
579 printContig(seq, counters.contigID, rec.readID, params.k, streams.checkpointOut);
606 printContig(seq, rec.length, rec.coverage, counters.contigID, rec.readID, params.k, streams.checkpointOut);
580607
581608 rec.contigID = counters.contigID;
582 rec.length = seq.length();
583609
584610 counters.contigID++;
585611 counters.basesAssembled += seq.length();
841867
842868 /* output contig to FASTA file */
843869 outputContig(
844 contigPath, contigRec, assembledKmerSet, contigEndKmers, params, counters, streams);
870 contigPath, contigRec, solidKmerSet, assembledKmerSet, contigEndKmers, params, counters, streams);
845871 }
846872
847873 /* mark contig k-mers as visited */
0 2020-01-30 Johnathan Wong <jowong@bcgsc.ca>
1
2 * Release version 2.2.4
3
4 General:
5 * Replace functions that clang-8 reports as deprecated (bind1st, ptr_fun, mem_fun_ref) with lambdas
6
7 Sealer:
8 * Remove unsupported -D option from help page
9
10 abyss-bloom:
11 * Document the 'counting' Bloom filter type on the help page
12
13 abyss-bloom-dbg:
14 * Report coverage information of unitigs
15
16
017 2019-09-20 Johnathan Wong <jowong@bcgsc.ca>
118
219 * Release version 2.2.3
2222 static T SENTINEL() { return std::numeric_limits<T>::max(); }
2323
2424 public:
25 /** Count the occurrences of the symbols of [first, last). */
26 template<typename It>
27 void assign(It first, It last)
28 {
29 assert(first < last);
30 m_data.clear();
2531
26 /** Count the occurrences of the symbols of [first, last). */
27 template<typename It>
28 void assign(It first, It last)
29 {
30 assert(first < last);
31 m_data.clear();
32 // Determine the size of the alphabet ignoring the sentinel.
33 T n = 0;
34 for (It it = first; it != last; ++it)
35 if (*it != SENTINEL())
36 n = std::max(n, *it);
37 n++;
3238
33 // Determine the size of the alphabet ignoring the sentinel.
34 T n = 0;
35 for (It it = first; it != last; ++it)
36 if (*it != SENTINEL())
37 n = std::max(n, *it);
38 n++;
39 assert(n < std::numeric_limits<T>::max());
40 m_data.resize(n, wat_array::BitArray(last - first));
3941
40 assert(n < std::numeric_limits<T>::max());
41 m_data.resize(n, wat_array::BitArray(last - first));
42 size_t i = 0;
43 for (It it = first; it != last; ++it, ++i) {
44 T c = *it;
45 if (c == SENTINEL())
46 continue;
47 assert(c < m_data.size());
48 m_data[c].SetBit(1, i);
49 }
4250
43 size_t i = 0;
44 for (It it = first; it != last; ++it, ++i) {
45 T c = *it;
46 if (c == SENTINEL())
47 continue;
48 assert(c < m_data.size());
49 m_data[c].SetBit(1, i);
51 std::for_each(
52 m_data.begin(), m_data.end(), [](wat_array::BitArray& b) { return b.Build(); });
5053 }
5154
52 std::for_each(m_data.begin(), m_data.end(),
53 std::mem_fun_ref(&wat_array::BitArray::Build));
54 }
55 /** Return the size of the string. */
56 size_t size() const
57 {
58 assert(!m_data.empty());
59 return m_data.front().length();
60 }
5561
56 /** Return the size of the string. */
57 size_t size() const
58 {
59 assert(!m_data.empty());
60 return m_data.front().length();
61 }
62 /** Return the number of occurrences of the specified symbol. */
63 size_t count(T c) const { return m_data[c].one_num(); }
6264
63 /** Return the number of occurrences of the specified symbol. */
64 size_t count(T c) const
65 {
66 return m_data[c].one_num();
67 }
65 /** Return the count of symbol c in s[0, i). */
66 size_t rank(T c, size_t i) const { return m_data[c].Rank(1, i); }
6867
69 /** Return the count of symbol c in s[0, i). */
70 size_t rank(T c, size_t i) const
71 {
72 return m_data[c].Rank(1, i);
73 }
68 /** Return the symbol at the specified position. */
69 T at(size_t i) const
70 {
71 assert(!m_data.empty());
72 assert(i < m_data.front().length());
73 for (Data::const_iterator it = m_data.begin(); it != m_data.end(); ++it)
74 if (it->Lookup(i))
75 return it - m_data.begin();
76 return std::numeric_limits<T>::max();
77 }
7478
75 /** Return the symbol at the specified position. */
76 T at(size_t i) const
77 {
78 assert(!m_data.empty());
79 assert(i < m_data.front().length());
80 for (Data::const_iterator it = m_data.begin();
81 it != m_data.end(); ++it)
82 if (it->Lookup(i))
83 return it - m_data.begin();
84 return std::numeric_limits<T>::max();
85 }
79 /** Store this data structure. */
80 friend std::ostream& operator<<(std::ostream& out, const BitArrays& o)
81 {
82 uint32_t n = o.m_data.size();
83 out.write(reinterpret_cast<char*>(&n), sizeof n);
84 for (Data::const_iterator it = o.m_data.begin(); it != o.m_data.end(); ++it)
85 it->Save(out);
86 return out;
87 }
8688
87 /** Store this data structure. */
88 friend std::ostream& operator<<(std::ostream& out, const BitArrays& o)
89 {
90 uint32_t n = o.m_data.size();
91 out.write(reinterpret_cast<char*>(&n), sizeof n);
92 for (Data::const_iterator it = o.m_data.begin();
93 it != o.m_data.end(); ++it)
94 it->Save(out);
95 return out;
96 }
97
98 /** Load this data structure. */
99 friend std::istream& operator>>(std::istream& in, BitArrays& o)
100 {
101 o.m_data.clear();
102 uint32_t n = 0;
103 if (!in.read(reinterpret_cast<char*>(&n), sizeof n))
89 /** Load this data structure. */
90 friend std::istream& operator>>(std::istream& in, BitArrays& o)
91 {
92 o.m_data.clear();
93 uint32_t n = 0;
94 if (!in.read(reinterpret_cast<char*>(&n), sizeof n))
95 return in;
96 assert(n > 0);
97 assert(n < std::numeric_limits<T>::max());
98 o.m_data.resize(n);
99 for (Data::iterator it = o.m_data.begin(); it != o.m_data.end(); ++it)
100 it->Load(in);
104101 return in;
105 assert(n > 0);
106 assert(n < std::numeric_limits<T>::max());
107 o.m_data.resize(n);
108 for (Data::iterator it = o.m_data.begin();
109 it != o.m_data.end(); ++it)
110 it->Load(in);
111 return in;
112 }
102 }
113103
114104 private:
115105 typedef std::vector<wat_array::BitArray> Data;
88 #include "ContigPath.h"
99 #include "ContigProperties.h"
1010 #include "FastaReader.h"
11 #include "IOUtil.h"
12 #include "Uncompress.h"
1311 #include "Graph/ContigGraph.h"
1412 #include "Graph/ContigGraphAlgorithms.h"
1513 #include "Graph/DirectedGraph.h"
1614 #include "Graph/GraphIO.h"
1715 #include "Graph/GraphUtil.h"
16 #include "IOUtil.h"
17 #include "Uncompress.h"
18 #include <algorithm>
1819 #include <boost/lambda/bind.hpp>
1920 #include <boost/lambda/lambda.hpp>
20 #include <algorithm>
2121 #include <fstream>
2222 #include <functional>
2323 #include <getopt.h>
3636 #define PROGRAM "abyss-filtergraph"
3737
3838 static const char VERSION_MESSAGE[] =
39 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
40 "Written by Tony Raymond.\n"
41 "\n"
42 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
39 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
40 "Written by Tony Raymond.\n"
41 "\n"
42 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
4343
4444 static const char USAGE_MESSAGE[] =
45 "Usage: " PROGRAM " -k<kmer> [OPTION]... ADJ [FASTA]\n"
46 "Remove short contigs that do not contribute any relevant\n"
47 "information to the assembly.\n"
48 "\n"
49 " Arguments:\n"
50 "\n"
51 " ADJ contig adjacency graph\n"
52 " FASTA contigs to check consistency of ADJ edges\n"
53 "\n"
54 " Options:\n"
55 "\n"
56 " -k, --kmer=N k-mer size\n"
57 " --SS expect contigs to be oriented correctly\n"
58 " --no-SS no assumption about contig orientation\n"
59 " -T, --island=N remove islands shorter than N [0]\n"
60 " -t, --tip=N remove tips shorter than N [0]\n"
61 " -l, --length=N remove contigs shorter than N [0]\n"
62 " -L, --max-length=N remove contigs longer than N [0]\n"
63 " -c, --coverage=FLOAT remove contigs with mean k-mer coverage less than FLOAT [0]\n"
64 " -C, --max-coverage=FLOAT remove contigs with mean k-mer coverage at least FLOAT [0]\n"
65 " --shim remove filler contigs that only contribute\n"
66 " to adjacency [default]\n"
67 " --no-shim disable filler contigs removal\n"
68 " --shim-max-degree=N only remove shims where the smaller of \n"
69 " in/out degree is smaller than N [1]\n"
70 " -m, --min-overlap=N require a minimum overlap of N bases [10]\n"
71 " --assemble assemble unambiguous paths\n"
72 " --no-assemble disable assembling of paths [default]\n"
73 " -g, --graph=FILE write the contig adjacency graph to FILE\n"
74 " -i, --ignore=FILE ignore contigs seen in FILE\n"
75 " -r, --remove=FILE remove contigs seen in FILE\n"
76 " --adj output the graph in ADJ format [default]\n"
77 " --asqg output the graph in ASQG format\n"
78 " --dot output the graph in GraphViz format\n"
79 " --gfa output the graph in GFA1 format\n"
80 " --gfa1 output the graph in GFA1 format\n"
81 " --gfa2 output the graph in GFA2 format\n"
82 " --gv output the graph in GraphViz format\n"
83 " --sam output the graph in SAM format\n"
84 " -v, --verbose display verbose output\n"
85 " --help display this help and exit\n"
86 " --version output version information and exit\n"
87 "\n"
88 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
45 "Usage: " PROGRAM " -k<kmer> [OPTION]... ADJ [FASTA]\n"
46 "Remove short contigs that do not contribute any relevant\n"
47 "information to the assembly.\n"
48 "\n"
49 " Arguments:\n"
50 "\n"
51 " ADJ contig adjacency graph\n"
52 " FASTA contigs to check consistency of ADJ edges\n"
53 "\n"
54 " Options:\n"
55 "\n"
56 " -k, --kmer=N k-mer size\n"
57 " --SS expect contigs to be oriented correctly\n"
58 " --no-SS no assumption about contig orientation\n"
59 " -T, --island=N remove islands shorter than N [0]\n"
60 " -t, --tip=N remove tips shorter than N [0]\n"
61 " -l, --length=N remove contigs shorter than N [0]\n"
62 " -L, --max-length=N remove contigs longer than N [0]\n"
63 " -c, --coverage=FLOAT remove contigs with mean k-mer coverage less than FLOAT [0]\n"
64 " -C, --max-coverage=FLOAT remove contigs with mean k-mer coverage at least FLOAT [0]\n"
65 " --shim remove filler contigs that only contribute\n"
66 " to adjacency [default]\n"
67 " --no-shim disable filler contigs removal\n"
68 " --shim-max-degree=N only remove shims where the smaller of \n"
69 " in/out degree is smaller than N [1]\n"
70 " -m, --min-overlap=N require a minimum overlap of N bases [10]\n"
71 " --assemble assemble unambiguous paths\n"
72 " --no-assemble disable assembling of paths [default]\n"
73 " -g, --graph=FILE write the contig adjacency graph to FILE\n"
74 " -i, --ignore=FILE ignore contigs seen in FILE\n"
75 " -r, --remove=FILE remove contigs seen in FILE\n"
76 " --adj output the graph in ADJ format [default]\n"
77 " --asqg output the graph in ASQG format\n"
78 " --dot output the graph in GraphViz format\n"
79 " --gfa output the graph in GFA1 format\n"
80 " --gfa1 output the graph in GFA1 format\n"
81 " --gfa2 output the graph in GFA2 format\n"
82 " --gv output the graph in GraphViz format\n"
83 " --sam output the graph in SAM format\n"
84 " -v, --verbose display verbose output\n"
85 " --help display this help and exit\n"
86 " --version output version information and exit\n"
87 "\n"
88 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
8989
9090 namespace opt {
91 unsigned k; // used by ContigProperties
92
93 /** Run a strand-specific RNA-Seq assembly. */
94 static int ss;
95
96 /** Remove island contigs less than this length. */
97 static unsigned minIslandLen = 0;
98
99 /** Remove tips less than this length. */
100 static unsigned minTipLen = 0;
101
102 /** Remove all contigs less than this length. */
103 static unsigned minLen = 0;
104
105 /** Remove all contigs more than this length. */
106 static unsigned maxLen = 0;
107
108 /** Remove contigs with mean k-mer coverage less than this threshold. */
109 static float minCoverage = 0;
110
111 /** Remove contigs with mean k-mer coverage at least this threshold. */
112 static float maxCoverage = 0;
113
114 /** Remove short contigs that don't contribute any sequence. */
115 static int shim = 1;
116
117 /** Only remove shims where the smaller of in/out degree is small
118 * enough. */
119 static unsigned shimMaxDegree = 1;
120
121 /** Assemble unambiguous paths. */
122 static int assemble = 0;
123
124 /** Write the contig adjacency graph to this file. */
125 static string graphPath;
126
127 /** Contigs to ignore. */
128 static string ignorePath;
129
130 /** Contigs to remove. */
131 static string removePath;
132
133 /** The minimum overlap allowed between two contigs. */
134 static int minOverlap = 10;
135
136 /** Output graph format. */
137 int format = ADJ; // used by ContigProperties
91 unsigned k; // used by ContigProperties
92
93 /** Run a strand-specific RNA-Seq assembly. */
94 static int ss;
95
96 /** Remove island contigs less than this length. */
97 static unsigned minIslandLen = 0;
98
99 /** Remove tips less than this length. */
100 static unsigned minTipLen = 0;
101
102 /** Remove all contigs less than this length. */
103 static unsigned minLen = 0;
104
105 /** Remove all contigs more than this length. */
106 static unsigned maxLen = 0;
107
108 /** Remove contigs with mean k-mer coverage less than this threshold. */
109 static float minCoverage = 0;
110
111 /** Remove contigs with mean k-mer coverage at least this threshold. */
112 static float maxCoverage = 0;
113
114 /** Remove short contigs that don't contribute any sequence. */
115 static int shim = 1;
116
117 /** Only remove shims where the smaller of in/out degree is small
118 * enough. */
119 static unsigned shimMaxDegree = 1;
120
121 /** Assemble unambiguous paths. */
122 static int assemble = 0;
123
124 /** Write the contig adjacency graph to this file. */
125 static string graphPath;
126
127 /** Contigs to ignore. */
128 static string ignorePath;
129
130 /** Contigs to remove. */
131 static string removePath;
132
133 /** The minimum overlap allowed between two contigs. */
134 static int minOverlap = 10;
135
136 /** Output graph format. */
137 int format = ADJ; // used by ContigProperties
138138 }
139139
140140 static const char shortopts[] = "c:C:g:i:r:k:l:L:m:t:T:v";
141141
142 enum { OPT_HELP = 1, OPT_VERSION, OPT_SHIM_MAX_DEG };
142 enum
143 {
144 OPT_HELP = 1,
145 OPT_VERSION,
146 OPT_SHIM_MAX_DEG
147 };
143148
144149 static const struct option longopts[] = {
145 { "adj", no_argument, &opt::format, ADJ },
146 { "asqg", no_argument, &opt::format, ASQG },
147 { "dot", no_argument, &opt::format, DOT },
148 { "gfa", no_argument, &opt::format, GFA1 },
149 { "gfa1", no_argument, &opt::format, GFA1 },
150 { "gfa2", no_argument, &opt::format, GFA2 },
151 { "gv", no_argument, &opt::format, DOT },
152 { "sam", no_argument, &opt::format, SAM },
153 { "graph", required_argument, NULL, 'g' },
154 { "ignore", required_argument, NULL, 'i' },
155 { "remove", required_argument, NULL, 'r' },
156 { "SS", no_argument, &opt::ss, 1 },
157 { "no-SS", no_argument, &opt::ss, 0 },
158 { "kmer", required_argument, NULL, 'k' },
159 { "island", required_argument, NULL, 'T' },
160 { "tip", required_argument, NULL, 't' },
161 { "length", required_argument, NULL, 'l' },
162 { "max-length", required_argument, NULL, 'L' },
163 { "coverage", required_argument, NULL, 'c' },
164 { "max-coverage", required_argument, NULL, 'C' },
165 { "shim", no_argument, &opt::shim, 1 },
166 { "no-shim", no_argument, &opt::shim, 0 },
150 { "adj", no_argument, &opt::format, ADJ },
151 { "asqg", no_argument, &opt::format, ASQG },
152 { "dot", no_argument, &opt::format, DOT },
153 { "gfa", no_argument, &opt::format, GFA1 },
154 { "gfa1", no_argument, &opt::format, GFA1 },
155 { "gfa2", no_argument, &opt::format, GFA2 },
156 { "gv", no_argument, &opt::format, DOT },
157 { "sam", no_argument, &opt::format, SAM },
158 { "graph", required_argument, NULL, 'g' },
159 { "ignore", required_argument, NULL, 'i' },
160 { "remove", required_argument, NULL, 'r' },
161 { "SS", no_argument, &opt::ss, 1 },
162 { "no-SS", no_argument, &opt::ss, 0 },
163 { "kmer", required_argument, NULL, 'k' },
164 { "island", required_argument, NULL, 'T' },
165 { "tip", required_argument, NULL, 't' },
166 { "length", required_argument, NULL, 'l' },
167 { "max-length", required_argument, NULL, 'L' },
168 { "coverage", required_argument, NULL, 'c' },
169 { "max-coverage", required_argument, NULL, 'C' },
170 { "shim", no_argument, &opt::shim, 1 },
171 { "no-shim", no_argument, &opt::shim, 0 },
167172 { "shim-max-degree", required_argument, NULL, OPT_SHIM_MAX_DEG },
168 { "assemble", no_argument, &opt::assemble, 1 },
169 { "no-assemble", no_argument, &opt::assemble, 0 },
170 { "min-overlap", required_argument, NULL, 'm' },
171 { "verbose", no_argument, NULL, 'v' },
172 { "help", no_argument, NULL, OPT_HELP },
173 { "version", no_argument, NULL, OPT_VERSION },
173 { "assemble", no_argument, &opt::assemble, 1 },
174 { "no-assemble", no_argument, &opt::assemble, 0 },
175 { "min-overlap", required_argument, NULL, 'm' },
176 { "verbose", no_argument, NULL, 'v' },
177 { "help", no_argument, NULL, OPT_HELP },
178 { "version", no_argument, NULL, OPT_VERSION },
174179 { NULL, 0, NULL, 0 }
175180 };
176181
177182 static vector<ContigID> g_removed;
178183
179184 /** Contig adjacency graph. */
180 typedef ContigGraph<DirectedGraph<ContigProperties, Distance> > Graph;
185 typedef ContigGraph<DirectedGraph<ContigProperties, Distance>> Graph;
181186 typedef Graph::vertex_descriptor vertex_descriptor;
182187 typedef Graph::edge_descriptor edge_descriptor;
183188
184189 /** Data for verbose output. */
185 static struct {
190 static struct
191 {
186192 unsigned removed;
187193 unsigned tails;
188194 unsigned too_long;
193199 } g_count;
194200
195201 /** Returns whether the contig can be removed from the graph. */
196 static bool removable(const Graph* pg, vertex_descriptor v)
202 static bool
203 removable(const Graph* pg, vertex_descriptor v)
197204 {
198205 typedef graph_traits<Graph> GTraits;
199206 typedef GTraits::out_edge_iterator OEit;
247254 if (g[*maxvw].distance < g[*vw].distance)
248255 maxvw = vw;
249256
250 if (g[*maxuv].distance + (int)g[v].length + g[*maxvw].distance >
251 -opt::minOverlap) {
257 if (g[*maxuv].distance + (int)g[v].length + g[*maxvw].distance > -opt::minOverlap) {
252258 g_count.too_long++;
253259 return false;
254260 }
256262 }
257263
258264 /** Data to store information of an edge. */
259 struct EdgeInfo {
265 struct EdgeInfo
266 {
260267 vertex_descriptor u;
261268 vertex_descriptor w;
262269 edge_bundle_type<Graph>::type ep;
263270
264271 EdgeInfo(vertex_descriptor u, vertex_descriptor w, int ep)
265 : u(u), w(w), ep(ep) {}
266 EdgeInfo() : u(), w(), ep() {}
272 : u(u)
273 , w(w)
274 , ep(ep)
275 {}
276 EdgeInfo()
277 : u()
278 , w()
279 , ep()
280 {}
267281 };
268282
269283 /** Returns a list of edges that may be added when the vertex v is
270284 * removed. */
271 static bool findNewEdges(const Graph& g, vertex_descriptor v,
272 vector<EdgeInfo>& eds, vector<bool>& markedContigs)
285 static bool
286 findNewEdges(
287 const Graph& g,
288 vertex_descriptor v,
289 vector<EdgeInfo>& eds,
290 vector<bool>& markedContigs)
273291 {
274292 typedef graph_traits<Graph> GTraits;
275293 typedef GTraits::vertex_descriptor V;
287305 // for every edge from u->v and v->w we must add an edge u->w
288306 for (IEit uv = iei0; uv != iei1; ++uv) {
289307 for (OEit vw = oei0; vw != oei1; ++vw) {
290 int x = g[*uv].distance + (int)g[v].length +
291 g[*vw].distance;
308 int x = g[*uv].distance + (int)g[v].length + g[*vw].distance;
292309 assert(x <= 0);
293310 EdgeInfo ed(source(*uv, g), target(*vw, g), x);
294311 eds.push_back(ed);
298315 marked.push_back(ed.w);
299316 }
300317 }
301 for (vector<V>::const_iterator it = marked.begin();
302 it != marked.end(); it++)
318 for (vector<V>::const_iterator it = marked.begin(); it != marked.end(); it++)
303319 markedContigs[get(vertex_index, g, *it)] = true;
304320 return true;
305321 }
306322
307323 /** Adds all edges described in the vector eds. */
308 static void addNewEdges(Graph& g, const vector<EdgeInfo>& eds)
309 {
310 for (vector<EdgeInfo>::const_iterator edsit = eds.begin();
311 edsit != eds.end(); ++edsit) {
324 static void
325 addNewEdges(Graph& g, const vector<EdgeInfo>& eds)
326 {
327 for (vector<EdgeInfo>::const_iterator edsit = eds.begin(); edsit != eds.end(); ++edsit) {
312328 // Don't add parallel edges! This can happen when removing a palindrome.
313329 if (edge(edsit->u, edsit->w, g).second) {
314330 g_count.parallel_edge++;
319335 }
320336 }
321337
322 static void removeContig(vertex_descriptor v, Graph& g)
323 {
324 clear_vertex(v, g);
325 remove_vertex(v, g);
326 g_removed.push_back(get(vertex_contig_index, g, v));
327 g_count.removed++;
338 static void
339 removeContig(vertex_descriptor v, Graph& g)
340 {
341 clear_vertex(v, g);
342 remove_vertex(v, g);
343 g_removed.push_back(get(vertex_contig_index, g, v));
344 g_count.removed++;
328345 }
329346
330347 /** Remove the specified contig from the adjacency graph. */
331 static void removeContigs(Graph& g, vector<vertex_descriptor>& sc)
348 static void
349 removeContigs(Graph& g, vector<vertex_descriptor>& sc)
332350 {
333351 typedef graph_traits<Graph> GTraits;
334352 typedef GTraits::vertex_descriptor V;
337355 out.reserve(sc.size());
338356
339357 vector<bool> markedContigs(g.num_vertices());
340 for (vector<vertex_descriptor>::iterator it = sc.begin();
341 it != sc.end(); ++it) {
358 for (vector<vertex_descriptor>::iterator it = sc.begin(); it != sc.end(); ++it) {
342359 V v = *it;
343360 if (opt::verbose > 0 && ++g_count.checked % 10000000 == 0)
344 cerr << "Removed " << g_count.removed << "/"
345 << g_count.checked
346 << " vertices that have been checked.\n";
361 cerr << "Removed " << g_count.removed << "/" << g_count.checked
362 << " vertices that have been checked.\n";
347363
348364 if (markedContigs[get(vertex_index, g, v)]) {
349365 out.push_back(v);
365381 }
366382
367383 /** Return the value of the bit at the specified index. */
368 struct Marked : unary_function<vertex_descriptor, bool> {
384 struct Marked : unary_function<vertex_descriptor, bool>
385 {
369386 typedef vector<bool> Data;
370387 Marked(const Graph& g, const Data& data)
371 : m_g(g), m_data(data) { }
372 bool operator()(vertex_descriptor u) const
373 {
374 return m_data[get(vertex_contig_index, m_g, u)];
375 }
388 : m_g(g)
389 , m_data(data)
390 {}
391 bool operator()(vertex_descriptor u) const { return m_data[get(vertex_contig_index, m_g, u)]; }
392
376393 private:
377394 const Graph& m_g;
378395 const Data& m_data;
379396 };
380397
381398 /** Finds all potentially removable contigs in the graph. */
382 static void findShortContigs(const Graph& g, const vector<bool>& seen,
383 vector<vertex_descriptor>& sc)
399 static void
400 findShortContigs(const Graph& g, const vector<bool>& seen, vector<vertex_descriptor>& sc)
384401 {
385402 typedef graph_traits<Graph> GTraits;
386403 typedef GTraits::vertex_iterator Vit;
387404 Vit first, second;
388405 tie(first, second) = vertices(g);
389 ::copy_if(first, second, back_inserter(sc),
390 !boost::lambda::bind(Marked(g, seen), _1) && boost::lambda::bind(removable, &g, _1));
406 ::copy_if(
407 first,
408 second,
409 back_inserter(sc),
410 !boost::lambda::bind(Marked(g, seen), _1) && boost::lambda::bind(removable, &g, _1));
391411 }
392412
393413 /** Functor used for sorting contigs based on degree, then size,
394414 * and then ID. */
395 struct sortContigs {
415 struct sortContigs
416 {
396417 const Graph& g;
397418
398 sortContigs(const Graph& g) : g(g) {}
399
400 template <typename V>
401 bool operator() (V a, V b)
419 sortContigs(const Graph& g)
420 : g(g)
421 {}
422
423 template<typename V>
424 bool operator()(V a, V b)
402425 {
403426 const ContigProperties& ap = g[a];
404427 const ContigProperties& bp = g[b];
406429 unsigned dega = out_degree(a, g) * in_degree(a, g);
407430 unsigned degb = out_degree(b, g) * in_degree(b, g);
408431
409 return dega != degb ? dega < degb
410 : ap.length != bp.length ? ap.length < bp.length
411 : a < b;
432 return dega != degb ? dega < degb : ap.length != bp.length ? ap.length < bp.length : a < b;
412433 }
413434 };
414435
415 struct ShorterThanX : unary_function<vertex_descriptor, bool> {
436 struct ShorterThanX : unary_function<vertex_descriptor, bool>
437 {
416438 const Graph& g;
417439 const vector<bool>& seen;
418440 size_t x;
419441
420442 ShorterThanX(const Graph& g, const vector<bool>& seen, size_t x)
421 : g(g), seen(seen), x(x) { }
443 : g(g)
444 , seen(seen)
445 , x(x)
446 {}
422447
423448 bool operator()(vertex_descriptor y) const
424449 {
425 return g[y].length < x && !get(vertex_removed, g, y)
426 && !seen[get(vertex_contig_index, g, y)];
450 return g[y].length < x && !get(vertex_removed, g, y) &&
451 !seen[get(vertex_contig_index, g, y)];
427452 }
428453 };
429454
430 struct LongerThanX : unary_function<vertex_descriptor, bool> {
455 struct LongerThanX : unary_function<vertex_descriptor, bool>
456 {
431457 const Graph& g;
432458 const vector<bool>& seen;
433459 size_t x;
434460
435461 LongerThanX(const Graph& g, const vector<bool>& seen, size_t x)
436 : g(g), seen(seen), x(x) { }
462 : g(g)
463 , seen(seen)
464 , x(x)
465 {}
437466
438467 bool operator()(vertex_descriptor y) const
439468 {
440 return g[y].length > x && !get(vertex_removed, g, y)
441 && !seen[get(vertex_contig_index, g, y)];
469 return g[y].length > x && !get(vertex_removed, g, y) &&
470 !seen[get(vertex_contig_index, g, y)];
442471 }
443472 };
444473
445 struct CoverageLessThan : unary_function<vertex_descriptor, bool> {
474 struct CoverageLessThan : unary_function<vertex_descriptor, bool>
475 {
446476 const Graph& g;
447477 const vector<bool>& seen;
448478 float minCov;
449479
450480 CoverageLessThan(const Graph& g, const vector<bool>& seen, float minCov)
451 : g(g), seen(seen), minCov(minCov) { }
481 : g(g)
482 , seen(seen)
483 , minCov(minCov)
484 {}
452485
453486 bool operator()(vertex_descriptor u) const
454487 {
455488 assert(opt::k > 0);
456489 float meanCoverage = (float)g[u].coverage / (g[u].length - opt::k + 1);
457 return meanCoverage < minCov && !get(vertex_removed, g, u)
458 && !seen[get(vertex_contig_index, g, u)];
490 return meanCoverage < minCov && !get(vertex_removed, g, u) &&
491 !seen[get(vertex_contig_index, g, u)];
459492 }
460493 };
461494
462 static void removeShims(Graph& g, const vector<bool>& seen)
495 static void
496 removeShims(Graph& g, const vector<bool>& seen)
463497 {
464498 if (opt::verbose > 0)
465499 cerr << "Removing shim contigs from the graph...\n";
467501 findShortContigs(g, seen, shortContigs);
468502 for (unsigned i = 0; !shortContigs.empty(); ++i) {
469503 if (opt::verbose > 0)
470 cerr << "Pass " << i + 1 << ": Checking "
471 << shortContigs.size() << " contigs.\n";
472 sort(shortContigs.begin(), shortContigs.end(),
473 sortContigs(g));
504 cerr << "Pass " << i + 1 << ": Checking " << shortContigs.size() << " contigs.\n";
505 sort(shortContigs.begin(), shortContigs.end(), sortContigs(g));
474506 removeContigs(g, shortContigs);
475507 }
476508 if (opt::verbose > 0) {
477509 cerr << "Shim removal stats:\n";
478 cerr << "Removed: " << g_count.removed/2
479 << " Too Complex: " << g_count.too_complex/2
480 << " Tails: " << g_count.tails/2
481 << " Too Long: " << g_count.too_long/2
482 << " Self Adjacent: " << g_count.self_adj/2
483 << " Parallel Edges: " << g_count.parallel_edge/2 << '\n';
484 }
485 }
486
487 template <typename pred>
488 static void removeContigs_if(Graph& g, pred p)
510 cerr << "Removed: " << g_count.removed / 2 << " Too Complex: " << g_count.too_complex / 2
511 << " Tails: " << g_count.tails / 2 << " Too Long: " << g_count.too_long / 2
512 << " Self Adjacent: " << g_count.self_adj / 2
513 << " Parallel Edges: " << g_count.parallel_edge / 2 << '\n';
514 }
515 }
516
517 template<typename pred>
518 static void
519 removeContigs_if(Graph& g, pred p)
489520 {
490521 typedef graph_traits<Graph> GTraits;
491522 typedef GTraits::vertex_iterator Vit;
495526 vector<V> sc;
496527 ::copy_if(first, second, back_inserter(sc), p);
497528 remove_vertex_if(g, sc.begin(), sc.end(), True<V>());
498 transform(sc.begin(), sc.end(), back_inserter(g_removed),
499 mem_fun_ref(&ContigNode::contigIndex));
529 transform(sc.begin(), sc.end(), back_inserter(g_removed), [](const ContigNode& c) {
530 return c.contigIndex();
531 });
500532 if (opt::verbose > 0)
501 cerr << "Removed " << sc.size()/2 << " contigs.\n";
533 cerr << "Removed " << sc.size() / 2 << " contigs.\n";
502534 }
503535
504536 /** Contig sequences. */
506538 static Contigs g_contigs;
507539
508540 /** Return the sequence of vertex u. */
509 static string getSequence(const Graph& g, vertex_descriptor u)
541 static string
542 getSequence(const Graph& g, vertex_descriptor u)
510543 {
511544 size_t i = get(vertex_contig_index, g, u);
512545 assert(i < g_contigs.size());
515548 }
516549
517550 /** Return whether the specified edge is inconsistent. */
518 struct is_edge_inconsistent : unary_function<edge_descriptor, bool> {
551 struct is_edge_inconsistent : unary_function<edge_descriptor, bool>
552 {
519553 const Graph& g;
520554
521555 is_edge_inconsistent(const Graph& g)
522 : g(g) { }
556 : g(g)
557 {}
523558
524559 bool operator()(edge_descriptor e) const
525560 {
540575 }
541576 };
542577
543 template <typename It>
544 static void remove_edge(Graph& g, It first, It last)
578 template<typename It>
579 static void
580 remove_edge(Graph& g, It first, It last)
545581 {
546582 for (; first != last; first++)
547583 remove_edge(*first, g);
548584 }
549585
550586 template<typename pred>
551 static void removeEdges_if(Graph& g, pred p)
587 static void
588 removeEdges_if(Graph& g, pred p)
552589 {
553590 typedef graph_traits<Graph> GTraits;
554591 typedef GTraits::edge_iterator Eit;
564601 }
565602 }
566603
567 int main(int argc, char** argv)
604 int
605 main(int argc, char** argv)
568606 {
569607 string commandLine;
570608 {
571609 ostringstream ss;
572610 char** last = argv + argc - 1;
573 copy(argv, last, ostream_iterator<const char *>(ss, " "));
611 copy(argv, last, ostream_iterator<const char*>(ss, " "));
574612 ss << *last;
575613 commandLine = ss.str();
576614 }
577615
578616 bool die = false;
579 for (int c; (c = getopt_long(argc, argv,
580 shortopts, longopts, NULL)) != -1;) {
617 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
581618 istringstream arg(optarg != NULL ? optarg : "");
582619 switch (c) {
583 case '?':
620 case '?':
584621 die = true;
585622 break;
586 case 'c':
623 case 'c':
587624 arg >> opt::minCoverage;
588625 break;
589 case 'C':
626 case 'C':
590627 arg >> opt::maxCoverage;
591628 break;
592 case 'l':
629 case 'l':
593630 arg >> opt::minLen;
594631 break;
595 case 'L':
632 case 'L':
596633 arg >> opt::maxLen;
597634 break;
598 case 'm':
635 case 'm':
599636 arg >> opt::minOverlap;
600637 break;
601 case 'g':
638 case 'g':
602639 arg >> opt::graphPath;
603640 break;
604 case 'i':
641 case 'i':
605642 arg >> opt::ignorePath;
606643 break;
607 case 'r':
644 case 'r':
608645 arg >> opt::removePath;
609646 break;
610 case 'k':
647 case 'k':
611648 arg >> opt::k;
612649 break;
613 case 'T':
650 case 'T':
614651 arg >> opt::minIslandLen;
615652 break;
616 case 't':
653 case 't':
617654 arg >> opt::minTipLen;
618655 break;
619 case 'v':
656 case 'v':
620657 opt::verbose++;
621658 break;
622 case OPT_SHIM_MAX_DEG:
659 case OPT_SHIM_MAX_DEG:
623660 arg >> opt::shimMaxDegree;
624661 break;
625 case OPT_HELP:
662 case OPT_HELP:
626663 cout << USAGE_MESSAGE;
627664 exit(EXIT_SUCCESS);
628 case OPT_VERSION:
665 case OPT_VERSION:
629666 cout << VERSION_MESSAGE;
630667 exit(EXIT_SUCCESS);
631668 }
632669 if (optarg != NULL && !arg.eof()) {
633 cerr << PROGRAM ": invalid option: `-"
634 << (char)c << optarg << "'\n";
670 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
635671 exit(EXIT_FAILURE);
636672 }
637673 }
638674
639675 if (opt::minOverlap < 0) {
640676 cerr << PROGRAM ": "
641 << "--min-overlap must be a positive integer.\n";
677 << "--min-overlap must be a positive integer.\n";
642678 die = true;
643679 }
644680
645681 if (opt::k <= 0) {
646 cerr << PROGRAM ": " << "missing -k,--kmer option\n";
682 cerr << PROGRAM ": "
683 << "missing -k,--kmer option\n";
647684 die = true;
648685 }
649686
658695 }
659696
660697 if (die) {
661 cerr << "Try `" << PROGRAM
662 << " --help' for more information.\n";
698 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
663699 exit(EXIT_FAILURE);
664700 }
665701
668704 {
669705 string adjPath(argv[optind++]);
670706 if (opt::verbose > 0)
671 cerr << "Loading graph from file: " << adjPath
672 << '\n';
707 cerr << "Loading graph from file: " << adjPath << '\n';
673708 ifstream fin(adjPath.c_str());
674709 assert_good(fin, adjPath);
675710 fin >> g;
697732 size_t b = g_removed.size();
698733 while (in >> s) {
699734 size_t i = get(g_contigNames, s);
700 removeContig(ContigNode(i,0), g);
735 removeContig(ContigNode(i, 0), g);
701736 }
702737 assert(in.eof());
703738 if (opt::verbose)
704 cerr << "Removed " << g_removed.size() - b
705 << " contigs.\n";
739 cerr << "Removed " << g_removed.size() - b << " contigs.\n";
706740 }
707741
708742 // Remove shims.
712746 // Remove islands.
713747 if (opt::minIslandLen > 0) {
714748 size_t s = g_removed.size();
715 removeIslands_if(g, back_inserter(g_removed),
716 ShorterThanX(g, seen, opt::minIslandLen));
749 removeIslands_if(g, back_inserter(g_removed), ShorterThanX(g, seen, opt::minIslandLen));
717750 if (opt::verbose)
718 cerr << "Removed " << g_removed.size() - s
719 << " islands.\n";
751 cerr << "Removed " << g_removed.size() - s << " islands.\n";
720752 }
721753
722754 // Remove tips.
725757 s = g_removed.size();
726758 do {
727759 prev = g_removed.size();
728 pruneTips_if(g, back_inserter(g_removed),
729 ShorterThanX(g, seen, opt::minTipLen));
760 pruneTips_if(g, back_inserter(g_removed), ShorterThanX(g, seen, opt::minTipLen));
730761 } while (prev < g_removed.size());
731762 if (opt::verbose)
732 cerr << "Removed " << g_removed.size() - s
733 << " tips.\n";
763 cerr << "Removed " << g_removed.size() - s << " tips.\n";
734764 }
735765
736766 // Remove short contigs.
747777
748778 // Remove contigs with high mean k-mer coverage.
749779 if (opt::maxCoverage > 0)
750 removeContigs_if(g,
751 std::not1(CoverageLessThan(g, seen, opt::maxCoverage)));
780 removeContigs_if(g, std::not1(CoverageLessThan(g, seen, opt::maxCoverage)));
752781
753782 // Remove inconsistent edges of spaceseeds
754783 if (argc - optind == 1) {
792821 else
793822 assemble(g, back_inserter(paths));
794823 g_contigNames.unlock();
795 for (ContigPaths::const_iterator it = paths.begin();
796 it != paths.end(); ++it) {
824 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
797825 ContigNode u(numContigs + it - paths.begin(), false);
798826 string name = createContigName();
799827 put(vertex_name, g, u, name);
66 #include "Common/Functional.h"
77 #include "Common/Iterator.h"
88 #include "Graph/ContigGraph.h"
9 #include <algorithm>
910 #include <boost/graph/graph_traits.hpp>
10 #include <algorithm>
1111 #include <cassert>
1212 #include <functional>
1313 #include <set>
1717
1818 /** Return true if the edge e is a palindrome. */
1919 template<typename Graph>
20 struct IsPalindrome : std::unary_function<
21 typename graph_traits<Graph>::edge_descriptor, bool>
22 {
23 IsPalindrome(const Graph& g) : m_g(g) { }
24 bool operator()(
25 typename graph_traits<Graph>::edge_descriptor e) const
20 struct IsPalindrome : std::unary_function<typename graph_traits<Graph>::edge_descriptor, bool>
21 {
22 IsPalindrome(const Graph& g)
23 : m_g(g)
24 {}
25 bool operator()(typename graph_traits<Graph>::edge_descriptor e) const
2626 {
27 return source(e, m_g)
28 == get(vertex_complement, m_g, target(e, m_g));
29 }
27 return source(e, m_g) == get(vertex_complement, m_g, target(e, m_g));
28 }
29
3030 private:
3131 const Graph& m_g;
3232 };
3333
3434 /** Return whether the outgoing edge of vertex u is contiguous. */
3535 template<typename Graph>
36 bool contiguous_out(const Graph& g,
37 typename graph_traits<Graph>::vertex_descriptor u)
38 {
39 return out_degree(u, g) == 1
40 && in_degree(*adjacent_vertices(u, g).first, g) == 1;
36 bool
37 contiguous_out(const Graph& g, typename graph_traits<Graph>::vertex_descriptor u)
38 {
39 return out_degree(u, g) == 1 && in_degree(*adjacent_vertices(u, g).first, g) == 1;
4140 }
4241
4342 /** Return whether the incoming edge of vertex u is contiguous. */
4443 template<typename Graph>
45 bool contiguous_in(const Graph& g,
46 typename graph_traits<Graph>::vertex_descriptor u)
44 bool
45 contiguous_in(const Graph& g, typename graph_traits<Graph>::vertex_descriptor u)
4746 {
4847 return contiguous_out(g, get(vertex_complement, g, u));
4948 }
5049
5150 /** Add the outgoing edges of vertex u to vertex uout. */
5251 template<typename Graph>
53 void copy_out_edges(Graph &g,
54 typename Graph::vertex_descriptor u,
55 typename Graph::vertex_descriptor uout)
56 {
57 typedef typename graph_traits<Graph>::vertex_descriptor
58 vertex_descriptor;
59 typedef typename graph_traits<Graph>::out_edge_iterator
60 out_edge_iterator;
52 void
53 copy_out_edges(
54 Graph& g,
55 typename Graph::vertex_descriptor u,
56 typename Graph::vertex_descriptor uout)
57 {
58 typedef typename graph_traits<Graph>::vertex_descriptor vertex_descriptor;
59 typedef typename graph_traits<Graph>::out_edge_iterator out_edge_iterator;
6160 typedef typename edge_property<Graph>::type edge_property_type;
6261 assert(u != uout);
63 std::pair<out_edge_iterator, out_edge_iterator>
64 edges = g.out_edges(u);
62 std::pair<out_edge_iterator, out_edge_iterator> edges = g.out_edges(u);
6563 bool palindrome = false;
6664 edge_property_type palindrome_ep;
6765 for (out_edge_iterator e = edges.first; e != edges.second; ++e) {
8684
8785 /** Add the incoming edges of vertex u to vertex v. */
8886 template<typename Graph>
89 void copy_in_edges(Graph& g,
90 typename Graph::vertex_descriptor u,
91 typename Graph::vertex_descriptor v)
92 {
93 copy_out_edges(g,
94 get(vertex_complement, g, u),
95 get(vertex_complement, g, v));
87 void
88 copy_in_edges(Graph& g, typename Graph::vertex_descriptor u, typename Graph::vertex_descriptor v)
89 {
90 copy_out_edges(g, get(vertex_complement, g, u), get(vertex_complement, g, v));
9691 }
9792
9893 /** Assemble a path of unambiguous out edges starting at vertex u.
9994 * u itself is not copied to out.
10095 */
10196 template<typename Graph, typename OutIt>
102 OutIt extend(const Graph& g,
103 typename Graph::vertex_descriptor u, OutIt out)
104 {
105 typedef typename graph_traits<Graph>::vertex_descriptor
106 vertex_descriptor;
97 OutIt
98 extend(const Graph& g, typename Graph::vertex_descriptor u, OutIt out)
99 {
100 typedef typename graph_traits<Graph>::vertex_descriptor vertex_descriptor;
107101 std::set<vertex_descriptor> seen;
108102 while (out_degree(u, g) == 1 && seen.insert(u).second) {
109103 u = *adjacent_vertices(u, g).first;
115109 /** Assemble an unambiguous path starting at vertex u.
116110 * Every edge must satisfy the predicate. */
117111 template<typename Graph, typename OutIt, typename Predicate>
118 OutIt assemble_if(const Graph& g,
119 typename Graph::vertex_descriptor u, OutIt out,
120 Predicate pred)
121 {
122 typedef typename graph_traits<Graph>::edge_descriptor
123 edge_descriptor;
112 OutIt
113 assemble_if(const Graph& g, typename Graph::vertex_descriptor u, OutIt out, Predicate pred)
114 {
115 typedef typename graph_traits<Graph>::edge_descriptor edge_descriptor;
124116 while (contiguous_out(g, u)) {
125117 edge_descriptor e = *out_edges(u, g).first;
126118 if (!pred(e))
137129 * are removed as well.
138130 */
139131 template<typename Graph, typename It, typename Predicate>
140 void remove_vertex_if(Graph& g, It first, It last, Predicate p)
141 {
142 for_each_if(first, last,
143 bind1st(std::mem_fun(&Graph::clear_vertex), &g), p);
144 for_each_if(first, last,
145 bind1st(std::mem_fun(&Graph::remove_vertex), &g), p);
132 void
133 remove_vertex_if(Graph& g, It first, It last, Predicate p)
134 {
135 for_each_if(
136 first, last, [&g](const ContigNode& c) { return g.clear_vertex(c); }, p);
137 for_each_if(
138 first, last, [&g](const ContigNode& c) { return g.remove_vertex(c); }, p);
146139 }
147140
148141 /** Add the vertex and edge properties of the path [first, last). */
149142 template<typename Graph, typename It, typename VP>
150 VP addProp(const Graph& g, It first, It last, const VP*)
151 {
152 typedef typename graph_traits<Graph>::vertex_descriptor
153 vertex_descriptor;
143 VP
144 addProp(const Graph& g, It first, It last, const VP*)
145 {
146 typedef typename graph_traits<Graph>::vertex_descriptor vertex_descriptor;
154147 assert(first != last);
155148 VP vp = get(vertex_bundle, g, *first);
156149 for (It it = first + 1; it != last; ++it) {
163156 }
164157
165158 template<typename Graph, typename It>
166 no_property addProp(const Graph&, It, It, const no_property*)
159 no_property
160 addProp(const Graph&, It, It, const no_property*)
167161 {
168162 return no_property();
169163 }
170164
171165 template<typename Graph, typename It>
172 typename vertex_property<Graph>::type addProp(const Graph& g,
173 It first, It last)
174 {
175 return addProp(g, first, last,
176 (typename vertex_property<Graph>::type*)NULL);
166 typename vertex_property<Graph>::type
167 addProp(const Graph& g, It first, It last)
168 {
169 return addProp(g, first, last, (typename vertex_property<Graph>::type*)NULL);
177170 }
178171
179172 /** Merge the vertices in the sequence [first, last).
184177 typename graph_traits<Graph>::vertex_descriptor
185178 merge(Graph& g, It first, It last)
186179 {
187 typedef typename graph_traits<Graph>::vertex_descriptor
188 vertex_descriptor;
180 typedef typename graph_traits<Graph>::vertex_descriptor vertex_descriptor;
189181 assert(first != last);
190182 vertex_descriptor u = add_vertex(addProp(g, first, last), g);
191183 copy_in_edges(g, *first, u);
196188 /** Assemble unambiguous paths. Write the paths to out.
197189 * Every edge must satisfy the predicate. */
198190 template<typename Graph, typename OutIt, typename Predicate>
199 OutIt assemble_if(Graph& g, OutIt out, Predicate pred0)
191 OutIt
192 assemble_if(Graph& g, OutIt out, Predicate pred0)
200193 {
201194 typedef typename Graph::vertex_iterator vertex_iterator;
202195 // pred(e) = !isPalindrome(e) && pred0(e)
203 binary_compose<std::logical_and<bool>,
204 std::unary_negate<IsPalindrome<Graph> >, Predicate>
205 pred(compose2(std::logical_and<bool>(),
206 std::not1(IsPalindrome<Graph>(g)), pred0));
196 binary_compose<std::logical_and<bool>, std::unary_negate<IsPalindrome<Graph>>, Predicate> pred(
197 compose2(std::logical_and<bool>(), std::not1(IsPalindrome<Graph>(g)), pred0));
207198 std::pair<vertex_iterator, vertex_iterator> uit = g.vertices();
208199 for (vertex_iterator u = uit.first; u != uit.second; ++u) {
209 if (!contiguous_out(g, *u) || contiguous_in(g, *u)
210 || !pred(*out_edges(*u, g).first))
200 if (!contiguous_out(g, *u) || contiguous_in(g, *u) || !pred(*out_edges(*u, g).first))
211201 continue;
212202 typename output_iterator_traits<OutIt>::value_type path;
213203 assemble_if(g, *u, back_inserter(path), pred);
214204 assert(path.size() >= 2);
215205 assert(path.front() != path.back());
216206 merge(g, path.begin(), path.end());
217 remove_vertex_if(g, path.begin(), path.end(),
218 not1(std::mem_fun_ref(&ContigNode::ambiguous)));
207 remove_vertex_if(
208 g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); });
219209 *out++ = path;
220210 }
221211 return out;
223213
224214 /** Assemble unambiguous paths. Write the paths to out. */
225215 template<typename Graph, typename OutIt>
226 OutIt assemble(Graph& g, OutIt out)
227 {
228 typedef typename graph_traits<Graph>::edge_descriptor
229 edge_descriptor;
216 OutIt
217 assemble(Graph& g, OutIt out)
218 {
219 typedef typename graph_traits<Graph>::edge_descriptor edge_descriptor;
230220 return assemble_if(g, out, True<edge_descriptor>());
231221 }
232222
233223 /** Return true if the edge e is +ve sense. */
234224 template<typename Graph>
235 struct IsPositive : std::unary_function<
236 typename graph_traits<Graph>::edge_descriptor, bool>
237 {
238 IsPositive(const Graph& g) : m_g(g) { }
239 bool operator()(
240 typename graph_traits<Graph>::edge_descriptor e) const
225 struct IsPositive : std::unary_function<typename graph_traits<Graph>::edge_descriptor, bool>
226 {
227 IsPositive(const Graph& g)
228 : m_g(g)
229 {}
230 bool operator()(typename graph_traits<Graph>::edge_descriptor e) const
241231 {
242 return !get(vertex_sense, m_g, source(e, m_g))
243 && !get(vertex_sense, m_g, target(e, m_g));
244 }
232 return !get(vertex_sense, m_g, source(e, m_g)) && !get(vertex_sense, m_g, target(e, m_g));
233 }
234
245235 private:
246236 const Graph& m_g;
247237 };
249239 /** Assemble unambiguous paths in forward orientation only.
250240 * Write the paths to out. */
251241 template<typename Graph, typename OutIt>
252 OutIt assemble_stranded(Graph& g, OutIt out)
242 OutIt
243 assemble_stranded(Graph& g, OutIt out)
253244 {
254245 return assemble_if(g, out, IsPositive<Graph>(g));
255246 }
259250 * deg+(v) = 0, and p(v) is true.
260251 * Stores all removed vertices in result.
261252 */
262 template <typename Graph, typename OutputIt, typename Pred>
263 OutputIt pruneTips_if(Graph& g, OutputIt result, Pred p)
253 template<typename Graph, typename OutputIt, typename Pred>
254 OutputIt
255 pruneTips_if(Graph& g, OutputIt result, Pred p)
264256 {
265257 typedef typename graph_traits<Graph>::adjacency_iterator Vit;
266258 typedef typename graph_traits<Graph>::vertex_iterator Uit;
276268 std::pair<Vit, Vit> vrange = adjacent_vertices(u, g);
277269 for (Vit vit = vrange.first; vit != vrange.second; ++vit) {
278270 V v = *vit;
279 //assert(v != u);
271 // assert(v != u);
280272 if (out_degree(v, g) == 0 && p(v))
281273 tips.push_back(v);
282274 }
284276
285277 /** Remove the tips. */
286278 remove_vertex_if(g, tips.begin(), tips.end(), True<V>());
287 std::transform(tips.begin(), tips.end(), result,
288 std::mem_fun_ref(&ContigNode::contigIndex));
289
279 std::transform(
280 tips.begin(), tips.end(), result, [](const ContigNode& c) { return c.contigIndex(); });
290281 return result;
291282 }
292283
293284 /** Return true if the vertex is a normal 1-in 0-out tip. */
294285 template<typename Graph>
295 struct IsTip : std::unary_function<
296 typename graph_traits<Graph>::vertex_descriptor, bool>
297 {
298 IsTip(const Graph& g) : m_g(g) { }
299 bool operator()(
300 typename graph_traits<Graph>::vertex_descriptor v) const
286 struct IsTip : std::unary_function<typename graph_traits<Graph>::vertex_descriptor, bool>
287 {
288 IsTip(const Graph& g)
289 : m_g(g)
290 {}
291 bool operator()(typename graph_traits<Graph>::vertex_descriptor v) const
301292 {
302293 return in_degree(v, m_g) == 1;
303294 }
295
304296 private:
305297 const Graph& m_g;
306298 };
310302 * and deg-(v) = 1 and deg+(v) = 0.
311303 * Stores all removed vertices in result.
312304 */
313 template <typename Graph, typename OutputIt>
314 OutputIt pruneTips(Graph& g, OutputIt result)
305 template<typename Graph, typename OutputIt>
306 OutputIt
307 pruneTips(Graph& g, OutputIt result)
315308 {
316309 return pruneTips_if(g, result, IsTip<Graph>(g));
317310 }
321314 * true.
322315 * Stores all removed vertices in result.
323316 */
324 template <typename Graph, typename OutputIt, typename Pred>
325 OutputIt removeIslands_if(Graph& g, OutputIt result, Pred p)
317 template<typename Graph, typename OutputIt, typename Pred>
318 OutputIt
319 removeIslands_if(Graph& g, OutputIt result, Pred p)
326320 {
327321 typedef typename graph_traits<Graph>::vertex_iterator Uit;
328322 typedef typename graph_traits<Graph>::vertex_descriptor V;
343337 }
344338
345339 /** Add missing complementary edges. */
346 template <typename DG>
347 size_t addComplementaryEdges(ContigGraph<DG>& g)
340 template<typename DG>
341 size_t
342 addComplementaryEdges(ContigGraph<DG>& g)
348343 {
349344 typedef ContigGraph<DG> Graph;
350345 typedef graph_traits<Graph> GTraits;
00 #ifndef ALIGNER_H
11 #define ALIGNER_H 1
22
3 #include "config.h"
4 #include "KAligner/Options.h"
53 #include "Alignment.h"
64 #include "ConstString.h"
75 #include "Functional.h"
6 #include "KAligner/Options.h"
87 #include "Kmer.h"
98 #include "UnorderedMap.h"
9 #include "config.h"
1010 #include <cassert>
1111 #include <cstdlib>
1212 #include <cstring> // for strcpy
2626 {
2727 uint32_t contig;
2828 uint32_t pos; // 0 indexed
29 Position(uint32_t contig = std::numeric_limits<uint32_t>::max(),
30 uint32_t pos = std::numeric_limits<uint32_t>::max())
31 : contig(contig), pos(pos) { }
29 Position(
30 uint32_t contig = std::numeric_limits<uint32_t>::max(),
31 uint32_t pos = std::numeric_limits<uint32_t>::max())
32 : contig(contig)
33 , pos(pos)
34 {}
3235
3336 /** Mark this seed as a duplicate. */
34 void setDuplicate(const char* thisContig, const char* otherContig,
35 const Sequence& kmer)
37 void setDuplicate(const char* thisContig, const char* otherContig, const Sequence& kmer)
3638 {
3739 if (opt::multimap == opt::IGNORE)
3840 contig = std::numeric_limits<uint32_t>::max();
3941 else {
40 std::cerr << "error: duplicate k-mer in "
41 << thisContig
42 << " also in "
43 << otherContig
44 << ": " << kmer << '\n';
42 std::cerr << "error: duplicate k-mer in " << thisContig << " also in " << otherContig
43 << ": " << kmer << '\n';
4544 exit(EXIT_FAILURE);
4645 }
4746 }
4847
4948 /** Return whether this seed is a duplicate. */
50 bool isDuplicate() const
51 {
52 return contig == std::numeric_limits<uint32_t>::max();
53 }
49 bool isDuplicate() const { return contig == std::numeric_limits<uint32_t>::max(); }
5450 };
5551
56 typedef unordered_multimap<Kmer, Position, hash<Kmer> >
57 SeqPosHashMultiMap;
52 typedef unordered_multimap<Kmer, Position, hash<Kmer>> SeqPosHashMultiMap;
5853
5954 #if HAVE_GOOGLE_SPARSE_HASH_MAP
60 # include <google/sparse_hash_map>
61 typedef google::sparse_hash_map<Kmer, Position, hash<Kmer> >
62 SeqPosHashUniqueMap;
55 #include <google/sparse_hash_map>
56 typedef google::sparse_hash_map<Kmer, Position, hash<Kmer>> SeqPosHashUniqueMap;
6357 #else
64 typedef unordered_map<Kmer, Position, hash<Kmer> >
65 SeqPosHashUniqueMap;
58 typedef unordered_map<Kmer, Position, hash<Kmer>> SeqPosHashUniqueMap;
6659 #endif
67
6860
6961 typedef std::vector<Alignment> AlignmentVector;
7062
7264 * Index a target sequence and align query sequences to that indexed
7365 * target.
7466 */
75 template <class SeqPosHashMap>
67 template<class SeqPosHashMap>
7668 class Aligner
7769 {
78 public:
79 typedef typename SeqPosHashMap::iterator map_iterator;
80 typedef typename SeqPosHashMap::const_iterator
81 map_const_iterator;
70 public:
71 typedef typename SeqPosHashMap::iterator map_iterator;
72 typedef typename SeqPosHashMap::const_iterator map_const_iterator;
8273
83 Aligner(int hashSize, size_t buckets)
84 : m_hashSize(hashSize), m_target(buckets) { }
74 Aligner(int hashSize, size_t buckets)
75 : m_hashSize(hashSize)
76 , m_target(buckets)
77 {}
8578
86 Aligner(int hashSize, size_t buckets, float factor)
87 : m_hashSize(hashSize)
88 {
89 m_target.max_load_factor(factor);
90 m_target.rehash(buckets);
91 }
79 Aligner(int hashSize, size_t buckets, float factor)
80 : m_hashSize(hashSize)
81 {
82 m_target.max_load_factor(factor);
83 m_target.rehash(buckets);
84 }
9285
93 void addReferenceSequence(const StringID& id,
94 const Sequence& seq);
95 void addReferenceSequence(const Kmer& kmer, Position pos);
86 void addReferenceSequence(const StringID& id, const Sequence& seq);
87 void addReferenceSequence(const Kmer& kmer, Position pos);
9688
97 template <class oiterator>
98 void alignRead(const std::string& qid, const Sequence& seq,
99 oiterator dest);
89 template<class oiterator>
90 void alignRead(const std::string& qid, const Sequence& seq, oiterator dest);
10091
101 size_t size() const { return m_target.size(); }
102 size_t bucket_count() const
103 {
104 return m_target.bucket_count();
105 }
92 size_t size() const { return m_target.size(); }
93 size_t bucket_count() const { return m_target.bucket_count(); }
10694
107 /** Return the number of duplicate k-mers in the target. */
108 size_t countDuplicates() const
109 {
110 assert(opt::multimap == opt::IGNORE);
111 return count_if(m_target.begin(), m_target.end(),
112 compose1(std::mem_fun_ref(&Position::isDuplicate),
113 mem_var(&SeqPosHashMap::value_type::second)));
114 }
95 /** Return the number of duplicate k-mers in the target. */
96 size_t countDuplicates() const
97 {
98 assert(opt::multimap == opt::IGNORE);
99 return count_if(
100 m_target.begin(), m_target.end(), [](const std::pair<const Kmer, Position>& s) {
101 return s.second.isDuplicate();
102 });
103 }
115104
116 private:
117 explicit Aligner(const Aligner&);
105 private:
106 explicit Aligner(const Aligner&);
118107
119 typedef std::map<unsigned, AlignmentVector> AlignmentSet;
108 typedef std::map<unsigned, AlignmentVector> AlignmentSet;
120109
121 void alignKmer(
122 AlignmentSet& aligns, const Sequence& kmer,
123 bool isRC, bool good, int read_ind, int seqLen);
110 void alignKmer(
111 AlignmentSet& aligns,
112 const Sequence& kmer,
113 bool isRC,
114 bool good,
115 int read_ind,
116 int seqLen);
124117
125 AlignmentSet getAlignmentsInternal(
126 const Sequence& seq, bool isRC);
118 AlignmentSet getAlignmentsInternal(const Sequence& seq, bool isRC);
127119
128 template <class oiterator>
129 void coalesceAlignments(
130 const std::string& qid, const std::string& seq,
131 const AlignmentSet& alignSet,
132 oiterator& dest);
120 template<class oiterator>
121 void coalesceAlignments(
122 const std::string& qid,
123 const std::string& seq,
124 const AlignmentSet& alignSet,
125 oiterator& dest);
133126
134 // The number of bases to hash on
135 int m_hashSize;
127 // The number of bases to hash on
128 int m_hashSize;
136129
137 /** A map of k-mer to contig coordinates. */
138 SeqPosHashMap m_target;
130 /** A map of k-mer to contig coordinates. */
131 SeqPosHashMap m_target;
139132
140 /** A dictionary of contig IDs. */
141 std::vector<const_string> m_dict;
133 /** A dictionary of contig IDs. */
134 std::vector<const_string> m_dict;
142135
143 unsigned contigIDToIndex(const std::string& id)
144 {
145 m_dict.push_back(id);
146 return m_dict.size() - 1;
147 }
136 unsigned contigIDToIndex(const std::string& id)
137 {
138 m_dict.push_back(id);
139 return m_dict.size() - 1;
140 }
148141
149 cstring contigIndexToID(unsigned index)
150 {
151 assert(index < m_dict.size());
152 return m_dict[index];
153 }
142 cstring contigIndexToID(unsigned index)
143 {
144 assert(index < m_dict.size());
145 return m_dict[index];
146 }
154147 };
155148
156149 #endif
11 #define PIPEMUX_H 1
22
33 #include "Pipe.h"
4 #include <semaphore.h>
4 #include "Semaphore.h"
55 #include <pthread.h>
66 #include <vector>
77
8 template <class T>
9 class PipeMux {
8 template<class T>
9 class PipeMux
10 {
1011 public:
11 /** Default constructor. */
12 PipeMux(size_t pipe_size = 1)
13 : m_index(0), m_entry_num(0), m_pipe_size(pipe_size)
12 /** Default constructor. */
13 PipeMux(size_t pipe_size = 1)
14 : m_index(0)
15 , m_entry_num(0)
16 , m_pipe_size(pipe_size)
1417 {
1518 pthread_rwlock_init(&m_rwlock_vecs, NULL);
1619 pthread_mutex_init(&m_mutex_index, NULL);
9396 }
9497
9598 /** Checks that the PipeMux is empty. */
96 bool empty() {
99 bool empty()
100 {
97101 pthread_rwlock_rdlock(&m_rwlock_vecs);
98102 bool isEmpty = m_pipes.empty();
99103 pthread_rwlock_unlock(&m_rwlock_vecs);
127131 }
128132 assert(i < m_pipes.size());
129133 delete m_pipes[i];
130 m_pipes.erase(m_pipes.begin()+i);
134 m_pipes.erase(m_pipes.begin() + i);
131135 assert(i < m_mutex_pipes.size());
132136 pthread_mutex_destroy(m_mutex_pipes[i]);
133137 delete m_mutex_pipes[i];
134 m_mutex_pipes.erase(m_mutex_pipes.begin()+i);
138 m_mutex_pipes.erase(m_mutex_pipes.begin() + i);
135139 // Make sure the index points to the next element.
136140 pthread_mutex_lock(&m_mutex_index);
137141 m_index = m_index == m_pipes.size() ? 0 : m_index;
139143
140144 pthread_rwlock_unlock(&m_rwlock_vecs);
141145 }
142
143146 };
144147
145148 #endif
0 #include "config.h"
10 #include "ContigPath.h"
21 #include "ContigProperties.h"
3 #include "Uncompress.h"
42 #include "Graph/Assemble.h"
53 #include "Graph/ContigGraph.h"
64 #include "Graph/ContigGraphAlgorithms.h"
86 #include "Graph/GraphAlgorithms.h"
97 #include "Graph/GraphIO.h"
108 #include "Graph/GraphUtil.h"
9 #include "Uncompress.h"
10 #include "config.h"
1111 #include <cassert>
1212 #include <cstdlib>
1313 #include <fstream>
2222
2323 #define PROGRAM "abyss-layout"
2424
25 static const char VERSION_MESSAGE[] =
26 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
27 "Written by Shaun Jackman.\n"
28 "\n"
29 "Copyright 2012 Shaun Jackman\n";
25 static const char VERSION_MESSAGE[] = PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
26 "Written by Shaun Jackman.\n"
27 "\n"
28 "Copyright 2012 Shaun Jackman\n";
3029
3130 static const char USAGE_MESSAGE[] =
32 "Usage: " PROGRAM " [OPTION]... OVERLAP\n"
33 "Layout contigs using the sequence overlap graph.\n"
34 "Output sequence paths.\n"
35 "\n"
36 " Arguments:\n"
37 "\n"
38 " OVERLAP the sequence overlap graph\n"
39 "\n"
40 " Options:\n"
41 "\n"
42 " -s, --min-length=N minimum sequence length [0]\n"
43 " -m, --min-overlap=N minimum overlap [0]\n"
44 " -k, --kmer=N length of a k-mer\n"
45 " -o, --out=FILE write the paths to FILE\n"
46 " -g, --graph=FILE write the graph to FILE\n"
47 " --tred remove transitive edges\n"
48 " --no-tred do not remove transitive edges [default]\n"
49 " --SS expect contigs to be oriented correctly\n"
50 " --no-SS no assumption about contig orientation [default]\n"
51 " -v, --verbose display verbose output\n"
52 " --help display this help and exit\n"
53 " --version output version information and exit\n"
54 "\n"
55 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
31 "Usage: " PROGRAM " [OPTION]... OVERLAP\n"
32 "Layout contigs using the sequence overlap graph.\n"
33 "Output sequence paths.\n"
34 "\n"
35 " Arguments:\n"
36 "\n"
37 " OVERLAP the sequence overlap graph\n"
38 "\n"
39 " Options:\n"
40 "\n"
41 " -s, --min-length=N minimum sequence length [0]\n"
42 " -m, --min-overlap=N minimum overlap [0]\n"
43 " -k, --kmer=N length of a k-mer\n"
44 " -o, --out=FILE write the paths to FILE\n"
45 " -g, --graph=FILE write the graph to FILE\n"
46 " --tred remove transitive edges\n"
47 " --no-tred do not remove transitive edges [default]\n"
48 " --SS expect contigs to be oriented correctly\n"
49 " --no-SS no assumption about contig orientation [default]\n"
50 " -v, --verbose display verbose output\n"
51 " --help display this help and exit\n"
52 " --version output version information and exit\n"
53 "\n"
54 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
5655
5756 namespace opt {
58 unsigned k; // used by ContigProperties
59
60 /** Minimum sequence length. */
61 static unsigned minLength;
62
63 /** Minimum overlap. */
64 static unsigned minOverlap;
65
66 /** Write the paths to this file. */
67 static string out;
68
69 /** Write the graph to this file. */
70 static string graphPath;
71
72 /** Remove transitive edges. */
73 static int tred;
74
75 /** Run a strand-specific RNA-Seq assembly. */
76 static int ss;
77
78 /** Verbose output. */
79 int verbose; // used by PopBubbles
80
81 /** Output format */
82 int format = DOT;
57 unsigned k; // used by ContigProperties
58
59 /** Minimum sequence length. */
60 static unsigned minLength;
61
62 /** Minimum overlap. */
63 static unsigned minOverlap;
64
65 /** Write the paths to this file. */
66 static string out;
67
68 /** Write the graph to this file. */
69 static string graphPath;
70
71 /** Remove transitive edges. */
72 static int tred;
73
74 /** Run a strand-specific RNA-Seq assembly. */
75 static int ss;
76
77 /** Verbose output. */
78 int verbose; // used by PopBubbles
79
80 /** Output format */
81 int format = DOT;
8382 }
8483
8584 static const char shortopts[] = "g:k:m:o:s:v";
8685
87 enum { OPT_HELP = 1, OPT_VERSION };
88
89 static const struct option longopts[] = {
90 { "graph", required_argument, NULL, 'g' },
91 { "kmer", required_argument, NULL, 'k' },
92 { "min-overlap", required_argument, NULL, 'm' },
93 { "out", required_argument, NULL, 'o' },
94 { "min-length", required_argument, NULL, 's' },
95 { "tred", no_argument, &opt::tred, true },
96 { "no-tred", no_argument, &opt::tred, false },
97 { "SS", no_argument, &opt::ss, 1 },
98 { "no-SS", no_argument, &opt::ss, 0 },
99 { "verbose", no_argument, NULL, 'v' },
100 { "help", no_argument, NULL, OPT_HELP },
101 { "version", no_argument, NULL, OPT_VERSION },
102 { NULL, 0, NULL, 0 }
86 enum
87 {
88 OPT_HELP = 1,
89 OPT_VERSION
10390 };
91
92 static const struct option longopts[] = { { "graph", required_argument, NULL, 'g' },
93 { "kmer", required_argument, NULL, 'k' },
94 { "min-overlap", required_argument, NULL, 'm' },
95 { "out", required_argument, NULL, 'o' },
96 { "min-length", required_argument, NULL, 's' },
97 { "tred", no_argument, &opt::tred, true },
98 { "no-tred", no_argument, &opt::tred, false },
99 { "SS", no_argument, &opt::ss, 1 },
100 { "no-SS", no_argument, &opt::ss, 0 },
101 { "verbose", no_argument, NULL, 'v' },
102 { "help", no_argument, NULL, OPT_HELP },
103 { "version", no_argument, NULL, OPT_VERSION },
104 { NULL, 0, NULL, 0 } };
104105
105106 /** An overlap graph. */
106107 typedef DirectedGraph<Length, Distance> DG;
107108 typedef ContigGraph<DG> Graph;
108109
109110 /** Remove short vertices. */
110 static void filterVertices(Graph& g, unsigned minLength)
111 static void
112 filterVertices(Graph& g, unsigned minLength)
111113 {
112114 typedef graph_traits<Graph> GTraits;
113115 typedef GTraits::vertex_descriptor V;
130132 }
131133
132134 if (opt::verbose > 0) {
133 cerr << "Ignored " << numRemoved << " sequences shorter than "
134 << minLength << " bp.\n";
135 cerr << "Ignored " << numRemoved << " sequences shorter than " << minLength << " bp.\n";
135136 printGraphStats(cerr, g);
136137 }
137138 }
138139
139140 /** Return true if the edge is a small overlap. */
140 struct IsSmallOverlap {
141 IsSmallOverlap(Graph& g) : m_g(g) { }
141 struct IsSmallOverlap
142 {
143 IsSmallOverlap(Graph& g)
144 : m_g(g)
145 {}
142146 bool operator()(graph_traits<Graph>::edge_descriptor e) const
143147 {
144148 int maxDistance = -opt::minOverlap;
148152 };
149153
150154 /** Remove small overlaps. */
151 static void filterEdges(Graph& g, unsigned minOverlap)
155 static void
156 filterEdges(Graph& g, unsigned minOverlap)
152157 {
153158 if (minOverlap == 0)
154159 return;
162167 }
163168
164169 /** Read a graph from the specified file. */
165 static void readGraph(const string& path, Graph& g)
170 static void
171 readGraph(const string& path, Graph& g)
166172 {
167173 if (opt::verbose > 0)
168174 cerr << "Reading `" << path << "'...\n";
177183 }
178184
179185 /** Return the length histogram. */
180 static Histogram buildLengthHistogram(const Graph& g)
186 static Histogram
187 buildLengthHistogram(const Graph& g)
181188 {
182189 typedef graph_traits<Graph>::vertex_descriptor V;
183190 typedef graph_traits<Graph>::vertex_iterator Vit;
192199 }
193200
194201 /** Run abyss-layout. */
195 int main(int argc, char** argv)
202 int
203 main(int argc, char** argv)
196204 {
197205 bool die = false;
198 for (int c; (c = getopt_long(argc, argv,
199 shortopts, longopts, NULL)) != -1;) {
206 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
200207 istringstream arg(optarg != NULL ? optarg : "");
201208 switch (c) {
202 case '?':
209 case '?':
203210 die = true;
204211 break;
205 case 'k':
212 case 'k':
206213 arg >> opt::k;
207214 break;
208 case 'g':
215 case 'g':
209216 arg >> opt::graphPath;
210217 break;
211 case 'm':
218 case 'm':
212219 arg >> opt::minOverlap;
213220 break;
214 case 'o':
221 case 'o':
215222 arg >> opt::out;
216223 break;
217 case 's':
224 case 's':
218225 arg >> opt::minLength;
219226 break;
220 case 'v':
227 case 'v':
221228 opt::verbose++;
222229 break;
223 case OPT_HELP:
230 case OPT_HELP:
224231 cout << USAGE_MESSAGE;
225232 exit(EXIT_SUCCESS);
226 case OPT_VERSION:
233 case OPT_VERSION:
227234 cout << VERSION_MESSAGE;
228235 exit(EXIT_SUCCESS);
229236 }
230237 if (optarg != NULL && !arg.eof()) {
231 cerr << PROGRAM ": invalid option: `-"
232 << (char)c << optarg << "'\n";
238 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
233239 exit(EXIT_FAILURE);
234240 }
235241 }
240246 }
241247
242248 if (die) {
243 cerr << "Try `" << PROGRAM
244 << " --help' for more information.\n";
249 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
245250 exit(EXIT_FAILURE);
246251 }
247252
262267 if (opt::tred) {
263268 unsigned numTransitive = remove_transitive_edges(g);
264269 if (opt::verbose > 0) {
265 cerr << "Removed " << numTransitive
266 << " transitive edges.\n";
270 cerr << "Removed " << numTransitive << " transitive edges.\n";
267271 printGraphStats(cerr, g);
268272 }
269273 }
277281 sort(paths.begin(), paths.end());
278282 if (opt::verbose > 0) {
279283 unsigned n = 0;
280 for (ContigPaths::const_iterator it = paths.begin();
281 it != paths.end(); ++it)
284 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it)
282285 n += it->size();
283 cerr << "Assembled " << n << " sequences in "
284 << paths.size() << " contigs.\n";
286 cerr << "Assembled " << n << " sequences in " << paths.size() << " contigs.\n";
285287 printGraphStats(cerr, g);
286288 }
287289
290292 ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
291293 assert_good(out, opt::out);
292294 g_contigNames.unlock();
293 for (vector<ContigPath>::const_iterator it = paths.begin();
294 it != paths.end(); ++it)
295 for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it)
295296 out << createContigName() << '\t' << *it << '\n';
296297 assert_good(out, opt::out);
297298
298299 // Create the new vertices.
299 for (vector<ContigPath>::const_iterator it = paths.begin();
300 it != paths.end(); ++it) {
300 for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it) {
301301 const ContigPath& path = *it;
302302 merge(g, path.begin(), path.end());
303 remove_vertex_if(g, path.begin(), path.end(),
304 not1(std::mem_fun_ref(&ContigNode::ambiguous)));
303 remove_vertex_if(
304 g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); });
305305 }
306306 if (opt::verbose > 0)
307307 printGraphStats(cerr, g);
6868 clang-format:
6969 for i in Bloom/RollingBloomDBGVisitor.h Bloom/bloom.cc BloomDBG/BloomIO.h \
7070 BloomDBG/Checkpoint.h BloomDBG/HashAgnosticCascadingBloom.h BloomDBG/bloom-dbg.* \
71 ABYSS/abyss.cc Assembly/BranchGroup.h FMIndex/BitArrays.h FilterGraph/FilterGraph.cc \
72 Graph/ContigGraphAlgorithms.h KAligner/Aligner.h KAligner/PipeMux.h Layout/layout.cc \
73 MergePaths/MergeContigs.cpp MergePaths/MergePaths.cpp ParseAligns/ParseAligns.cpp \
74 ParseAligns/abyss-fixmate.cc PathOverlap/PathOverlap.cpp PopBubbles/PopBubbles.cpp Scaffold/scaffold.cc \
7175 Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp; do clang-format -style=file $$i >$$i.fixed; done
7276 for i in Bloom/RollingBloomDBGVisitor.h Bloom/bloom.cc BloomDBG/BloomIO.h \
7377 BloomDBG/Checkpoint.h BloomDBG/HashAgnosticCascadingBloom.h BloomDBG/bloom-dbg.* \
78 ABYSS/abyss.cc Assembly/BranchGroup.h FMIndex/BitArrays.h FilterGraph/FilterGraph.cc \
79 Graph/ContigGraphAlgorithms.h KAligner/Aligner.h KAligner/PipeMux.h Layout/layout.cc \
80 MergePaths/MergeContigs.cpp MergePaths/MergePaths.cpp ParseAligns/ParseAligns.cpp \
81 ParseAligns/abyss-fixmate.cc PathOverlap/PathOverlap.cpp PopBubbles/PopBubbles.cpp Scaffold/scaffold.cc \
7482 Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp; do diff -su $$i $$i.fixed && rm -f $$i.fixed; done
7583 if ls *.fixed; then exit 1; fi
0 #include "config.h"
10 #include "Common/Options.h"
21 #include "ContigNode.h"
32 #include "ContigPath.h"
43 #include "ContigProperties.h"
4 #include "DataBase/DB.h"
5 #include "DataBase/Options.h"
56 #include "DataLayer/Options.h"
67 #include "Dictionary.h"
78 #include "FastaReader.h"
8 #include "Histogram.h"
9 #include "IOUtil.h"
10 #include "MemoryUtil.h"
11 #include "smith_waterman.h"
12 #include "Sequence.h"
13 #include "StringUtil.h"
14 #include "Uncompress.h"
159 #include "Graph/ContigGraph.h"
1610 #include "Graph/ContigGraphAlgorithms.h"
1711 #include "Graph/DirectedGraph.h"
1812 #include "Graph/GraphIO.h"
1913 #include "Graph/GraphUtil.h"
2014 #include "Graph/Options.h"
15 #include "Histogram.h"
16 #include "IOUtil.h"
17 #include "MemoryUtil.h"
18 #include "Sequence.h"
19 #include "StringUtil.h"
20 #include "Uncompress.h"
21 #include "config.h"
22 #include "smith_waterman.h"
2123 #include <algorithm>
2224 #include <cstdlib>
2325 #include <fstream>
2527 #include <iostream>
2628 #include <limits>
2729 #include <vector>
28 #include "DataBase/Options.h"
29 #include "DataBase/DB.h"
3030
3131 using namespace std;
3232
3535 DB db;
3636
3737 static const char VERSION_MESSAGE[] =
38 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
39 "Written by Shaun Jackman.\n"
40 "\n"
41 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
38 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
39 "Written by Shaun Jackman.\n"
40 "\n"
41 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
4242
4343 static const char USAGE_MESSAGE[] =
44 "Usage: " PROGRAM " -k<kmer> -o<out.fa> [OPTION]... FASTA [OVERLAP] PATH\n"
45 "Merge paths of contigs to create larger contigs.\n"
46 "\n"
47 " Arguments:\n"
48 "\n"
49 " FASTA contigs in FASTA format\n"
50 " OVERLAP contig overlap graph\n"
51 " PATH sequences of contig IDs\n"
52 "\n"
53 " Options:\n"
54 "\n"
55 " -k, --kmer=KMER_SIZE k-mer size\n"
56 " -o, --out=FILE output the merged contigs to FILE [stdout]\n"
57 " -g, --graph=FILE write the contig overlap graph to FILE\n"
58 " --merged output only merged contigs\n"
59 " --adj output the graph in adj format\n"
60 " --dot output the graph in dot format [default]\n"
61 " --dot-meancov same as above but give the mean coverage\n"
62 " --gfa output the graph in GFA1 format\n"
63 " --gfa1 output the graph in GFA1 format\n"
64 " --gfa2 output the graph in GFA2 format\n"
65 " --gv output the graph in GraphViz format\n"
66 " --sam output the graph in SAM format\n"
67 " -v, --verbose display verbose output\n"
68 " --help display this help and exit\n"
69 " --version output version information and exit\n"
70 " --db=FILE specify path of database repository in FILE\n"
71 " --library=NAME specify library NAME for database\n"
72 " --strain=NAME specify strain NAME for database\n"
73 " --species=NAME specify species NAME for database\n"
74 "\n"
75 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
44 "Usage: " PROGRAM " -k<kmer> -o<out.fa> [OPTION]... FASTA [OVERLAP] PATH\n"
45 "Merge paths of contigs to create larger contigs.\n"
46 "\n"
47 " Arguments:\n"
48 "\n"
49 " FASTA contigs in FASTA format\n"
50 " OVERLAP contig overlap graph\n"
51 " PATH sequences of contig IDs\n"
52 "\n"
53 " Options:\n"
54 "\n"
55 " -k, --kmer=KMER_SIZE k-mer size\n"
56 " -o, --out=FILE output the merged contigs to FILE [stdout]\n"
57 " -g, --graph=FILE write the contig overlap graph to FILE\n"
58 " --merged output only merged contigs\n"
59 " --adj output the graph in adj format\n"
60 " --dot output the graph in dot format [default]\n"
61 " --dot-meancov same as above but give the mean coverage\n"
62 " --gfa output the graph in GFA1 format\n"
63 " --gfa1 output the graph in GFA1 format\n"
64 " --gfa2 output the graph in GFA2 format\n"
65 " --gv output the graph in GraphViz format\n"
66 " --sam output the graph in SAM format\n"
67 " -v, --verbose display verbose output\n"
68 " --help display this help and exit\n"
69 " --version output version information and exit\n"
70 " --db=FILE specify path of database repository in FILE\n"
71 " --library=NAME specify library NAME for database\n"
72 " --strain=NAME specify strain NAME for database\n"
73 " --species=NAME specify species NAME for database\n"
74 "\n"
75 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
7676
7777 namespace opt {
78 string db;
79 dbVars metaVars;
80 unsigned k; // used by ContigProperties
81 unsigned pathCount; // num of initial paths
82
83 /** Output FASTA path. */
84 static string out = "-";
85
86 /** Output graph path. */
87 static string graphPath;
88
89 /** Output graph format. */
90 int format = DOT;
91
92 /** Output only merged contigs. */
93 int onlyMerged;
94
95 /** Minimum overlap. */
96 static unsigned minOverlap = 20;
97
98 /** Minimum alignment identity. */
99 static float minIdentity = 0.9;
78 string db;
79 dbVars metaVars;
80 unsigned k; // used by ContigProperties
81 unsigned pathCount; // num of initial paths
82
83 /** Output FASTA path. */
84 static string out = "-";
85
86 /** Output graph path. */
87 static string graphPath;
88
89 /** Output graph format. */
90 int format = DOT;
91
92 /** Output only merged contigs. */
93 int onlyMerged;
94
95 /** Minimum overlap. */
96 static unsigned minOverlap = 20;
97
98 /** Minimum alignment identity. */
99 static float minIdentity = 0.9;
100100 }
101101
102102 static const char shortopts[] = "g:k:o:v";
103103
104 enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
105 //enum { OPT_HELP = 1, OPT_VERSION };
106
107 static const struct option longopts[] = {
108 { "adj", no_argument, &opt::format, ADJ },
109 { "dot", no_argument, &opt::format, DOT },
110 { "dot-meancov", no_argument, &opt::format, DOT_MEANCOV },
111 { "gfa", no_argument, &opt::format, GFA1 },
112 { "gfa1", no_argument, &opt::format, GFA1 },
113 { "gfa2", no_argument, &opt::format, GFA2 },
114 { "gv", no_argument, &opt::format, DOT },
115 { "sam", no_argument, &opt::format, SAM },
116 { "graph", required_argument, NULL, 'g' },
117 { "kmer", required_argument, NULL, 'k' },
118 { "merged", no_argument, &opt::onlyMerged, 1 },
119 { "out", required_argument, NULL, 'o' },
120 { "path", required_argument, NULL, 'p' },
121 { "verbose", no_argument, NULL, 'v' },
122 { "help", no_argument, NULL, OPT_HELP },
123 { "version", no_argument, NULL, OPT_VERSION },
124 { "db", required_argument, NULL, OPT_DB },
125 { "library", required_argument, NULL, OPT_LIBRARY },
126 { "strain", required_argument, NULL, OPT_STRAIN },
127 { "species", required_argument, NULL, OPT_SPECIES },
128 { NULL, 0, NULL, 0 }
104 enum
105 {
106 OPT_HELP = 1,
107 OPT_VERSION,
108 OPT_DB,
109 OPT_LIBRARY,
110 OPT_STRAIN,
111 OPT_SPECIES
129112 };
113 // enum { OPT_HELP = 1, OPT_VERSION };
114
115 static const struct option longopts[] = { { "adj", no_argument, &opt::format, ADJ },
116 { "dot", no_argument, &opt::format, DOT },
117 { "dot-meancov", no_argument, &opt::format, DOT_MEANCOV },
118 { "gfa", no_argument, &opt::format, GFA1 },
119 { "gfa1", no_argument, &opt::format, GFA1 },
120 { "gfa2", no_argument, &opt::format, GFA2 },
121 { "gv", no_argument, &opt::format, DOT },
122 { "sam", no_argument, &opt::format, SAM },
123 { "graph", required_argument, NULL, 'g' },
124 { "kmer", required_argument, NULL, 'k' },
125 { "merged", no_argument, &opt::onlyMerged, 1 },
126 { "out", required_argument, NULL, 'o' },
127 { "path", required_argument, NULL, 'p' },
128 { "verbose", no_argument, NULL, 'v' },
129 { "help", no_argument, NULL, OPT_HELP },
130 { "version", no_argument, NULL, OPT_VERSION },
131 { "db", required_argument, NULL, OPT_DB },
132 { "library", required_argument, NULL, OPT_LIBRARY },
133 { "strain", required_argument, NULL, OPT_STRAIN },
134 { "species", required_argument, NULL, OPT_SPECIES },
135 { NULL, 0, NULL, 0 } };
130136
131137 /* A contig sequence. */
132 struct Contig {
138 struct Contig
139 {
133140 Contig(const string& comment, const string& seq)
134 : comment(comment), seq(seq) { }
135 Contig(const FastaRecord& o) : comment(o.comment), seq(o.seq) { }
141 : comment(comment)
142 , seq(seq)
143 {}
144 Contig(const FastaRecord& o)
145 : comment(o.comment)
146 , seq(o.seq)
147 {}
136148 string comment;
137149 string seq;
138150 };
143155 /** Return the sequence of the specified contig node. The sequence
144156 * may be ambiguous or reverse complemented.
145157 */
146 static Sequence sequence(const Contigs& contigs, const ContigNode& id)
158 static Sequence
159 sequence(const Contigs& contigs, const ContigNode& id)
147160 {
148161 if (id.ambiguous()) {
149162 string s(id.ambiguousSequence());
159172 /** Return a consensus sequence of a and b.
160173 * @return an empty string if a consensus could not be found
161174 */
162 static string createConsensus(const Sequence& a, const Sequence& b)
175 static string
176 createConsensus(const Sequence& a, const Sequence& b)
163177 {
164178 assert(a.length() == b.length());
165179 if (a == b)
166180 return a;
167181 string s;
168182 s.reserve(a.length());
169 for (string::const_iterator ita = a.begin(), itb = b.begin();
170 ita != a.end(); ++ita, ++itb) {
183 for (string::const_iterator ita = a.begin(), itb = b.begin(); ita != a.end(); ++ita, ++itb) {
171184 bool mask = islower(*ita) || islower(*itb);
172185 char ca = toupper(*ita), cb = toupper(*itb);
173 char c = ca == cb ? ca
174 : ca == 'N' ? cb
175 : cb == 'N' ? ca
176 : ambiguityIsSubset(ca, cb) ? ambiguityOr(ca, cb)
177 : 'x';
186 char c = ca == cb
187 ? ca
188 : ca == 'N'
189 ? cb
190 : cb == 'N' ? ca : ambiguityIsSubset(ca, cb) ? ambiguityOr(ca, cb) : 'x';
178191 if (c == 'x')
179192 return string("");
180193 s += mask ? tolower(c) : c;
182195 return s;
183196 }
184197
185 typedef ContigGraph<DirectedGraph<ContigProperties, Distance> > Graph;
198 typedef ContigGraph<DirectedGraph<ContigProperties, Distance>> Graph;
186199 typedef graph_traits<Graph>::vertex_descriptor vertex_descriptor;
187200
188201 /** Return the properties of the specified vertex, unless u is
189202 * ambiguous, in which case return the length of the ambiguous
190203 * sequence.
191204 */
192 static inline
193 ContigProperties get(vertex_bundle_t, const Graph& g, ContigNode u)
194 {
195 return u.ambiguous()
196 ? ContigProperties(u.length() + opt::k - 1, 0)
197 : g[u];
205 static inline ContigProperties
206 get(vertex_bundle_t, const Graph& g, ContigNode u)
207 {
208 return u.ambiguous() ? ContigProperties(u.length() + opt::k - 1, 0) : g[u];
198209 }
199210
200211 /** Append the sequence of contig v to seq. */
201 static void mergeContigs(const Graph& g, const Contigs& contigs,
202 vertex_descriptor u, vertex_descriptor v,
203 Sequence& seq, const ContigPath& path)
212 static void
213 mergeContigs(
214 const Graph& g,
215 const Contigs& contigs,
216 vertex_descriptor u,
217 vertex_descriptor v,
218 Sequence& seq,
219 const ContigPath& path)
204220 {
205221 int d = get(edge_bundle, g, u, v).distance;
206222 assert(d < 0);
234250 unsigned matches = o.overlap_match;
235251 const string& consensus = o.overlap_str;
236252 float identity = (float)matches / consensus.size();
237 good = matches >= opt::minOverlap
238 && identity >= opt::minIdentity;
253 good = matches >= opt::minOverlap && identity >= opt::minIdentity;
239254 if (opt::verbose > 2)
240 cerr << matches << " / " << consensus.size()
241 << " = " << identity
242 << (matches < opt::minOverlap ? " (too few)"
243 : identity < opt::minIdentity ? " (too low)"
244 : " (good)") << '\n';
255 cerr << matches << " / " << consensus.size() << " = " << identity
256 << (matches < opt::minOverlap
257 ? " (too few)"
258 : identity < opt::minIdentity ? " (too low)" : " (good)")
259 << '\n';
245260 }
246261 if (good) {
247262 assert(overlaps.size() == 1);
251266 seq += Sequence(s, o.overlap_h_pos + 1);
252267 } else {
253268 cerr << "warning: the head of " << get(vertex_name, g, v)
254 << " does not match the tail of the previous contig\n"
255 << ao << '\n' << bo << '\n' << path << endl;
269 << " does not match the tail of the previous contig\n"
270 << ao << '\n'
271 << bo << '\n'
272 << path << endl;
256273 seq += 'n';
257274 seq += s;
258275 }
259276 }
260277
261278 /** Return a FASTA comment for the specified path. */
262 static void pathToComment(ostream& out,
263 const Graph& g, const ContigPath& path)
279 static void
280 pathToComment(ostream& out, const Graph& g, const ContigPath& path)
264281 {
265282 out << get(vertex_name, g, path.front());
266283 if (path.size() == 1)
273290 }
274291
275292 /** Merge the specified path. */
276 static Contig mergePath(const Graph& g, const Contigs& contigs,
277 const ContigPath& path)
293 static Contig
294 mergePath(const Graph& g, const Contigs& contigs, const ContigPath& path)
278295 {
279296 Sequence seq;
280297 unsigned coverage = 0;
281 for (ContigPath::const_iterator it = path.begin();
282 it != path.end(); ++it) {
298 for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) {
283299 if (!it->ambiguous())
284300 coverage += g[*it].coverage;
285301 if (seq.empty()) {
286302 seq = sequence(contigs, *it);
287303 } else {
288304 assert(it != path.begin());
289 mergeContigs(g, contigs, *(it-1), *it, seq, path);
305 mergeContigs(g, contigs, *(it - 1), *it, seq, path);
290306 }
291307 }
292308 ostringstream ss;
301317 /** Read contig paths from the specified file.
302318 * @param ids [out] the string ID of the paths
303319 */
304 static ContigPaths readPaths(const string& inPath,
305 vector<string>* ids = NULL)
320 static ContigPaths
321 readPaths(const string& inPath, vector<string>* ids = NULL)
306322 {
307323 if (ids != NULL)
308324 assert(ids->empty());
324340
325341 ++count;
326342 if (opt::verbose > 1 && count % 1000000 == 0)
327 cerr << "Read " << count << " paths. "
328 "Using " << toSI(getMemoryUsage())
329 << "B of memory.\n";
343 cerr << "Read " << count
344 << " paths. "
345 "Using "
346 << toSI(getMemoryUsage()) << "B of memory.\n";
330347 }
331348 if (opt::verbose > 0)
332 cerr << "Read " << count << " paths. "
333 "Using " << toSI(getMemoryUsage()) << "B of memory.\n";
349 cerr << "Read " << count
350 << " paths. "
351 "Using "
352 << toSI(getMemoryUsage()) << "B of memory.\n";
334353 if (!opt::db.empty())
335354 addToDb(db, "Init_paths", count);
336355 opt::pathCount = count;
340359
341360 /** Finds all contigs used in each path in paths, and
342361 * marks them as seen in the vector seen. */
343 static void seenContigs(vector<bool>& seen, const ContigPaths& paths)
344 {
345 for (ContigPaths::const_iterator it = paths.begin();
346 it != paths.end(); ++it)
347 for (ContigPath::const_iterator itc = it->begin();
348 itc != it->end(); ++itc)
362 static void
363 seenContigs(vector<bool>& seen, const ContigPaths& paths)
364 {
365 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it)
366 for (ContigPath::const_iterator itc = it->begin(); itc != it->end(); ++itc)
349367 if (itc->id() < seen.size())
350368 seen[itc->id()] = true;
351369 }
353371 /** Mark contigs for removal. An empty path indicates that a contig
354372 * should be removed.
355373 */
356 static void markRemovedContigs(vector<bool>& marked,
357 const vector<string>& pathIDs, const ContigPaths& paths)
358 {
359 for (ContigPaths::const_iterator it = paths.begin();
360 it != paths.end(); ++it) {
374 static void
375 markRemovedContigs(vector<bool>& marked, const vector<string>& pathIDs, const ContigPaths& paths)
376 {
377 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
361378 if (it->empty()) {
362 size_t i = get(g_contigNames,
363 pathIDs[it - paths.begin()]);
379 size_t i = get(g_contigNames, pathIDs[it - paths.begin()]);
364380 assert(i < marked.size());
365381 marked[i] = true;
366382 }
368384 }
369385
370386 /** Output the updated overlap graph. */
371 static void outputGraph(Graph& g,
372 const vector<string>& pathIDs, const ContigPaths& paths,
373 const string& commandLine)
387 static void
388 outputGraph(
389 Graph& g,
390 const vector<string>& pathIDs,
391 const ContigPaths& paths,
392 const string& commandLine)
374393 {
375394 typedef graph_traits<Graph>::vertex_descriptor V;
376395
377396 // Add the path vertices.
378397 g_contigNames.unlock();
379 for (ContigPaths::const_iterator it = paths.begin();
380 it != paths.end(); ++it) {
398 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
381399 const ContigPath& path = *it;
382400 const string& id = pathIDs[it - paths.begin()];
383401 if (!path.empty()) {
388406 g_contigNames.lock();
389407
390408 // Remove the vertices that are used in paths.
391 for (ContigPaths::const_iterator it = paths.begin();
392 it != paths.end(); ++it) {
409 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
393410 const ContigPath& path = *it;
394411 const string& id = pathIDs[it - paths.begin()];
395412 if (path.empty()) {
396413 remove_vertex(find_vertex(id, false, g), g);
397414 } else {
398 remove_vertex_if(g, path.begin(), path.end(),
399 not1(std::mem_fun_ref(&ContigNode::ambiguous)));
415 remove_vertex_if(
416 g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); });
400417 }
401418 }
402419
413430 printGraphStats(cerr, g);
414431 }
415432
416 int main(int argc, char** argv)
433 int
434 main(int argc, char** argv)
417435 {
418436 opt::trimMasked = false;
419437
421439 {
422440 ostringstream ss;
423441 char** last = argv + argc - 1;
424 copy(argv, last, ostream_iterator<const char *>(ss, " "));
442 copy(argv, last, ostream_iterator<const char*>(ss, " "));
425443 ss << *last;
426444 commandLine = ss.str();
427445 }
430448 opt::metaVars.resize(3);
431449
432450 bool die = false;
433 for (int c; (c = getopt_long(argc, argv,
434 shortopts, longopts, NULL)) != -1;) {
451 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
435452 istringstream arg(optarg != NULL ? optarg : "");
436453 switch (c) {
437 case '?': die = true; break;
438 case 'g': arg >> opt::graphPath; break;
439 case 'k': arg >> opt::k; break;
440 case 'o': arg >> opt::out; break;
441 case 'v': opt::verbose++; break;
442 case OPT_HELP:
443 cout << USAGE_MESSAGE;
444 exit(EXIT_SUCCESS);
445 case OPT_VERSION:
446 cout << VERSION_MESSAGE;
447 exit(EXIT_SUCCESS);
448 case OPT_DB:
449 arg >> opt::db; break;
450 case OPT_LIBRARY:
451 arg >> opt::metaVars[0]; break;
452 case OPT_STRAIN:
453 arg >> opt::metaVars[1]; break;
454 case OPT_SPECIES:
455 arg >> opt::metaVars[2]; break;
454 case '?':
455 die = true;
456 break;
457 case 'g':
458 arg >> opt::graphPath;
459 break;
460 case 'k':
461 arg >> opt::k;
462 break;
463 case 'o':
464 arg >> opt::out;
465 break;
466 case 'v':
467 opt::verbose++;
468 break;
469 case OPT_HELP:
470 cout << USAGE_MESSAGE;
471 exit(EXIT_SUCCESS);
472 case OPT_VERSION:
473 cout << VERSION_MESSAGE;
474 exit(EXIT_SUCCESS);
475 case OPT_DB:
476 arg >> opt::db;
477 break;
478 case OPT_LIBRARY:
479 arg >> opt::metaVars[0];
480 break;
481 case OPT_STRAIN:
482 arg >> opt::metaVars[1];
483 break;
484 case OPT_SPECIES:
485 arg >> opt::metaVars[2];
486 break;
456487 }
457488 if (optarg != NULL && !arg.eof()) {
458 cerr << PROGRAM ": invalid option: `-"
459 << (char)c << optarg << "'\n";
489 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
460490 exit(EXIT_FAILURE);
461491 }
462492 }
467497 }
468498
469499 if (opt::out.empty()) {
470 cerr << PROGRAM ": " << "missing -o,--out option\n";
500 cerr << PROGRAM ": "
501 << "missing -o,--out option\n";
471502 die = true;
472503 }
473504
482513 }
483514
484515 if (die) {
485 cerr << "Try `" << PROGRAM
486 << " --help' for more information.\n";
516 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
487517 exit(EXIT_FAILURE);
488518 }
489519
490520 if (!opt::db.empty()) {
491 init(db,
492 opt::db,
493 opt::verbose,
494 PROGRAM,
495 opt::getCommand(argc, argv),
496 opt::metaVars
497 );
521 init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
498522 addToDb(db, "K", opt::k);
499523 }
500524
512536 fin >> g;
513537 assert(fin.eof());
514538 if (opt::verbose > 0)
515 cerr << "Read " << num_vertices(g) << " vertices. "
516 "Using " << toSI(getMemoryUsage())
517 << "B of memory.\n";
539 cerr << "Read " << num_vertices(g)
540 << " vertices. "
541 "Using "
542 << toSI(getMemoryUsage()) << "B of memory.\n";
518543 if (!opt::db.empty()) {
519544 addToDb(db, "Init_vertices", num_vertices(g));
520545 addToDb(db, "Init_edges", num_edges(g));
530555 unsigned count = 0;
531556 FastaReader in(contigFile, FastaReader::NO_FOLD_CASE);
532557 for (FastaRecord rec; in >> rec;) {
533 if (!adjPath.empty()
534 && g_contigNames.count(rec.id) == 0)
558 if (!adjPath.empty() && g_contigNames.count(rec.id) == 0)
535559 continue;
536560 if (adjPath.empty()) {
537 graph_traits<Graph>::vertex_descriptor
538 u = add_vertex(ContigProperties(rec.seq.length(), 0), g);
561 graph_traits<Graph>::vertex_descriptor u =
562 add_vertex(ContigProperties(rec.seq.length(), 0), g);
539563 put(vertex_name, g, u, rec.id);
540564 }
541565 assert(get(g_contigNames, rec.id) == contigs.size());
543567
544568 ++count;
545569 if (opt::verbose > 1 && count % 1000000 == 0)
546 cerr << "Read " << count << " sequences. "
547 "Using " << toSI(getMemoryUsage())
548 << "B of memory.\n";
570 cerr << "Read " << count
571 << " sequences. "
572 "Using "
573 << toSI(getMemoryUsage()) << "B of memory.\n";
549574 }
550575 if (opt::verbose > 0)
551 cerr << "Read " << count << " sequences. "
552 "Using " << toSI(getMemoryUsage())
553 << "B of memory.\n";
576 cerr << "Read " << count
577 << " sequences. "
578 "Using "
579 << toSI(getMemoryUsage()) << "B of memory.\n";
554580 if (!opt::db.empty())
555581 addToDb(db, "Init_seq", count);
556582 assert(in.eof());
570596 // Output those contigs that were not seen in a path.
571597 Histogram lengthHistogram;
572598 ofstream fout;
573 ostream& out = opt::out == "-" ? cout
574 : (fout.open(opt::out.c_str()), fout);
599 ostream& out = opt::out == "-" ? cout : (fout.open(opt::out.c_str()), fout);
575600 assert_good(out, opt::out);
576601 if (!opt::onlyMerged) {
577 for (Contigs::const_iterator it = contigs.begin();
578 it != contigs.end(); ++it) {
602 for (Contigs::const_iterator it = contigs.begin(); it != contigs.end(); ++it) {
579603 ContigID id(it - contigs.begin());
580604 if (!seen[id]) {
581605 const Contig& contig = *it;
584608 out << ' ' << contig.comment;
585609 out << '\n' << contig.seq << '\n';
586610 if (opt::verbose > 0)
587 lengthHistogram.insert(
588 count_if(contig.seq.begin(), contig.seq.end(),
589 isACGT));
611 lengthHistogram.insert(count_if(contig.seq.begin(), contig.seq.end(), isACGT));
590612 }
591613 }
592614 }
593615
594616 unsigned npaths = 0;
595 for (ContigPaths::const_iterator it = paths.begin();
596 it != paths.end(); ++it) {
617 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
597618 const ContigPath& path = *it;
598619 if (path.empty())
599620 continue;
600621 Contig contig = mergePath(g, contigs, path);
601 out << '>' << pathIDs[it - paths.begin()]
602 << ' ' << contig.comment << '\n'
603 << contig.seq << '\n';
622 out << '>' << pathIDs[it - paths.begin()] << ' ' << contig.comment << '\n'
623 << contig.seq << '\n';
604624 assert_good(out, opt::out);
605625 npaths++;
606626 if (opt::verbose > 0)
607 lengthHistogram.insert(
608 count_if(contig.seq.begin(), contig.seq.end(),
609 isACGT));
627 lengthHistogram.insert(count_if(contig.seq.begin(), contig.seq.end(), isACGT));
610628 }
611629
612630 if (!opt::graphPath.empty())
616634 return 0;
617635
618636 float minCov = numeric_limits<float>::infinity(),
619 minCovUsed = numeric_limits<float>::infinity();
637 minCovUsed = numeric_limits<float>::infinity();
620638 for (unsigned i = 0; i < contigs.size(); i++) {
621639 ContigProperties vp = g[ContigNode(i, false)];
622640 if (vp.coverage == 0 || vp.length < opt::k)
627645 minCovUsed = min(minCovUsed, cov);
628646 }
629647
630 cerr << "The minimum coverage of single-end contigs is "
631 << minCov << ".\n"
632 << "The minimum coverage of merged contigs is "
633 << minCovUsed << ".\n";
648 cerr << "The minimum coverage of single-end contigs is " << minCov << ".\n"
649 << "The minimum coverage of merged contigs is " << minCovUsed << ".\n";
634650 if (minCov < minCovUsed)
635651 cerr << "Consider increasing the coverage threshold "
636 "parameter, c, to " << minCovUsed << ".\n";
652 "parameter, c, to "
653 << minCovUsed << ".\n";
637654
638655 if (opt::verbose > 0) {
639656 const unsigned STATS_MIN_LENGTH = 200; // bp
640 printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH)
641 << '\t' << opt::out << '\n';
657 printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH) << '\t' << opt::out << '\n';
642658 }
643659 return 0;
644660 }
0 #include "config.h"
10 #include "Common/Options.h"
21 #include "ContigID.h"
32 #include "ContigPath.h"
43 #include "Functional.h" // for mem_var
5 #include "IOUtil.h"
6 #include "Uncompress.h"
74 #include "Graph/Assemble.h"
85 #include "Graph/ContigGraph.h"
96 #include "Graph/DirectedGraph.h"
107 #include "Graph/DotIO.h"
118 #include "Graph/GraphAlgorithms.h"
129 #include "Graph/GraphUtil.h"
10 #include "IOUtil.h"
11 #include "Uncompress.h"
12 #include "config.h"
13 #include <algorithm>
1314 #include <boost/tuple/tuple.hpp>
14 #include <algorithm>
1515 #include <cassert>
1616 #include <climits> // for UINT_MAX
1717 #include <cstdlib>
2626 #include <set>
2727 #include <vector>
2828 #if _OPENMP
29 # include <omp.h>
29 #include <omp.h>
3030 #endif
31 #include "DataBase/DB.h"
3132 #include "DataBase/Options.h"
32 #include "DataBase/DB.h"
3333
3434 using namespace std;
3535 using boost::tie;
3939 DB db;
4040
4141 static const char VERSION_MESSAGE[] =
42 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
43 "Written by Jared Simpson and Shaun Jackman.\n"
44 "\n"
45 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
42 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
43 "Written by Jared Simpson and Shaun Jackman.\n"
44 "\n"
45 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
4646
4747 static const char USAGE_MESSAGE[] =
48 "Usage: " PROGRAM " -k<kmer> [OPTION]... LEN PATH\n"
49 "Merge sequences of contigs IDs.\n"
50 "\n"
51 " Arguments:\n"
52 "\n"
53 " LEN lengths of the contigs\n"
54 " PATH sequences of contig IDs\n"
55 "\n"
56 " Options:\n"
57 "\n"
58 " -k, --kmer=KMER_SIZE k-mer size\n"
59 " -s, --seed-length=L minimum length of a seed contig [0]\n"
60 " -G, --genome-size=N expected genome size. Used to calculate NG50\n"
61 " and associated stats [disabled]\n"
62 " -o, --out=FILE write result to FILE\n"
63 " --no-greedy use the non-greedy algorithm [default]\n"
64 " --greedy use the greedy algorithm\n"
65 " -g, --graph=FILE write the path overlap graph to FILE\n"
66 " -j, --threads=N use N parallel threads [1]\n"
67 " -v, --verbose display verbose output\n"
68 " --help display this help and exit\n"
69 " --version output version information and exit\n"
70 " --db=FILE specify path of database repository in FILE\n"
71 " --library=NAME specify library NAME for database\n"
72 " --strain=NAME specify strain NAME for database\n"
73 " --species=NAME specify species NAME for database\n"
74 "\n"
75 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
48 "Usage: " PROGRAM " -k<kmer> [OPTION]... LEN PATH\n"
49 "Merge sequences of contigs IDs.\n"
50 "\n"
51 " Arguments:\n"
52 "\n"
53 " LEN lengths of the contigs\n"
54 " PATH sequences of contig IDs\n"
55 "\n"
56 " Options:\n"
57 "\n"
58 " -k, --kmer=KMER_SIZE k-mer size\n"
59 " -s, --seed-length=L minimum length of a seed contig [0]\n"
60 " -G, --genome-size=N expected genome size. Used to calculate NG50\n"
61 " and associated stats [disabled]\n"
62 " -o, --out=FILE write result to FILE\n"
63 " --no-greedy use the non-greedy algorithm [default]\n"
64 " --greedy use the greedy algorithm\n"
65 " -g, --graph=FILE write the path overlap graph to FILE\n"
66 " -j, --threads=N use N parallel threads [1]\n"
67 " -v, --verbose display verbose output\n"
68 " --help display this help and exit\n"
69 " --version output version information and exit\n"
70 " --db=FILE specify path of database repository in FILE\n"
71 " --library=NAME specify library NAME for database\n"
72 " --strain=NAME specify strain NAME for database\n"
73 " --species=NAME specify species NAME for database\n"
74 "\n"
75 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
7676
7777 namespace opt {
78 string db;
79 dbVars metaVars;
80 unsigned k; // used by GraphIO
81 static string out;
82 static int threads = 1;
83
84 /** Minimum length of a seed contig. */
85 static unsigned seedLen;
86
87 /** Use a greedy algorithm. */
88 static int greedy;
89
90 /** Genome size. Used to calculate NG50. */
91 static long long unsigned genomeSize;
92
93 /** Write the path overlap graph to this file. */
94 static string graphPath;
78 string db;
79 dbVars metaVars;
80 unsigned k; // used by GraphIO
81 static string out;
82 static int threads = 1;
83
84 /** Minimum length of a seed contig. */
85 static unsigned seedLen;
86
87 /** Use a greedy algorithm. */
88 static int greedy;
89
90 /** Genome size. Used to calculate NG50. */
91 static long long unsigned genomeSize;
92
93 /** Write the path overlap graph to this file. */
94 static string graphPath;
9595 }
9696
9797 static const char shortopts[] = "G:g:j:k:o:s:v";
9898
99 enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
100 //enum { OPT_HELP = 1, OPT_VERSION };
101
102 static const struct option longopts[] = {
103 { "genome-size", required_argument, NULL, 'G' },
104 { "graph", no_argument, NULL, 'g' },
105 { "greedy", no_argument, &opt::greedy, true },
106 { "no-greedy", no_argument, &opt::greedy, false },
107 { "kmer", required_argument, NULL, 'k' },
108 { "out", required_argument, NULL, 'o' },
109 { "seed-length", required_argument, NULL, 's' },
110 { "threads", required_argument, NULL, 'j' },
111 { "verbose", no_argument, NULL, 'v' },
112 { "help", no_argument, NULL, OPT_HELP },
113 { "version", no_argument, NULL, OPT_VERSION },
114 { "db", required_argument, NULL, OPT_DB },
115 { "library", required_argument, NULL, OPT_LIBRARY },
116 { "strain", required_argument, NULL, OPT_STRAIN },
117 { "species", required_argument, NULL, OPT_SPECIES },
118 { NULL, 0, NULL, 0 }
99 enum
100 {
101 OPT_HELP = 1,
102 OPT_VERSION,
103 OPT_DB,
104 OPT_LIBRARY,
105 OPT_STRAIN,
106 OPT_SPECIES
119107 };
108 // enum { OPT_HELP = 1, OPT_VERSION };
109
110 static const struct option longopts[] = { { "genome-size", required_argument, NULL, 'G' },
111 { "graph", no_argument, NULL, 'g' },
112 { "greedy", no_argument, &opt::greedy, true },
113 { "no-greedy", no_argument, &opt::greedy, false },
114 { "kmer", required_argument, NULL, 'k' },
115 { "out", required_argument, NULL, 'o' },
116 { "seed-length", required_argument, NULL, 's' },
117 { "threads", required_argument, NULL, 'j' },
118 { "verbose", no_argument, NULL, 'v' },
119 { "help", no_argument, NULL, OPT_HELP },
120 { "version", no_argument, NULL, OPT_VERSION },
121 { "db", required_argument, NULL, OPT_DB },
122 { "library", required_argument, NULL, OPT_LIBRARY },
123 { "strain", required_argument, NULL, OPT_STRAIN },
124 { "species", required_argument, NULL, OPT_SPECIES },
125 { NULL, 0, NULL, 0 } };
120126
121127 typedef map<ContigID, ContigPath> ContigPathMap;
122128
123129 /** Orientation of an edge. */
124 enum dir_type {
130 enum dir_type
131 {
125132 DIR_X, // u--v none
126133 DIR_F, // u->v forward
127134 DIR_R, // u<-v reverse
131138 /** Lengths of contigs measured in k-mer. */
132139 typedef vector<unsigned> Lengths;
133140
134 static ContigPath align(const Lengths& lengths,
135 const ContigPath& p1, const ContigPath& p2,
136 ContigNode pivot);
137 static ContigPath align(const Lengths& lengths,
138 const ContigPath& p1, const ContigPath& p2,
139 ContigNode pivot, dir_type& orientation);
141 static ContigPath
142 align(const Lengths& lengths, const ContigPath& p1, const ContigPath& p2, ContigNode pivot);
143 static ContigPath
144 align(
145 const Lengths& lengths,
146 const ContigPath& p1,
147 const ContigPath& p2,
148 ContigNode pivot,
149 dir_type& orientation);
140150
141151 static bool gDebugPrint;
142152
143153 /**
144 * Build a histogram of the lengths of the assembled paths and unused contigs.
145 * Note: This function does not account for the amount of overlap between contigs.
146 */
154 * Build a histogram of the lengths of the assembled paths and unused contigs.
155 * Note: This function does not account for the amount of overlap between contigs.
156 */
147157 static Histogram
148158 buildAssembledLengthHistogram(const Lengths& lengths, const ContigPaths& paths)
149159 {
152162 // Compute the lengths of the paths
153163 // Mark the vertices that are used in paths
154164 vector<bool> used(lengths.size());
155 for (ContigPaths::const_iterator pathIt = paths.begin();
156 pathIt != paths.end(); ++pathIt) {
165 for (ContigPaths::const_iterator pathIt = paths.begin(); pathIt != paths.end(); ++pathIt) {
157166 const ContigPath& path = *pathIt;
158167 size_t totalLength = 0;
159168 for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) {
183192 Histogram h = buildAssembledLengthHistogram(lengths, paths);
184193 const unsigned STATS_MIN_LENGTH = opt::seedLen;
185194 printContiguityStats(cerr, h, STATS_MIN_LENGTH, true, "\t", opt::genomeSize)
186 << '\t' << opt::out << '\n';
195 << '\t' << opt::out << '\n';
187196 }
188197
189198 /** Return all contigs that are tandem repeats, identified as those
190199 * contigs that appear more than once in a single path.
191200 */
192 static set<ContigID> findRepeats(const ContigPathMap& paths)
201 static set<ContigID>
202 findRepeats(const ContigPathMap& paths)
193203 {
194204 set<ContigID> repeats;
195 for (ContigPathMap::const_iterator pathIt = paths.begin();
196 pathIt != paths.end(); ++pathIt) {
205 for (ContigPathMap::const_iterator pathIt = paths.begin(); pathIt != paths.end(); ++pathIt) {
197206 const ContigPath& path = pathIt->second;
198207 map<ContigID, unsigned> count;
199 for (ContigPath::const_iterator it = path.begin();
200 it != path.end(); ++it)
208 for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it)
201209 if (!it->ambiguous())
202210 count[it->contigIndex()]++;
203 for (map<ContigID, unsigned>::const_iterator
204 it = count.begin(); it != count.end(); ++it)
211 for (map<ContigID, unsigned>::const_iterator it = count.begin(); it != count.end(); ++it)
205212 if (it->second > 1)
206213 repeats.insert(it->first);
207214 }
211218 /** Remove tandem repeats from the set of paths.
212219 * @return the removed paths
213220 */
214 static set<ContigID> removeRepeats(ContigPathMap& paths)
221 static set<ContigID>
222 removeRepeats(ContigPathMap& paths)
215223 {
216224 set<ContigID> repeats = findRepeats(paths);
217225 if (gDebugPrint) {
218226 cout << "Repeats:";
219227 if (!repeats.empty()) {
220 for (set<ContigID>::const_iterator it = repeats.begin();
221 it != repeats.end(); ++it)
228 for (set<ContigID>::const_iterator it = repeats.begin(); it != repeats.end(); ++it)
222229 cout << ' ' << get(g_contigNames, *it);
223230 } else
224231 cout << " none";
226233 }
227234
228235 unsigned removed = 0;
229 for (set<ContigID>::const_iterator it = repeats.begin();
230 it != repeats.end(); ++it)
236 for (set<ContigID>::const_iterator it = repeats.begin(); it != repeats.end(); ++it)
231237 if (paths.count(*it) > 0)
232238 removed++;
233239 if (removed == paths.size()) {
238244 }
239245
240246 ostringstream ss;
241 for (set<ContigID>::const_iterator it = repeats.begin();
242 it != repeats.end(); ++it)
247 for (set<ContigID>::const_iterator it = repeats.begin(); it != repeats.end(); ++it)
243248 if (paths.erase(*it) > 0)
244249 ss << ' ' << get(g_contigNames, *it);
245250
248253 return repeats;
249254 }
250255
251 static void appendToMergeQ(deque<ContigNode>& mergeQ,
252 set<ContigNode>& seen, const ContigPath& path)
253 {
254 for (ContigPath::const_iterator it = path.begin();
255 it != path.end(); ++it)
256 static void
257 appendToMergeQ(deque<ContigNode>& mergeQ, set<ContigNode>& seen, const ContigPath& path)
258 {
259 for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it)
256260 if (!it->ambiguous() && seen.insert(*it).second)
257261 mergeQ.push_back(*it);
258262 }
259263
260264 /** A path overlap graph. */
261 typedef ContigGraph<DirectedGraph<> > PathGraph;
265 typedef ContigGraph<DirectedGraph<>> PathGraph;
262266
263267 /** Add an edge if the two paths overlap.
264268 * @param pivot the pivot at which to seed the alignment
265269 * @return whether an overlap was found
266270 */
267 static bool addOverlapEdge(const Lengths& lengths,
268 PathGraph& gout, ContigNode pivot,
269 ContigNode seed1, const ContigPath& path1,
270 ContigNode seed2, const ContigPath& path2)
271 static bool
272 addOverlapEdge(
273 const Lengths& lengths,
274 PathGraph& gout,
275 ContigNode pivot,
276 ContigNode seed1,
277 const ContigPath& path1,
278 ContigNode seed2,
279 const ContigPath& path2)
271280 {
272281 assert(seed1 != seed2);
273282
274283 // Determine the orientation of the overlap edge.
275284 dir_type orientation = DIR_X;
276 ContigPath consensus = align(lengths,
277 path1, path2, pivot, orientation);
285 ContigPath consensus = align(lengths, path1, path2, pivot, orientation);
278286 if (consensus.empty())
279287 return false;
280288 assert(orientation != DIR_X);
281289 if (orientation == DIR_B) {
282290 // One of the paths subsumes the other. Use the order of the
283291 // seeds to determine the orientation of the edge.
284 orientation = find(consensus.begin(), consensus.end(), seed1)
285 < find(consensus.begin(), consensus.end(), seed2)
286 ? DIR_F : DIR_R;
292 orientation = find(consensus.begin(), consensus.end(), seed1) <
293 find(consensus.begin(), consensus.end(), seed2)
294 ? DIR_F
295 : DIR_R;
287296 }
288297 assert(orientation == DIR_F || orientation == DIR_R);
289298
300309 }
301310
302311 /** Return the specified path. */
303 static ContigPath getPath(const ContigPathMap& paths, ContigNode u)
312 static ContigPath
313 getPath(const ContigPathMap& paths, ContigNode u)
304314 {
305315 ContigPathMap::const_iterator it = paths.find(u.contigIndex());
306316 assert(it != paths.end());
311321 }
312322
313323 /** Find the overlaps between paths and add edges to the graph. */
314 static void findPathOverlaps(const Lengths& lengths,
315 const ContigPathMap& paths,
316 const ContigNode& seed1, const ContigPath& path1,
317 PathGraph& gout)
318 {
319 for (ContigPath::const_iterator it = path1.begin();
320 it != path1.end(); ++it) {
324 static void
325 findPathOverlaps(
326 const Lengths& lengths,
327 const ContigPathMap& paths,
328 const ContigNode& seed1,
329 const ContigPath& path1,
330 PathGraph& gout)
331 {
332 for (ContigPath::const_iterator it = path1.begin(); it != path1.end(); ++it) {
321333 ContigNode seed2 = *it;
322334 if (seed1 == seed2)
323335 continue;
324336 if (seed2.ambiguous())
325337 continue;
326 ContigPathMap::const_iterator path2It
327 = paths.find(seed2.contigIndex());
338 ContigPathMap::const_iterator path2It = paths.find(seed2.contigIndex());
328339 if (path2It == paths.end())
329340 continue;
330341
331342 ContigPath path2 = path2It->second;
332343 if (seed2.sense())
333344 reverseComplement(path2.begin(), path2.end());
334 addOverlapEdge(lengths,
335 gout, seed2, seed1, path1, seed2, path2);
345 addOverlapEdge(lengths, gout, seed2, seed1, path1, seed2, path2);
336346 }
337347 }
338348
339349 /** Attempt to merge the paths specified in mergeQ with path.
340350 * @return the number of paths merged
341351 */
342 static unsigned mergePaths(const Lengths& lengths,
343 ContigPath& path,
344 deque<ContigNode>& mergeQ, set<ContigNode>& seen,
345 const ContigPathMap& paths)
352 static unsigned
353 mergePaths(
354 const Lengths& lengths,
355 ContigPath& path,
356 deque<ContigNode>& mergeQ,
357 set<ContigNode>& seen,
358 const ContigPathMap& paths)
346359 {
347360 unsigned merged = 0;
348361 deque<ContigNode> invalid;
349362 for (ContigNode pivot; !mergeQ.empty(); mergeQ.pop_front()) {
350363 pivot = mergeQ.front();
351 ContigPathMap::const_iterator path2It
352 = paths.find(pivot.contigIndex());
364 ContigPathMap::const_iterator path2It = paths.find(pivot.contigIndex());
353365 if (path2It == paths.end())
354366 continue;
355367
366378 path.swap(consensus);
367379 if (gDebugPrint)
368380 #pragma omp critical(cout)
369 cout << get(g_contigNames, pivot)
370 << '\t' << path2 << '\n'
371 << '\t' << path << '\n';
381 cout << get(g_contigNames, pivot) << '\t' << path2 << '\n' << '\t' << path << '\n';
372382 merged++;
373383 }
374384 mergeQ.swap(invalid);
378388 /** Merge the paths of the specified seed path.
379389 * @return the merged contig path
380390 */
381 static ContigPath mergePath(const Lengths& lengths,
382 const ContigPathMap& paths, const ContigPath& seedPath)
391 static ContigPath
392 mergePath(const Lengths& lengths, const ContigPathMap& paths, const ContigPath& seedPath)
383393 {
384394 assert(!seedPath.empty());
385395 ContigNode seed1 = seedPath.front();
386 ContigPathMap::const_iterator path1It
387 = paths.find(seed1.contigIndex());
396 ContigPathMap::const_iterator path1It = paths.find(seed1.contigIndex());
388397 assert(path1It != paths.end());
389398 ContigPath path(path1It->second);
390399 if (seedPath.front().sense())
392401 if (opt::verbose > 1)
393402 #pragma omp critical(cout)
394403 cout << "\n* " << seedPath << '\n'
395 << get(g_contigNames, seedPath.front())
396 << '\t' << path << '\n';
397 for (ContigPath::const_iterator it = seedPath.begin() + 1;
398 it != seedPath.end(); ++it) {
404 << get(g_contigNames, seedPath.front()) << '\t' << path << '\n';
405 for (ContigPath::const_iterator it = seedPath.begin() + 1; it != seedPath.end(); ++it) {
399406 ContigNode seed2 = *it;
400 ContigPathMap::const_iterator path2It
401 = paths.find(seed2.contigIndex());
407 ContigPathMap::const_iterator path2It = paths.find(seed2.contigIndex());
402408 assert(path2It != paths.end());
403409 ContigPath path2 = path2It->second;
404410 if (seed2.sense())
405411 reverseComplement(path2.begin(), path2.end());
406412
407 ContigNode pivot
408 = find(path.begin(), path.end(), seed2) != path.end()
409 ? seed2 : seed1;
413 ContigNode pivot = find(path.begin(), path.end(), seed2) != path.end() ? seed2 : seed1;
410414 ContigPath consensus = align(lengths, path, path2, pivot);
411415 if (consensus.empty()) {
412416 // This seed could be removed from the seed path.
413417 if (opt::verbose > 1)
414418 #pragma omp critical(cout)
415 cout << get(g_contigNames, seed2)
416 << '\t' << path2 << '\n'
417 << "\tinvalid\n";
419 cout << get(g_contigNames, seed2) << '\t' << path2 << '\n' << "\tinvalid\n";
418420 } else {
419421 path.swap(consensus);
420422 if (opt::verbose > 1)
421423 #pragma omp critical(cout)
422 cout << get(g_contigNames, seed2)
423 << '\t' << path2 << '\n'
424 << '\t' << path << '\n';
424 cout << get(g_contigNames, seed2) << '\t' << path2 << '\n' << '\t' << path << '\n';
425425 }
426426 seed1 = seed2;
427427 }
434434 /** Merge the specified seed paths.
435435 * @return the merged contig paths
436436 */
437 static ContigPaths mergeSeedPaths(const Lengths& lengths,
438 const ContigPathMap& paths, const ContigPaths& seedPaths)
437 static ContigPaths
438 mergeSeedPaths(const Lengths& lengths, const ContigPathMap& paths, const ContigPaths& seedPaths)
439439 {
440440 if (opt::verbose > 0)
441441 cout << "\nMerging paths\n";
442442
443443 ContigPaths out;
444444 out.reserve(seedPaths.size());
445 for (ContigPaths::const_iterator it = seedPaths.begin();
446 it != seedPaths.end(); ++it)
445 for (ContigPaths::const_iterator it = seedPaths.begin(); it != seedPaths.end(); ++it)
447446 out.push_back(mergePath(lengths, paths, *it));
448447 return out;
449448 }
451450 /** Extend the specified path as long as is unambiguously possible and
452451 * add the result to the specified container.
453452 */
454 static void extendPaths(const Lengths& lengths,
455 ContigID id, const ContigPathMap& paths,
456 ContigPathMap& out)
453 static void
454 extendPaths(const Lengths& lengths, ContigID id, const ContigPathMap& paths, ContigPathMap& out)
457455 {
458456 ContigPathMap::const_iterator pathIt = paths.find(id);
459457 assert(pathIt != paths.end());
460458
461459 pair<ContigPathMap::iterator, bool> inserted;
462 #pragma omp critical(out)
460 #pragma omp critical(out)
463461 inserted = out.insert(*pathIt);
464462 assert(inserted.second);
465463 ContigPath& path = inserted.first->second;
466464
467465 if (gDebugPrint)
468 #pragma omp critical(cout)
469 cout << "\n* " << get(g_contigNames, id) << "+\n"
470 << '\t' << path << '\n';
466 #pragma omp critical(cout)
467 cout << "\n* " << get(g_contigNames, id) << "+\n" << '\t' << path << '\n';
471468
472469 set<ContigNode> seen;
473470 seen.insert(ContigNode(id, false));
477474 ;
478475
479476 if (!mergeQ.empty() && gDebugPrint) {
480 #pragma omp critical(cout)
477 #pragma omp critical(cout)
481478 {
482479 cout << "invalid\n";
483 for (deque<ContigNode>::const_iterator it
484 = mergeQ.begin(); it != mergeQ.end(); ++it)
485 cout << get(g_contigNames, *it) << '\t'
486 << paths.find(it->contigIndex())->second << '\n';
480 for (deque<ContigNode>::const_iterator it = mergeQ.begin(); it != mergeQ.end(); ++it)
481 cout << get(g_contigNames, *it) << '\t' << paths.find(it->contigIndex())->second
482 << '\n';
487483 }
488484 }
489485 }
490486
491487 /** Return true if the contigs are equal or both are ambiguous. */
492 static bool equalOrBothAmbiguos(const ContigNode& a,
493 const ContigNode& b)
488 static bool
489 equalOrBothAmbiguos(const ContigNode& a, const ContigNode& b)
494490 {
495491 return a == b || (a.ambiguous() && b.ambiguous());
496492 }
497493
498494 /** Return true if both paths are equal, ignoring ambiguous nodes. */
499 static bool equalIgnoreAmbiguos(const ContigPath& a,
500 const ContigPath& b)
501 {
502 return a.size() == b.size()
503 && equal(a.begin(), a.end(), b.begin(), equalOrBothAmbiguos);
495 static bool
496 equalIgnoreAmbiguos(const ContigPath& a, const ContigPath& b)
497 {
498 return a.size() == b.size() && equal(a.begin(), a.end(), b.begin(), equalOrBothAmbiguos);
504499 }
505500
506501 /** Return whether this path is a cycle. */
507 static bool isCycle(const Lengths& lengths, const ContigPath& path)
502 static bool
503 isCycle(const Lengths& lengths, const ContigPath& path)
508504 {
509505 return !align(lengths, path, path, path.front()).empty();
510506 }
513509 * @param overlaps [out] paths that are found to overlap
514510 * @return the ID of the subsuming path
515511 */
516 static ContigID identifySubsumedPaths(const Lengths& lengths,
517 ContigPathMap::const_iterator path1It,
518 ContigPathMap& paths,
519 set<ContigID>& out,
520 set<ContigID>& overlaps)
512 static ContigID
513 identifySubsumedPaths(
514 const Lengths& lengths,
515 ContigPathMap::const_iterator path1It,
516 ContigPathMap& paths,
517 set<ContigID>& out,
518 set<ContigID>& overlaps)
521519 {
522520 ostringstream vout;
523521 out.clear();
524522 ContigID id(path1It->first);
525523 const ContigPath& path = path1It->second;
526524 if (gDebugPrint)
527 vout << get(g_contigNames, ContigNode(id, false))
528 << '\t' << path << '\n';
529
530 for (ContigPath::const_iterator it = path.begin();
531 it != path.end(); ++it) {
525 vout << get(g_contigNames, ContigNode(id, false)) << '\t' << path << '\n';
526
527 for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) {
532528 ContigNode pivot = *it;
533529 if (pivot.ambiguous() || pivot.id() == id)
534530 continue;
535 ContigPathMap::iterator path2It
536 = paths.find(pivot.contigIndex());
531 ContigPathMap::iterator path2It = paths.find(pivot.contigIndex());
537532 if (path2It == paths.end())
538533 continue;
539534 ContigPath path2 = path2It->second;
544539 continue;
545540 if (equalIgnoreAmbiguos(consensus, path)) {
546541 if (gDebugPrint)
547 vout << get(g_contigNames, pivot)
548 << '\t' << path2 << '\n';
542 vout << get(g_contigNames, pivot) << '\t' << path2 << '\n';
549543 out.insert(path2It->first);
550544 } else if (equalIgnoreAmbiguos(consensus, path2)) {
551545 // This path is larger. Use it as the seed.
552 return identifySubsumedPaths(lengths, path2It, paths, out,
553 overlaps);
546 return identifySubsumedPaths(lengths, path2It, paths, out, overlaps);
554547 } else if (isCycle(lengths, consensus)) {
555548 // The consensus path is a cycle.
556549 bool isCyclePath1 = isCycle(lengths, path);
558551 if (!isCyclePath1 && !isCyclePath2) {
559552 // Neither path is a cycle.
560553 if (gDebugPrint)
561 vout << get(g_contigNames, pivot)
562 << '\t' << path2 << '\n'
563 << "ignored\t" << consensus << '\n';
554 vout << get(g_contigNames, pivot) << '\t' << path2 << '\n'
555 << "ignored\t" << consensus << '\n';
564556 overlaps.insert(id);
565557 overlaps.insert(path2It->first);
566558 } else {
567559 // At least one path is a cycle.
568560 if (gDebugPrint)
569 vout << get(g_contigNames, pivot)
570 << '\t' << path2 << '\n'
571 << "cycle\t" << consensus << '\n';
561 vout << get(g_contigNames, pivot) << '\t' << path2 << '\n'
562 << "cycle\t" << consensus << '\n';
572563 if (isCyclePath1 && isCyclePath2)
573564 out.insert(path2It->first);
574565 else if (!isCyclePath1)
578569 }
579570 } else {
580571 if (gDebugPrint)
581 vout << get(g_contigNames, pivot)
582 << '\t' << path2 << '\n'
583 << "ignored\t" << consensus << '\n';
572 vout << get(g_contigNames, pivot) << '\t' << path2 << '\n'
573 << "ignored\t" << consensus << '\n';
584574 overlaps.insert(id);
585575 overlaps.insert(path2It->first);
586576 }
594584 * @param overlaps [out] paths that are found to overlap
595585 * @return the next iterator after path1it
596586 */
597 static ContigPathMap::const_iterator removeSubsumedPaths(
598 const Lengths& lengths,
599 ContigPathMap::const_iterator path1It, ContigPathMap& paths,
600 ContigID& seed, set<ContigID>& overlaps)
587 static ContigPathMap::const_iterator
588 removeSubsumedPaths(
589 const Lengths& lengths,
590 ContigPathMap::const_iterator path1It,
591 ContigPathMap& paths,
592 ContigID& seed,
593 set<ContigID>& overlaps)
601594 {
602595 if (gDebugPrint)
603596 cout << '\n';
604597 set<ContigID> eq;
605 seed = identifySubsumedPaths(lengths,
606 path1It, paths, eq, overlaps);
598 seed = identifySubsumedPaths(lengths, path1It, paths, eq, overlaps);
607599 ++path1It;
608 for (set<ContigID>::const_iterator it = eq.begin();
609 it != eq.end(); ++it) {
600 for (set<ContigID>::const_iterator it = eq.begin(); it != eq.end(); ++it) {
610601 if (*it == path1It->first)
611602 ++path1It;
612603 paths.erase(*it);
617608 /** Remove paths subsumed by another path.
618609 * @return paths that are found to overlap
619610 */
620 static set<ContigID> removeSubsumedPaths(const Lengths& lengths,
621 ContigPathMap& paths)
611 static set<ContigID>
612 removeSubsumedPaths(const Lengths& lengths, ContigPathMap& paths)
622613 {
623614 set<ContigID> overlaps, seen;
624 for (ContigPathMap::const_iterator iter = paths.begin();
625 iter != paths.end();) {
615 for (ContigPathMap::const_iterator iter = paths.begin(); iter != paths.end();) {
626616 if (seen.count(iter->first) == 0) {
627617 ContigID seed;
628 iter = removeSubsumedPaths(lengths,
629 iter, paths, seed, overlaps);
618 iter = removeSubsumedPaths(lengths, iter, paths, seed, overlaps);
630619 seen.insert(seed);
631620 } else
632621 ++iter;
638627 * outgoing edges, (u,v1) and (u,v2), add the edge (v1,v2) if v1 < v2,
639628 * and add the edge (v2,v1) if v2 < v1.
640629 */
641 static void addMissingEdges(const Lengths& lengths,
642 PathGraph& g, const ContigPathMap& paths)
630 static void
631 addMissingEdges(const Lengths& lengths, PathGraph& g, const ContigPathMap& paths)
643632 {
644633 typedef graph_traits<PathGraph>::adjacency_iterator Vit;
645634 typedef graph_traits<PathGraph>::vertex_iterator Uit;
657646 ++vit1;
658647 assert(v1 != u);
659648 ContigPath path1 = getPath(paths, v1);
660 if (find(path1.begin(), path1.end(),
661 ContigPath::value_type(u)) == path1.end())
649 if (find(path1.begin(), path1.end(), ContigPath::value_type(u)) == path1.end())
662650 continue;
663651 for (Vit vit2 = vit1; vit2 != vrange.second; ++vit2) {
664652 V v2 = *vit2;
667655 if (edge(v1, v2, g).second || edge(v2, v1, g).second)
668656 continue;
669657 ContigPath path2 = getPath(paths, v2);
670 if (find(path2.begin(), path2.end(),
671 ContigPath::value_type(u)) == path2.end())
658 if (find(path2.begin(), path2.end(), ContigPath::value_type(u)) == path2.end())
672659 continue;
673 numAdded += addOverlapEdge(lengths,
674 g, u, v1, path1, v2, path2);
660 numAdded += addOverlapEdge(lengths, g, u, v1, path1, v2, path2);
675661 }
676662 }
677663 }
682668 }
683669
684670 /** Remove transitive edges. */
685 static void removeTransitiveEdges(PathGraph& pathGraph)
671 static void
672 removeTransitiveEdges(PathGraph& pathGraph)
686673 {
687674 unsigned nbefore = num_edges(pathGraph);
688675 unsigned nremoved = remove_transitive_edges(pathGraph);
689676 unsigned nafter = num_edges(pathGraph);
690677 if (opt::verbose > 0)
691 cout << "Removed " << nremoved << " transitive edges of "
692 << nbefore << " edges leaving "
693 << nafter << " edges.\n";
678 cout << "Removed " << nremoved << " transitive edges of " << nbefore << " edges leaving "
679 << nafter << " edges.\n";
694680 assert(nbefore - nremoved == nafter);
695681 if (!opt::db.empty()) {
696682 addToDb(db, "Edges_init", nbefore);
702688 * Remove the edge (u,v) if deg+(u) > 1 and deg-(v) > 1 and the
703689 * overlap of (u,v) is small.
704690 */
705 static void removeSmallOverlaps(PathGraph& g,
706 const ContigPathMap& paths)
691 static void
692 removeSmallOverlaps(PathGraph& g, const ContigPathMap& paths)
707693 {
708694 typedef graph_traits<PathGraph>::edge_descriptor E;
709695 typedef graph_traits<PathGraph>::out_edge_iterator Eit;
725711 if (in_degree(v, g) < 2)
726712 continue;
727713 ContigPath pathv = getPath(paths, v);
728 if (pathu.back() == pathv.front()
729 && paths.count(pathu.back().contigIndex()) > 0)
714 if (pathu.back() == pathv.front() && paths.count(pathu.back().contigIndex()) > 0)
730715 edges.push_back(uv);
731716 }
732717 }
733718 remove_edges(g, edges.begin(), edges.end());
734719 if (opt::verbose > 0)
735 cout << "Removed " << edges.size()
736 << " small overlap edges.\n";
720 cout << "Removed " << edges.size() << " small overlap edges.\n";
737721 if (!opt::db.empty())
738722 addToDb(db, "Edges_removed_small_overlap", edges.size());
739723 }
740724
741725 /** Output the path overlap graph. */
742 static void outputPathGraph(PathGraph& pathGraph)
726 static void
727 outputPathGraph(PathGraph& pathGraph)
743728 {
744729 if (opt::graphPath.empty())
745730 return;
750735 }
751736
752737 /** Sort and output the specified paths. */
753 static void outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths)
738 static void
739 outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths)
754740 {
755741 // Sort the paths.
756742 vector<ContigPath> sortedPaths(paths.size());
757 transform(paths.begin(), paths.end(), sortedPaths.begin(),
758 mem_var(&ContigPathMap::value_type::second));
743 transform(
744 paths.begin(),
745 paths.end(),
746 sortedPaths.begin(),
747 mem_var(&ContigPathMap::value_type::second));
759748 sort(sortedPaths.begin(), sortedPaths.end());
760749
761750 // Output the paths.
762751 ofstream fout(opt::out.c_str());
763752 ostream& out = opt::out.empty() ? cout : fout;
764753 assert_good(out, opt::out);
765 for (vector<ContigPath>::const_iterator it = sortedPaths.begin();
766 it != sortedPaths.end(); ++it)
754 for (vector<ContigPath>::const_iterator it = sortedPaths.begin(); it != sortedPaths.end(); ++it)
767755 out << createContigName() << '\t' << *it << '\n';
768756 assert_good(out, opt::out);
769757
772760 }
773761
774762 /** Assemble the path overlap graph. */
775 static void assemblePathGraph(const Lengths& lengths,
776 PathGraph& pathGraph, ContigPathMap& paths)
763 static void
764 assemblePathGraph(const Lengths& lengths, PathGraph& pathGraph, ContigPathMap& paths)
777765 {
778766 ContigPaths seedPaths;
779767 assembleDFS(pathGraph, back_inserter(seedPaths));
780 ContigPaths mergedPaths = mergeSeedPaths(lengths,
781 paths, seedPaths);
768 ContigPaths mergedPaths = mergeSeedPaths(lengths, paths, seedPaths);
782769 if (opt::verbose > 1)
783770 cout << '\n';
784771
785772 // Replace each path with the merged path.
786 for (ContigPaths::const_iterator it1 = seedPaths.begin();
787 it1 != seedPaths.end(); ++it1) {
773 for (ContigPaths::const_iterator it1 = seedPaths.begin(); it1 != seedPaths.end(); ++it1) {
788774 const ContigPath& path(mergedPaths[it1 - seedPaths.begin()]);
789775 ContigPath pathrc(path);
790776 reverseComplement(pathrc.begin(), pathrc.end());
791 for (ContigPath::const_iterator it2 = it1->begin();
792 it2 != it1->end(); ++it2) {
777 for (ContigPath::const_iterator it2 = it1->begin(); it2 != it1->end(); ++it2) {
793778 ContigNode seed(*it2);
794779 if (find(path.begin(), path.end(), seed) != path.end()) {
795 paths[seed.contigIndex()]
796 = seed.sense() ? pathrc : path;
780 paths[seed.contigIndex()] = seed.sense() ? pathrc : path;
797781 } else {
798782 // This seed was not included in the merged path.
799783 }
811795 }
812796
813797 /** Read a set of paths from the specified file. */
814 static ContigPathMap readPaths(const Lengths& lengths,
815 const string& filePath)
798 static ContigPathMap
799 readPaths(const Lengths& lengths, const string& filePath)
816800 {
817801 if (opt::verbose > 0)
818802 cerr << "Reading `" << filePath << "'..." << endl;
832816 continue;
833817 }
834818
835 bool inserted = paths.insert(
836 make_pair(id, path)).second;
819 bool inserted = paths.insert(make_pair(id, path)).second;
837820 assert(inserted);
838821 (void)inserted;
839822 }
840823 assert(in.eof());
841824
842825 if (opt::seedLen > 0)
843 cout << "Ignored " << tooSmall
844 << " paths whose seeds are shorter than "
845 << opt::seedLen << " bp.\n";
826 cout << "Ignored " << tooSmall << " paths whose seeds are shorter than " << opt::seedLen
827 << " bp.\n";
846828 return paths;
847829 }
848830
850832 * @return true if out != last
851833 */
852834 template<class T1, class T2, class T3>
853 bool atomicInc(T1& it, T2 last, T3& out)
854 {
855 #pragma omp critical(atomicInc)
835 bool
836 atomicInc(T1& it, T2 last, T3& out)
837 {
838 #pragma omp critical(atomicInc)
856839 out = it == last ? it : it++;
857840 return out != last;
858841 }
859842
860843 /** Build the path overlap graph. */
861 static void buildPathGraph(const Lengths& lengths,
862 PathGraph& g, const ContigPathMap& paths)
844 static void
845 buildPathGraph(const Lengths& lengths, PathGraph& g, const ContigPathMap& paths)
863846 {
864847 // Create the vertices of the path overlap graph.
865848 PathGraph(lengths.size()).swap(g);
873856
874857 // Find the overlapping paths.
875858 ContigPathMap::const_iterator sharedIt = paths.begin();
876 #pragma omp parallel
877 for (ContigPathMap::const_iterator it;
878 atomicInc(sharedIt, paths.end(), it);)
879 findPathOverlaps(lengths, paths,
880 ContigNode(it->first, false), it->second, g);
859 #pragma omp parallel
860 for (ContigPathMap::const_iterator it; atomicInc(sharedIt, paths.end(), it);)
861 findPathOverlaps(lengths, paths, ContigNode(it->first, false), it->second, g);
881862 if (gDebugPrint)
882863 cout << '\n';
883864
889870
890871 // graph statistics
891872 vector<int> vals = passGraphStatsVal(g);
892 vector<string> keys = make_vector<string>()
893 << "V"
894 << "E"
895 << "degree0pctg"
896 << "degree1pctg"
897 << "degree234pctg"
898 << "degree5pctg"
899 << "degree_max";
873 vector<string> keys = make_vector<string>() << "V"
874 << "E"
875 << "degree0pctg"
876 << "degree1pctg"
877 << "degree234pctg"
878 << "degree5pctg"
879 << "degree_max";
900880 if (!opt::db.empty()) {
901 for (unsigned i=0; i<vals.size(); i++)
881 for (unsigned i = 0; i < vals.size(); i++)
902882 addToDb(db, keys[i], vals[i]);
903883 }
904884 outputPathGraph(g);
905885 }
906886
907887 /** Read contig lengths. */
908 static Lengths readContigLengths(istream& in)
888 static Lengths
889 readContigLengths(istream& in)
909890 {
910891 assert(in);
911892 assert(g_contigNames.empty());
925906 }
926907
927908 /** Read contig lengths. */
928 static Lengths readContigLengths(const string& path)
909 static Lengths
910 readContigLengths(const string& path)
929911 {
930912 ifstream fin(path.c_str());
931913 if (path != "-")
934916 return readContigLengths(in);
935917 }
936918
937 int main(int argc, char** argv)
919 int
920 main(int argc, char** argv)
938921 {
939922 if (!opt::db.empty())
940923 opt::metaVars.resize(3);
941924
942925 bool die = false;
943 for (int c; (c = getopt_long(argc, argv,
944 shortopts, longopts, NULL)) != -1;) {
926 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
945927 istringstream arg(optarg != NULL ? optarg : "");
946928 switch (c) {
947 case '?': die = true; break;
948 case 'G': {
949 double x;
950 arg >> x;
951 opt::genomeSize = x;
952 break;
953 }
954 case 'g': arg >> opt::graphPath; break;
955 case 'j': arg >> opt::threads; break;
956 case 'k': arg >> opt::k; break;
957 case 'o': arg >> opt::out; break;
958 case 's': arg >> opt::seedLen; break;
959 case 'v': opt::verbose++; break;
960 case OPT_HELP:
961 cout << USAGE_MESSAGE;
962 exit(EXIT_SUCCESS);
963 case OPT_VERSION:
964 cout << VERSION_MESSAGE;
965 exit(EXIT_SUCCESS);
966 case OPT_DB:
967 arg >> opt::db; break;
968 case OPT_LIBRARY:
969 arg >> opt::metaVars[0]; break;
970 case OPT_STRAIN:
971 arg >> opt::metaVars[1]; break;
972 case OPT_SPECIES:
973 arg >> opt::metaVars[2]; break;
929 case '?':
930 die = true;
931 break;
932 case 'G': {
933 double x;
934 arg >> x;
935 opt::genomeSize = x;
936 break;
937 }
938 case 'g':
939 arg >> opt::graphPath;
940 break;
941 case 'j':
942 arg >> opt::threads;
943 break;
944 case 'k':
945 arg >> opt::k;
946 break;
947 case 'o':
948 arg >> opt::out;
949 break;
950 case 's':
951 arg >> opt::seedLen;
952 break;
953 case 'v':
954 opt::verbose++;
955 break;
956 case OPT_HELP:
957 cout << USAGE_MESSAGE;
958 exit(EXIT_SUCCESS);
959 case OPT_VERSION:
960 cout << VERSION_MESSAGE;
961 exit(EXIT_SUCCESS);
962 case OPT_DB:
963 arg >> opt::db;
964 break;
965 case OPT_LIBRARY:
966 arg >> opt::metaVars[0];
967 break;
968 case OPT_STRAIN:
969 arg >> opt::metaVars[1];
970 break;
971 case OPT_SPECIES:
972 arg >> opt::metaVars[2];
973 break;
974974 }
975975 if (optarg != NULL && !arg.eof()) {
976 cerr << PROGRAM ": invalid option: `-"
977 << (char)c << optarg << "'\n";
976 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
978977 exit(EXIT_FAILURE);
979978 }
980979 }
993992 }
994993
995994 if (die) {
996 cerr << "Try `" << PROGRAM
997 << " --help' for more information.\n";
995 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
998996 exit(EXIT_FAILURE);
999997 }
1000998
10121010 cerr << "Reading `" << argv[optind] << "'..." << endl;
10131011
10141012 if (!opt::db.empty()) {
1015 init(db,
1016 opt::db,
1017 opt::verbose,
1018 PROGRAM,
1019 opt::getCommand(argc, argv),
1020 opt::metaVars
1021 );
1013 init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
10221014 }
10231015
10241016 Lengths lengths = readContigLengths(argv[optind++]);
1025 ContigPathMap originalPathMap = readPaths(
1026 lengths, argv[optind++]);
1017 ContigPathMap originalPathMap = readPaths(lengths, argv[optind++]);
10271018
10281019 removeRepeats(originalPathMap);
10291020
10411032 ContigPathMap resultsPathMap;
10421033 #if _OPENMP
10431034 ContigPathMap::iterator sharedIt = originalPathMap.begin();
1044 #pragma omp parallel
1045 for (ContigPathMap::iterator it;
1046 atomicInc(sharedIt, originalPathMap.end(), it);)
1047 extendPaths(lengths,
1048 it->first, originalPathMap, resultsPathMap);
1035 #pragma omp parallel
1036 for (ContigPathMap::iterator it; atomicInc(sharedIt, originalPathMap.end(), it);)
1037 extendPaths(lengths, it->first, originalPathMap, resultsPathMap);
10491038 #else
1050 for (ContigPathMap::const_iterator it = originalPathMap.begin();
1051 it != originalPathMap.end(); ++it)
1052 extendPaths(lengths,
1053 it->first, originalPathMap, resultsPathMap);
1039 for (ContigPathMap::const_iterator it = originalPathMap.begin(); it != originalPathMap.end();
1040 ++it)
1041 extendPaths(lengths, it->first, originalPathMap, resultsPathMap);
10541042 #endif
10551043 if (gDebugPrint)
10561044 cout << '\n';
10591047
10601048 if (gDebugPrint)
10611049 cout << "\nRemoving redundant contigs\n";
1062 set<ContigID> overlaps = removeSubsumedPaths(lengths,
1063 resultsPathMap);
1050 set<ContigID> overlaps = removeSubsumedPaths(lengths, resultsPathMap);
10641051
10651052 if (!overlaps.empty() && !repeats.empty()) {
10661053 // Remove the newly-discovered repeat contigs from the
10671054 // original paths.
1068 for (set<ContigID>::const_iterator it = repeats.begin();
1069 it != repeats.end(); ++it)
1055 for (set<ContigID>::const_iterator it = repeats.begin(); it != repeats.end(); ++it)
10701056 originalPathMap.erase(*it);
10711057
10721058 // Reassemble the paths that were found to overlap.
10731059 if (gDebugPrint) {
10741060 cout << "\nReassembling overlapping contigs:";
1075 for (set<ContigID>::const_iterator it = overlaps.begin();
1076 it != overlaps.end(); ++it)
1061 for (set<ContigID>::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it)
10771062 cout << ' ' << get(g_contigNames, *it);
10781063 cout << '\n';
10791064 }
10801065
1081 for (set<ContigID>::const_iterator it = overlaps.begin();
1082 it != overlaps.end(); ++it) {
1066 for (set<ContigID>::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) {
10831067 if (originalPathMap.count(*it) == 0)
10841068 continue; // repeat
10851069 ContigPathMap::iterator oldIt = resultsPathMap.find(*it);
10871071 continue; // subsumed
10881072 ContigPath old = oldIt->second;
10891073 resultsPathMap.erase(oldIt);
1090 extendPaths(lengths,
1091 *it, originalPathMap, resultsPathMap);
1074 extendPaths(lengths, *it, originalPathMap, resultsPathMap);
10921075 if (gDebugPrint) {
10931076 if (resultsPathMap[*it] == old)
10941077 cout << "no change\n";
11031086 overlaps = removeSubsumedPaths(lengths, resultsPathMap);
11041087 if (!overlaps.empty() && gDebugPrint) {
11051088 cout << "\nOverlapping contigs:";
1106 for (set<ContigID>::const_iterator it = overlaps.begin();
1107 it != overlaps.end(); ++it)
1089 for (set<ContigID>::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it)
11081090 cout << ' ' << get(g_contigNames, *it);
11091091 cout << '\n';
11101092 }
11161098 }
11171099
11181100 /** Return the length of the specified contig in k-mer. */
1119 static unsigned getLength(const Lengths& lengths,
1120 const ContigNode& u)
1121 {
1122 return u.ambiguous() ? u.length()
1123 : lengths.at(u.id());
1101 static unsigned
1102 getLength(const Lengths& lengths, const ContigNode& u)
1103 {
1104 return u.ambiguous() ? u.length() : lengths.at(u.id());
11241105 }
11251106
11261107 /** Functor to add the number of k-mer in two contigs. */
11271108 struct AddLength
11281109 {
1129 AddLength(const Lengths& lengths) : m_lengths(lengths) { }
1110 AddLength(const Lengths& lengths)
1111 : m_lengths(lengths)
1112 {}
11301113 unsigned operator()(unsigned addend, const ContigNode& u) const
11311114 {
11321115 return addend + getLength(m_lengths, u);
11331116 }
1117
11341118 private:
11351119 const Lengths& m_lengths;
11361120 };
11401124 * found.
11411125 * @return true if an alignment is found
11421126 */
1143 template <class iterator, class oiterator>
1144 static bool alignCoordinates(const Lengths& lengths,
1145 iterator& first1, iterator last1,
1146 iterator& first2, iterator last2, oiterator& result)
1127 template<class iterator, class oiterator>
1128 static bool
1129 alignCoordinates(
1130 const Lengths& lengths,
1131 iterator& first1,
1132 iterator last1,
1133 iterator& first2,
1134 iterator last2,
1135 oiterator& result)
11471136 {
11481137 oiterator out = result;
11491138
12051194 * the consensus at out if an alignment is found.
12061195 * @return true if an alignment is found
12071196 */
1208 template <class iterator, class oiterator>
1209 static bool buildConsensus(const Lengths& lengths,
1210 iterator it1, iterator it1e,
1211 iterator it2, iterator it2e, oiterator& out)
1197 template<class iterator, class oiterator>
1198 static bool
1199 buildConsensus(
1200 const Lengths& lengths,
1201 iterator it1,
1202 iterator it1e,
1203 iterator it2,
1204 iterator it2e,
1205 oiterator& out)
12121206 {
12131207 iterator it1b = it1 + 1;
12141208 assert(!it1b->ambiguous());
12281222
12291223 unsigned ambiguous1 = it1->length();
12301224 unsigned ambiguous2 = it2a->length();
1231 unsigned unambiguous1 = accumulate(it1b, it1e,
1232 0, AddLength(lengths));
1233 unsigned unambiguous2 = accumulate(it2, it2a,
1234 0, AddLength(lengths));
1235 if (ambiguous1 < unambiguous2
1236 || ambiguous2 < unambiguous1) {
1225 unsigned unambiguous1 = accumulate(it1b, it1e, 0, AddLength(lengths));
1226 unsigned unambiguous2 = accumulate(it2, it2a, 0, AddLength(lengths));
1227 if (ambiguous1 < unambiguous2 || ambiguous2 < unambiguous1) {
12371228 // Two gaps overlap and either of the gaps is smaller
12381229 // than the unambiguous sequence that overlaps the
12391230 // gap. No alignment.
12401231 return false;
12411232 }
12421233
1243 unsigned n = max(1U,
1244 max(ambiguous2 - unambiguous1,
1245 ambiguous1 - unambiguous2));
1234 unsigned n = max(1U, max(ambiguous2 - unambiguous1, ambiguous1 - unambiguous2));
12461235 out = copy(it2, it2a, out);
12471236 *out++ = ContigNode(n, 'N');
12481237 out = copy(it1b, it1e, out);
12541243 * in it1 and it2.
12551244 * @return true if an alignment is found
12561245 */
1257 template <class iterator, class oiterator>
1258 static bool alignAtSeed(const Lengths& lengths,
1259 iterator& it1, iterator it1e, iterator last1,
1260 iterator& it2, iterator last2, oiterator& out)
1246 template<class iterator, class oiterator>
1247 static bool
1248 alignAtSeed(
1249 const Lengths& lengths,
1250 iterator& it1,
1251 iterator it1e,
1252 iterator last1,
1253 iterator& it2,
1254 iterator last2,
1255 oiterator& out)
12611256 {
12621257 assert(it1 != last1);
12631258 assert(it1->ambiguous());
12691264 // fewest number of contigs in the consensus sequence.
12701265 unsigned bestLen = UINT_MAX;
12711266 iterator bestIt2e;
1272 for (iterator it2e = it2;
1273 (it2e = find(it2e, last2, *it1e)) != last2; ++it2e) {
1267 for (iterator it2e = it2; (it2e = find(it2e, last2, *it1e)) != last2; ++it2e) {
12741268 oiterator myOut = out;
1275 if (buildConsensus(lengths, it1, it1e, it2, it2e, myOut)
1276 && align(lengths, it1e, last1, it2e, last2, myOut)) {
1269 if (buildConsensus(lengths, it1, it1e, it2, it2e, myOut) &&
1270 align(lengths, it1e, last1, it2e, last2, myOut)) {
12771271 unsigned len = myOut - out;
12781272 if (len <= bestLen) {
12791273 bestLen = len;
12821276 }
12831277 }
12841278 if (bestLen != UINT_MAX) {
1285 bool good = buildConsensus(lengths,
1286 it1, it1e, it2, bestIt2e, out);
1279 bool good = buildConsensus(lengths, it1, it1e, it2, bestIt2e, out);
12871280 assert(good);
12881281 it1 = it1e;
12891282 it2 = bestIt2e;
12961289 * The end of the alignment is returned in it1 and it2.
12971290 * @return true if an alignment is found
12981291 */
1299 template <class iterator, class oiterator>
1300 static bool alignAmbiguous(const Lengths& lengths,
1301 iterator& it1, iterator last1,
1302 iterator& it2, iterator last2, oiterator& out)
1292 template<class iterator, class oiterator>
1293 static bool
1294 alignAmbiguous(
1295 const Lengths& lengths,
1296 iterator& it1,
1297 iterator last1,
1298 iterator& it2,
1299 iterator last2,
1300 oiterator& out)
13031301 {
13041302 assert(it1 != last1);
13051303 assert(it1->ambiguous());
13231321 * The end of the alignment is returned in it1 and it2.
13241322 * @return true if an alignment is found
13251323 */
1326 template <class iterator, class oiterator>
1327 static bool alignOne(const Lengths& lengths,
1328 iterator& it1, iterator last1,
1329 iterator& it2, iterator last2, oiterator& out)
1324 template<class iterator, class oiterator>
1325 static bool
1326 alignOne(
1327 const Lengths& lengths,
1328 iterator& it1,
1329 iterator last1,
1330 iterator& it2,
1331 iterator last2,
1332 oiterator& out)
13301333 {
13311334 // Check for a trivial alignment.
13321335 unsigned n1 = last1 - it1, n2 = last2 - it2;
13461349 return true;
13471350 }
13481351
1349 return
1350 it1->ambiguous() && it2->ambiguous()
1351 ? (it1->length() > it2->length()
1352 ? alignAmbiguous(lengths, it1, last1, it2, last2, out)
1353 : alignAmbiguous(lengths, it2, last2, it1, last1, out)
1354 )
1355 : it1->ambiguous()
1356 ? alignAmbiguous(lengths, it1, last1, it2, last2, out)
1357 : it2->ambiguous()
1358 ? alignAmbiguous(lengths, it2, last2, it1, last1, out)
1359 : (*out++ = *it1, *it1++ == *it2++);
1352 return it1->ambiguous() && it2->ambiguous()
1353 ? (it1->length() > it2->length()
1354 ? alignAmbiguous(lengths, it1, last1, it2, last2, out)
1355 : alignAmbiguous(lengths, it2, last2, it1, last1, out))
1356 : it1->ambiguous()
1357 ? alignAmbiguous(lengths, it1, last1, it2, last2, out)
1358 : it2->ambiguous() ? alignAmbiguous(lengths, it2, last2, it1, last1, out)
1359 : (*out++ = *it1, *it1++ == *it2++);
13601360 }
13611361
13621362 /** Align the ambiguous region [it1, last1) to [it2, last2)
13641364 * @return the orientation of the alignment if an alignment is found
13651365 * or zero otherwise
13661366 */
1367 template <class iterator, class oiterator>
1368 static dir_type align(const Lengths& lengths,
1369 iterator it1, iterator last1,
1370 iterator it2, iterator last2, oiterator& out)
1367 template<class iterator, class oiterator>
1368 static dir_type
1369 align(
1370 const Lengths& lengths,
1371 iterator it1,
1372 iterator last1,
1373 iterator it2,
1374 iterator last2,
1375 oiterator& out)
13711376 {
13721377 assert(it1 != last1);
13731378 assert(it2 != last2);
13781383 out = copy(it1, last1, out);
13791384 out = copy(it2, last2, out);
13801385 return it1 == last1 && it2 == last2 ? DIR_B
1381 : it1 == last1 ? DIR_F
1382 : it2 == last2 ? DIR_R
1383 : DIR_X;
1386 : it1 == last1 ? DIR_F : it2 == last2 ? DIR_R : DIR_X;
13841387 }
13851388
13861389 /** Find an equivalent region of the two specified paths, starting the
13881391 * @param[out] orientation the orientation of the alignment
13891392 * @return the consensus sequence
13901393 */
1391 static ContigPath align(const Lengths& lengths,
1392 const ContigPath& p1, const ContigPath& p2,
1393 ContigPath::const_iterator pivot1,
1394 ContigPath::const_iterator pivot2,
1395 dir_type& orientation)
1394 static ContigPath
1395 align(
1396 const Lengths& lengths,
1397 const ContigPath& p1,
1398 const ContigPath& p2,
1399 ContigPath::const_iterator pivot1,
1400 ContigPath::const_iterator pivot2,
1401 dir_type& orientation)
13961402 {
13971403 assert(*pivot1 == *pivot2);
1398 ContigPath::const_reverse_iterator
1399 rit1 = ContigPath::const_reverse_iterator(pivot1+1),
1400 rit2 = ContigPath::const_reverse_iterator(pivot2+1);
1404 ContigPath::const_reverse_iterator rit1 = ContigPath::const_reverse_iterator(pivot1 + 1),
1405 rit2 = ContigPath::const_reverse_iterator(pivot2 + 1);
14011406 ContigPath alignmentr(p1.rend() - rit1 + p2.rend() - rit2);
14021407 ContigPath::iterator rout = alignmentr.begin();
1403 dir_type alignedr = align(lengths,
1404 rit1, p1.rend(), rit2, p2.rend(), rout);
1408 dir_type alignedr = align(lengths, rit1, p1.rend(), rit2, p2.rend(), rout);
14051409 alignmentr.erase(rout, alignmentr.end());
14061410
14071411 ContigPath::const_iterator it1 = pivot1, it2 = pivot2;
14081412 ContigPath alignmentf(p1.end() - it1 + p2.end() - it2);
14091413 ContigPath::iterator fout = alignmentf.begin();
1410 dir_type alignedf = align(lengths,
1411 it1, p1.end(), it2, p2.end(), fout);
1414 dir_type alignedf = align(lengths, it1, p1.end(), it2, p2.end(), fout);
14121415 alignmentf.erase(fout, alignmentf.end());
14131416
14141417 ContigPath consensus;
14161419 // Found an alignment.
14171420 assert(!alignmentf.empty());
14181421 assert(!alignmentr.empty());
1419 consensus.reserve(alignmentr.size()-1 + alignmentf.size());
1420 consensus.assign(alignmentr.rbegin(), alignmentr.rend()-1);
1421 consensus.insert(consensus.end(),
1422 alignmentf.begin(), alignmentf.end());
1422 consensus.reserve(alignmentr.size() - 1 + alignmentf.size());
1423 consensus.assign(alignmentr.rbegin(), alignmentr.rend() - 1);
1424 consensus.insert(consensus.end(), alignmentf.begin(), alignmentf.end());
14231425
14241426 // Determine the orientation of the alignment.
14251427 unsigned dirs = alignedr << 2 | alignedf;
14511453 /** Return a pivot suitable for aligning the two paths if one exists,
14521454 * otherwise return false.
14531455 */
1454 static pair<ContigNode, bool> findPivot(
1455 const ContigPath& path1, const ContigPath& path2)
1456 {
1457 for (ContigPath::const_iterator it = path2.begin();
1458 it != path2.end(); ++it) {
1456 static pair<ContigNode, bool>
1457 findPivot(const ContigPath& path1, const ContigPath& path2)
1458 {
1459 for (ContigPath::const_iterator it = path2.begin(); it != path2.end(); ++it) {
14591460 if (it->ambiguous())
14601461 continue;
1461 if (count(path2.begin(), path2.end(), *it) == 1
1462 && count(path1.begin(), path1.end(), *it) == 1)
1462 if (count(path2.begin(), path2.end(), *it) == 1 &&
1463 count(path1.begin(), path1.end(), *it) == 1)
14631464 return make_pair(*it, true);
14641465 }
14651466 return make_pair(ContigNode(0), false);
14691470 * @param[out] orientation the orientation of the alignment
14701471 * @return the consensus sequence
14711472 */
1472 static ContigPath align(const Lengths& lengths,
1473 const ContigPath& path1, const ContigPath& path2,
1474 ContigNode pivot, dir_type& orientation)
1473 static ContigPath
1474 align(
1475 const Lengths& lengths,
1476 const ContigPath& path1,
1477 const ContigPath& path2,
1478 ContigNode pivot,
1479 dir_type& orientation)
14751480 {
14761481 if (&path1 == &path2) {
14771482 // Ignore the trivial alignment when aligning a path to
14811486 orientation = DIR_B;
14821487 return path1;
14831488 } else {
1484 ContigPath::const_iterator it
1485 = search(path1.begin(), path1.end(),
1486 path2.begin(), path2.end());
1489 ContigPath::const_iterator it =
1490 search(path1.begin(), path1.end(), path2.begin(), path2.end());
14871491 if (it != path1.end()) {
14881492 // path2 is subsumed in path1.
14891493 // Determine the orientation of the edge.
1490 orientation
1491 = it == path1.begin() ? DIR_R
1492 : it + path2.size() == path1.end() ? DIR_F
1493 : DIR_B;
1494 orientation =
1495 it == path1.begin() ? DIR_R : it + path2.size() == path1.end() ? DIR_F : DIR_B;
14941496 return path1;
14951497 }
14961498 }
14971499
14981500 // Find a suitable pivot.
1499 if (find(path1.begin(), path1.end(), pivot) == path1.end()
1500 || find(path2.begin(), path2.end(), pivot)
1501 == path2.end()) {
1501 if (find(path1.begin(), path1.end(), pivot) == path1.end() ||
1502 find(path2.begin(), path2.end(), pivot) == path2.end()) {
15021503 bool good;
15031504 tie(pivot, good) = findPivot(path1, path2);
15041505 if (!good)
15061507 }
15071508 assert(find(path1.begin(), path1.end(), pivot) != path1.end());
15081509
1509 ContigPath::const_iterator it2 = find(path2.begin(), path2.end(),
1510 pivot);
1510 ContigPath::const_iterator it2 = find(path2.begin(), path2.end(), pivot);
15111511 assert(it2 != path2.end());
15121512 if (&path1 != &path2) {
15131513 // The seed must be unique in path2, unless we're aligning a
15141514 // path to itself.
1515 assert(count(it2+1, path2.end(), pivot) == 0);
1515 assert(count(it2 + 1, path2.end(), pivot) == 0);
15161516 }
15171517
15181518 ContigPath consensus;
15191519 for (ContigPath::const_iterator it1 = find_if(
1520 path1.begin(), path1.end(),
1521 bind2nd(equal_to<ContigNode>(), pivot));
1522 it1 != path1.end();
1523 it1 = find_if(it1+1, path1.end(),
1524 bind2nd(equal_to<ContigNode>(), pivot))) {
1520 path1.begin(), path1.end(), [&pivot](const ContigNode& c) { return c == pivot; });
1521 it1 != path1.end();
1522 it1 =
1523 find_if(it1 + 1, path1.end(), [&pivot](const ContigNode& c) { return c == pivot; })) {
15251524 if (&*it1 == &*it2) {
15261525 // We are aligning a path to itself, and this is the
15271526 // trivial alignment, which we'll ignore.
15281527 continue;
15291528 }
1530 consensus = align(lengths,
1531 path1, path2, it1, it2, orientation);
1529 consensus = align(lengths, path1, path2, it1, it2, orientation);
15321530 if (!consensus.empty())
15331531 return consensus;
15341532 }
15381536 /** Find an equivalent region of the two specified paths.
15391537 * @return the consensus sequence
15401538 */
1541 static ContigPath align(const Lengths& lengths,
1542 const ContigPath& path1, const ContigPath& path2,
1543 ContigNode pivot)
1539 static ContigPath
1540 align(const Lengths& lengths, const ContigPath& path1, const ContigPath& path2, ContigNode pivot)
15441541 {
15451542 dir_type orientation;
15461543 return align(lengths, path1, path2, pivot, orientation);
278278 where
279279 help = putStr (usageInfo usage options) >> exitSuccess
280280 tryHelp = "Try 'abyss-samtobreak --help' for more information."
281 version = "abyss-samtobreak (ABySS) 2.2.3\n"
281 version = "abyss-samtobreak (ABySS) 2.2.4\n"
282282 usage = "Usage: samtobreak [OPTION]... [FILE]...\n\
283283 \Calculate contig and scaffold contiguity and correctness metrics.\n"
284284
2626 #define PROGRAM "ParseAligns"
2727
2828 static const char VERSION_MESSAGE[] =
29 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
30 "Written by Jared Simpson and Shaun Jackman.\n"
31 "\n"
32 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
29 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
30 "Written by Jared Simpson and Shaun Jackman.\n"
31 "\n"
32 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
3333
3434 static const char USAGE_MESSAGE[] =
35 "Usage: " PROGRAM " -k<kmer> [OPTION]... [FILE]...\n"
36 "Write pairs that map to the same contig to the file SAME.\n"
37 "Write pairs that map to different contigs to standard output.\n"
38 "Alignments may be read from FILE(s) or standard input.\n"
39 "\n"
40 " Options:\n"
41 "\n"
42 " -l, --min-align=N minimum alignment length\n"
43 " -d, --dist=DISTANCE write distance estimates to this file\n"
44 " -f, --frag=SAME write fragment sizes to this file\n"
45 " -h, --hist=FILE write the fragment size histogram to FILE\n"
46 " --sam alignments are in SAM format\n"
47 " --kaligner alignments are in KAligner format\n"
48 " -c, --cover=COVERAGE coverage cut-off for distance estimates\n"
49 " -v, --verbose display verbose output\n"
50 " --help display this help and exit\n"
51 " --version output version information and exit\n"
52 "\n"
53 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
35 "Usage: " PROGRAM " -k<kmer> [OPTION]... [FILE]...\n"
36 "Write pairs that map to the same contig to the file SAME.\n"
37 "Write pairs that map to different contigs to standard output.\n"
38 "Alignments may be read from FILE(s) or standard input.\n"
39 "\n"
40 " Options:\n"
41 "\n"
42 " -l, --min-align=N minimum alignment length\n"
43 " -d, --dist=DISTANCE write distance estimates to this file\n"
44 " -f, --frag=SAME write fragment sizes to this file\n"
45 " -h, --hist=FILE write the fragment size histogram to FILE\n"
46 " --sam alignments are in SAM format\n"
47 " --kaligner alignments are in KAligner format\n"
48 " -c, --cover=COVERAGE coverage cut-off for distance estimates\n"
49 " -v, --verbose display verbose output\n"
50 " --help display this help and exit\n"
51 " --version output version information and exit\n"
52 "\n"
53 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
5454
5555 namespace opt {
56 unsigned k; // used by DistanceEst
57 static unsigned c;
58 static int verbose;
59 static string distPath;
60 static string fragPath;
61 static string histPath;
62
63 /** Input alignment format. */
64 static int inputFormat;
65 enum { KALIGNER, SAM };
66
67 /** Output format */
68 int format = ADJ; // used by Estimate
56 unsigned k; // used by DistanceEst
57 static unsigned c;
58 static int verbose;
59 static string distPath;
60 static string fragPath;
61 static string histPath;
62
63 /** Input alignment format. */
64 static int inputFormat;
65 enum
66 {
67 KALIGNER,
68 SAM
69 };
70
71 /** Output format */
72 int format = ADJ; // used by Estimate
6973 }
7074
7175 static const char shortopts[] = "d:l:f:h:c:v";
7276
73 enum { OPT_HELP = 1, OPT_VERSION };
77 enum
78 {
79 OPT_HELP = 1,
80 OPT_VERSION
81 };
7482
7583 static const struct option longopts[] = {
76 { "dist", required_argument, NULL, 'd' },
84 { "dist", required_argument, NULL, 'd' },
7785 { "min-align", required_argument, NULL, 'l' },
78 { "frag", required_argument, NULL, 'f' },
79 { "hist", required_argument, NULL, 'h' },
80 { "kaligner",no_argument, &opt::inputFormat, opt::KALIGNER },
81 { "sam", no_argument, &opt::inputFormat, opt::SAM },
82 { "cover", required_argument, NULL, 'c' },
83 { "verbose", no_argument, NULL, 'v' },
84 { "help", no_argument, NULL, OPT_HELP },
85 { "version", no_argument, NULL, OPT_VERSION },
86 { "frag", required_argument, NULL, 'f' },
87 { "hist", required_argument, NULL, 'h' },
88 { "kaligner", no_argument, &opt::inputFormat, opt::KALIGNER },
89 { "sam", no_argument, &opt::inputFormat, opt::SAM },
90 { "cover", required_argument, NULL, 'c' },
91 { "verbose", no_argument, NULL, 'v' },
92 { "help", no_argument, NULL, OPT_HELP },
93 { "version", no_argument, NULL, OPT_VERSION },
8694 { NULL, 0, NULL, 0 }
8795 };
8896
89 static struct {
97 static struct
98 {
9099 size_t alignments;
91100 size_t bothUnaligned;
92101 size_t oneUnaligned;
108117 typedef unordered_map<string, EstimateRecord> EstimateMap;
109118 static EstimateMap estMap;
110119
111 static bool checkUniqueAlignments(const AlignmentVector& alignVec);
112 static string makePairID(string id);
120 static bool
121 checkUniqueAlignments(const AlignmentVector& alignVec);
122 static string
123 makePairID(string id);
113124
114125 /**
115126 * Return the size of the fragment demarcated by the specified
116127 * alignments.
117128 */
118 static int fragmentSize(const Alignment& a0, const Alignment& a1)
129 static int
130 fragmentSize(const Alignment& a0, const Alignment& a1)
119131 {
120132 assert(a0.contig == a1.contig);
121133 assert(a0.isRC != a1.isRC);
127139 typedef pair<ContigNode, DistanceEst> Estimate;
128140 typedef vector<Estimate> Estimates;
129141
130 static void addEstimate(EstimateMap& map, const Alignment& a,
131 Estimate& est, bool reverse)
132 {
133 //count up the number of estimates that agree
142 static void
143 addEstimate(EstimateMap& map, const Alignment& a, Estimate& est, bool reverse)
144 {
145 // count up the number of estimates that agree
134146 bool placed = false;
135147 bool a_isRC = a.isRC != reverse;
136148 EstimateMap::iterator estimatesIt = map.find(a.contig);
137149 if (estimatesIt != map.end()) {
138150 Estimates& estimates = estimatesIt->second.estimates[a_isRC];
139 for (Estimates::iterator estIt = estimates.begin();
140 estIt != estimates.end(); ++estIt) {
151 for (Estimates::iterator estIt = estimates.begin(); estIt != estimates.end(); ++estIt) {
141152 if (estIt->first.id() == est.first.id()) {
142153 estIt->second.numPairs++;
143154 estIt->second.distance += est.second.distance;
148159 }
149160 if (!placed)
150161 map[a.contig].estimates[a_isRC].push_back(est);
151
152 }
153
154 static void doReadIntegrity(const ReadAlignMap::value_type& a)
162 }
163
164 static void
165 doReadIntegrity(const ReadAlignMap::value_type& a)
155166 {
156167 AlignmentVector::const_iterator refAlignIter = a.second.begin();
157168 unsigned firstStart, lastEnd, largestSize;
163174 first = last = largest = *refAlignIter;
164175 ++refAlignIter;
165176
166 //for each alignment in the vector a.second
177 // for each alignment in the vector a.second
167178 for (; refAlignIter != a.second.end(); ++refAlignIter) {
168179 if ((unsigned)refAlignIter->read_start_pos < firstStart) {
169180 firstStart = refAlignIter->read_start_pos;
170181 first = *refAlignIter;
171182 }
172 if ((unsigned)(refAlignIter->read_start_pos +
173 refAlignIter->align_length) > lastEnd) {
174 lastEnd = refAlignIter->read_start_pos +
175 refAlignIter->align_length;
183 if ((unsigned)(refAlignIter->read_start_pos + refAlignIter->align_length) > lastEnd) {
184 lastEnd = refAlignIter->read_start_pos + refAlignIter->align_length;
176185 last = *refAlignIter;
177186 }
178187 if ((unsigned)refAlignIter->align_length > largestSize) {
183192
184193 if (largest.contig != last.contig) {
185194 Estimate est;
186 unsigned largest_end =
187 largest.read_start_pos + largest.align_length - opt::k;
195 unsigned largest_end = largest.read_start_pos + largest.align_length - opt::k;
188196 int distance = last.read_start_pos - largest_end;
189 est.first = find_vertex(
190 last.contig, largest.isRC != last.isRC,
191 g_contigNames);
197 est.first = find_vertex(last.contig, largest.isRC != last.isRC, g_contigNames);
192198 est.second.distance = distance - opt::k;
193199 est.second.numPairs = 1;
194200 est.second.stdDev = 0;
195201 addEstimate(estMap, largest, est, false);
196202 }
197203
198 if (largest.contig != first.contig &&
199 largest.contig != last.contig) {
204 if (largest.contig != first.contig && largest.contig != last.contig) {
200205 Estimate est;
201 unsigned first_end =
202 first.read_start_pos + first.align_length - opt::k;
206 unsigned first_end = first.read_start_pos + first.align_length - opt::k;
203207 int distance = last.read_start_pos - first_end;
204 est.first = find_vertex(
205 last.contig, first.isRC != last.isRC,
206 g_contigNames);
208 est.first = find_vertex(last.contig, first.isRC != last.isRC, g_contigNames);
207209 est.second.distance = distance - opt::k;
208210 est.second.numPairs = 1;
209211 est.second.stdDev = 0;
214216 largest.flipQuery();
215217 first.flipQuery();
216218 Estimate est;
217 unsigned largest_end =
218 largest.read_start_pos + largest.align_length - opt::k;
219 unsigned largest_end = largest.read_start_pos + largest.align_length - opt::k;
219220 int distance = first.read_start_pos - largest_end;
220 est.first = find_vertex(
221 first.contig, largest.isRC != first.isRC,
222 g_contigNames);
221 est.first = find_vertex(first.contig, largest.isRC != first.isRC, g_contigNames);
223222 est.second.distance = distance - opt::k;
224223 est.second.numPairs = 1;
225224 est.second.stdDev = 0;
258257 #endif
259258 }
260259
261 static void generateDistFile()
260 static void
261 generateDistFile()
262262 {
263263 ofstream distFile(opt::distPath.c_str());
264264 assert(distFile.is_open());
265 for (EstimateMap::iterator mapIt = estMap.begin();
266 mapIt != estMap.end(); ++mapIt) {
267 //Skip empty iterators
265 for (EstimateMap::iterator mapIt = estMap.begin(); mapIt != estMap.end(); ++mapIt) {
266 // Skip empty iterators
268267 assert(!mapIt->second.estimates[0].empty() || !mapIt->second.estimates[1].empty());
269268 distFile << mapIt->first;
270269 for (int refIsRC = 0; refIsRC <= 1; refIsRC++) {
271270 if (refIsRC)
272271 distFile << " ;";
273272
274 for (Estimates::iterator vecIt
275 = mapIt->second.estimates[refIsRC].begin();
276 vecIt != mapIt->second.estimates[refIsRC].end(); ++vecIt) {
277 vecIt->second.distance
278 = (int)round((double)vecIt->second.distance /
279 (double)vecIt->second.numPairs);
280 if (vecIt->second.numPairs >= opt::c
281 && vecIt->second.numPairs != 0
282 /*&& vecIt->distance > 1 - opt::k*/)
283 distFile
284 << ' ' << get(g_contigNames, vecIt->first)
285 << ',' << vecIt->second;
273 for (Estimates::iterator vecIt = mapIt->second.estimates[refIsRC].begin();
274 vecIt != mapIt->second.estimates[refIsRC].end();
275 ++vecIt) {
276 vecIt->second.distance =
277 (int)round((double)vecIt->second.distance / (double)vecIt->second.numPairs);
278 if (vecIt->second.numPairs >= opt::c && vecIt->second.numPairs != 0
279 /*&& vecIt->distance > 1 - opt::k*/)
280 distFile << ' ' << get(g_contigNames, vecIt->first) << ',' << vecIt->second;
286281 }
287282 }
288283 distFile << '\n';
291286 distFile.close();
292287 }
293288
294 static bool isSingleEnd(const string& id);
295 static bool needsFlipping(const string& id);
289 static bool
290 isSingleEnd(const string& id);
291 static bool
292 needsFlipping(const string& id);
296293
297294 /**
298295 * Return an alignment flipped as necessary to produce an alignment
301298 * alignment, so that the alignment is forward-reverse, which is
302299 * required by DistanceEst.
303300 */
304 static const Alignment flipAlignment(const Alignment& a,
305 const string& id)
301 static const Alignment
302 flipAlignment(const Alignment& a, const string& id)
306303 {
307304 return needsFlipping(id) ? a.flipQuery() : a;
308305 }
309306
310 static void handleAlignmentPair(const ReadAlignMap::value_type& curr,
311 const ReadAlignMap::value_type& pair)
307 static void
308 handleAlignmentPair(const ReadAlignMap::value_type& curr, const ReadAlignMap::value_type& pair)
312309 {
313310 const string& currID = curr.first;
314311 const string& pairID = pair.first;
322319 stats.bothUnaligned++;
323320 } else if (curr.second.empty() || pair.second.empty()) {
324321 stats.oneUnaligned++;
325 } else if (!checkUniqueAlignments(curr.second)
326 || !checkUniqueAlignments(pair.second)) {
322 } else if (!checkUniqueAlignments(curr.second) || !checkUniqueAlignments(pair.second)) {
327323 stats.numMulti++;
328 } else if (curr.second.size() > MAX_SPAN
329 && pair.second.size() > MAX_SPAN) {
324 } else if (curr.second.size() > MAX_SPAN && pair.second.size() > MAX_SPAN) {
330325 stats.numSplit++;
331326 } else {
332327 // Iterate over the vectors, outputting the alignments
333328 bool counted = false;
334 for (AlignmentVector::const_iterator refAlignIter
335 = curr.second.begin();
336 refAlignIter != curr.second.end(); ++refAlignIter) {
337 for (AlignmentVector::const_iterator pairAlignIter
338 = pair.second.begin();
339 pairAlignIter != pair.second.end();
340 ++pairAlignIter) {
341 const Alignment& a0 = flipAlignment(*refAlignIter,
342 currID);
343 const Alignment& a1 = flipAlignment(*pairAlignIter,
344 pairID);
329 for (AlignmentVector::const_iterator refAlignIter = curr.second.begin();
330 refAlignIter != curr.second.end();
331 ++refAlignIter) {
332 for (AlignmentVector::const_iterator pairAlignIter = pair.second.begin();
333 pairAlignIter != pair.second.end();
334 ++pairAlignIter) {
335 const Alignment& a0 = flipAlignment(*refAlignIter, currID);
336 const Alignment& a1 = flipAlignment(*pairAlignIter, pairID);
345337
346338 bool sameTarget = a0.contig == a1.contig;
347 if (sameTarget
348 && curr.second.size() == 1
349 && pair.second.size() == 1) {
339 if (sameTarget && curr.second.size() == 1 && pair.second.size() == 1) {
350340 // Same target and the only alignment.
351341 if (a0.isRC != a1.isRC) {
352342 // Correctly oriented. Add this alignment to
362352 counted = true;
363353 }
364354
365 bool outputSameTarget = opt::fragPath.empty()
366 && opt::histPath.empty();
355 bool outputSameTarget = opt::fragPath.empty() && opt::histPath.empty();
367356 if (!sameTarget || outputSameTarget) {
368 cout << SAMRecord(a0, a1) << '\n'
369 << SAMRecord(a1, a0) << '\n';
357 cout << SAMRecord(a0, a1) << '\n' << SAMRecord(a1, a0) << '\n';
370358 assert(cout.good());
371359 }
372360 }
376364 }
377365 }
378366
379 static void printProgress(const ReadAlignMap& map)
367 static void
368 printProgress(const ReadAlignMap& map)
380369 {
381370 if (opt::verbose == 0)
382371 return;
389378 if (stats.alignments % 1000000 == 0 || buckets != prevBuckets) {
390379 prevBuckets = buckets;
391380 size_t size = map.size();
392 cerr << "Read " << stats.alignments << " alignments. "
393 "Hash load: " << size << " / " << buckets
394 << " = " << (float)size / buckets
395 << " using " << toSI(getMemoryUsage()) << "B." << endl;
396 }
397 }
398
399 static void handleAlignment(
400 const ReadAlignMap::value_type& alignments,
401 ReadAlignMap& out)
381 cerr << "Read " << stats.alignments
382 << " alignments. "
383 "Hash load: "
384 << size << " / " << buckets << " = " << (float)size / buckets << " using "
385 << toSI(getMemoryUsage()) << "B." << endl;
386 }
387 }
388
389 static void
390 handleAlignment(const ReadAlignMap::value_type& alignments, ReadAlignMap& out)
402391 {
403392 if (!isSingleEnd(alignments.first)) {
404393 string pairID = makePairID(alignments.first);
407396 handleAlignmentPair(*pairIter, alignments);
408397 out.erase(pairIter);
409398 } else if (!out.insert(alignments).second) {
410 cerr << "error: duplicate read ID `" << alignments.first
411 << "'\n";
399 cerr << "error: duplicate read ID `" << alignments.first << "'\n";
412400 exit(EXIT_FAILURE);
413401 }
414402 }
420408 printProgress(out);
421409 }
422410
423 static void readAlignment(const string& line, ReadAlignMap& out)
411 static void
412 readAlignment(const string& line, ReadAlignMap& out)
424413 {
425414 istringstream s(line);
426415 pair<string, AlignmentVector> v;
427416 switch (opt::inputFormat) {
428 case opt::SAM:
429 {
417 case opt::SAM: {
430418 SAMRecord sam;
431419 s >> sam;
432420 assert(s);
438426 if (!sam.isUnmapped())
439427 v.second.push_back(sam);
440428 break;
441 }
442 case opt::KALIGNER:
443 {
429 }
430 case opt::KALIGNER: {
444431 s >> v.first;
445432 assert(s);
446433 v.second.reserve(count(line.begin(), line.end(), '\t'));
447 v.second.assign(
448 istream_iterator<Alignment>(s),
449 istream_iterator<Alignment>());
434 v.second.assign(istream_iterator<Alignment>(s), istream_iterator<Alignment>());
450435 assert(s.eof());
451436 break;
452 }
437 }
453438 }
454439 handleAlignment(v, out);
455440 }
456441
457 static void readAlignments(istream& in, ReadAlignMap* pout)
442 static void
443 readAlignments(istream& in, ReadAlignMap* pout)
458444 {
459445 for (string line; getline(in, line);)
460446 if (line.empty() || line[0] == '@')
464450 assert(in.eof());
465451 }
466452
467 static void readAlignmentsFile(string path, ReadAlignMap* pout)
453 static void
454 readAlignmentsFile(string path, ReadAlignMap* pout)
468455 {
469456 if (opt::verbose > 0)
470457 cerr << "Reading `" << path << "'..." << endl;
475462 }
476463
477464 /** Return the specified number formatted as a percent. */
478 static string percent(size_t x, size_t n)
465 static string
466 percent(size_t x, size_t n)
479467 {
480468 ostringstream ss;
481469 ss << setw((int)log10(n) + 1) << x;
482470 if (x > 0)
483 ss << " " << setprecision(3) << (float)100*x/n << '%';
471 ss << " " << setprecision(3) << (float)100 * x / n << '%';
484472 return ss.str();
485473 }
486474
487 int main(int argc, char* const* argv)
475 int
476 main(int argc, char* const* argv)
488477 {
489478 bool die = false;
490 for (int c; (c = getopt_long(argc, argv,
491 shortopts, longopts, NULL)) != -1;) {
479 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
492480 istringstream arg(optarg != NULL ? optarg : "");
493481 switch (c) {
494 case '?': die = true; break;
495 case 'l': arg >> opt::k; break;
496 case 'c': arg >> opt::c; break;
497 case 'd': arg >> opt::distPath; break;
498 case 'f': arg >> opt::fragPath; break;
499 case 'h': arg >> opt::histPath; break;
500 case 'v': opt::verbose++; break;
501 case OPT_HELP:
502 cout << USAGE_MESSAGE;
503 exit(EXIT_SUCCESS);
504 case OPT_VERSION:
505 cout << VERSION_MESSAGE;
506 exit(EXIT_SUCCESS);
482 case '?':
483 die = true;
484 break;
485 case 'l':
486 arg >> opt::k;
487 break;
488 case 'c':
489 arg >> opt::c;
490 break;
491 case 'd':
492 arg >> opt::distPath;
493 break;
494 case 'f':
495 arg >> opt::fragPath;
496 break;
497 case 'h':
498 arg >> opt::histPath;
499 break;
500 case 'v':
501 opt::verbose++;
502 break;
503 case OPT_HELP:
504 cout << USAGE_MESSAGE;
505 exit(EXIT_SUCCESS);
506 case OPT_VERSION:
507 cout << VERSION_MESSAGE;
508 exit(EXIT_SUCCESS);
507509 }
508510 if (optarg != NULL && !arg.eof()) {
509 cerr << PROGRAM ": invalid option: `-"
510 << (char)c << optarg << "'\n";
511 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
511512 exit(EXIT_FAILURE);
512513 }
513514 }
514515
515516 if (opt::k <= 0 && opt::inputFormat == opt::KALIGNER) {
516 cerr << PROGRAM ": " << "missing -k,--kmer option\n";
517 cerr << PROGRAM ": "
518 << "missing -k,--kmer option\n";
517519 die = true;
518520 }
519521
520522 if (die) {
521 cerr << "Try `" << PROGRAM
522 << " --help' for more information.\n";
523 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
523524 exit(EXIT_FAILURE);
524525 }
525526
530531
531532 ReadAlignMap alignTable(1);
532533 if (optind < argc) {
533 for_each(argv + optind, argv + argc,
534 bind2nd(ptr_fun(readAlignmentsFile), &alignTable));
534 for_each(argv + optind, argv + argc, [&alignTable](const std::string& s) {
535 readAlignmentsFile(s, &alignTable);
536 });
535537 } else {
536538 if (opt::verbose > 0)
537539 cerr << "Reading from standard input..." << endl;
542544
543545 unsigned numRF = histogram.count(INT_MIN, 0);
544546 unsigned numFR = histogram.count(1, INT_MAX);
545 size_t sum = alignTable.size()
546 + stats.bothUnaligned + stats.oneUnaligned
547 + numFR + numRF + stats.numFF
548 + stats.numDifferent + stats.numMulti + stats.numSplit;
549 cerr <<
550 "Mateless " << percent(alignTable.size(), sum) << "\n"
551 "Unaligned " << percent(stats.bothUnaligned, sum) << "\n"
552 "Singleton " << percent(stats.oneUnaligned, sum) << "\n"
553 "FR " << percent(numFR, sum) << "\n"
554 "RF " << percent(numRF, sum) << "\n"
555 "FF " << percent(stats.numFF, sum) << "\n"
556 "Different " << percent(stats.numDifferent, sum) << "\n"
557 "Multimap " << percent(stats.numMulti, sum) << "\n"
558 "Split " << percent(stats.numSplit, sum) << "\n"
559 "Total " << sum << endl;
547 size_t sum = alignTable.size() + stats.bothUnaligned + stats.oneUnaligned + numFR + numRF +
548 stats.numFF + stats.numDifferent + stats.numMulti + stats.numSplit;
549 cerr << "Mateless " << percent(alignTable.size(), sum)
550 << "\n"
551 "Unaligned "
552 << percent(stats.bothUnaligned, sum)
553 << "\n"
554 "Singleton "
555 << percent(stats.oneUnaligned, sum)
556 << "\n"
557 "FR "
558 << percent(numFR, sum)
559 << "\n"
560 "RF "
561 << percent(numRF, sum)
562 << "\n"
563 "FF "
564 << percent(stats.numFF, sum)
565 << "\n"
566 "Different "
567 << percent(stats.numDifferent, sum)
568 << "\n"
569 "Multimap "
570 << percent(stats.numMulti, sum)
571 << "\n"
572 "Split "
573 << percent(stats.numSplit, sum)
574 << "\n"
575 "Total "
576 << sum << endl;
560577
561578 if (!opt::distPath.empty())
562579 generateDistFile();
579596 histogram.removeOutliers();
580597 Histogram h = histogram.trimFraction(0.0001);
581598 if (opt::verbose > 0)
582 cerr << "Stats mean: " << setprecision(4) << h.mean() << " "
583 "median: " << setprecision(4) << h.median() << " "
584 "sd: " << setprecision(4) << h.sd() << " "
585 "n: " << h.size() << " "
586 "min: " << h.minimum() << " max: " << h.maximum() << '\n'
587 << h.barplot() << endl;
599 cerr << "Stats mean: " << setprecision(4) << h.mean()
600 << " "
601 "median: "
602 << setprecision(4) << h.median()
603 << " "
604 "sd: "
605 << setprecision(4) << h.sd()
606 << " "
607 "n: "
608 << h.size()
609 << " "
610 "min: "
611 << h.minimum() << " max: " << h.maximum() << '\n'
612 << h.barplot() << endl;
588613
589614 if (stats.numFF > numFR && stats.numFF > numRF) {
590615 cerr << "error: The mate pairs of this library are oriented "
591 "forward-forward (FF), which is not supported by ABySS."
592 << endl;
616 "forward-forward (FF), which is not supported by ABySS."
617 << endl;
593618 exit(EXIT_FAILURE);
594619 }
595620
598623
599624 /** Return whether any k-mer in the query is aligned more than once.
600625 */
601 static bool checkUniqueAlignments(const AlignmentVector& alignVec)
626 static bool
627 checkUniqueAlignments(const AlignmentVector& alignVec)
602628 {
603629 assert(!alignVec.empty());
604630 if (alignVec.size() == 1)
607633 unsigned nKmer = alignVec.front().read_length - opt::k + 1;
608634 vector<unsigned> coverage(nKmer);
609635
610 for (AlignmentVector::const_iterator iter = alignVec.begin();
611 iter != alignVec.end(); ++iter) {
636 for (AlignmentVector::const_iterator iter = alignVec.begin(); iter != alignVec.end(); ++iter) {
612637 assert((unsigned)iter->align_length >= opt::k);
613 unsigned end = iter->read_start_pos
614 + iter->align_length - opt::k + 1;
638 unsigned end = iter->read_start_pos + iter->align_length - opt::k + 1;
615639 assert(end <= nKmer);
616640 for (unsigned i = iter->read_start_pos; i < end; i++)
617641 coverage[i]++;
623647 return true;
624648 }
625649
626 static bool replaceSuffix(string& s,
627 const string& suffix0, const string& suffix1)
650 static bool
651 replaceSuffix(string& s, const string& suffix0, const string& suffix1)
628652 {
629653 if (endsWith(s, suffix0)) {
630 s.replace(s.length() - suffix0.length(), string::npos,
631 suffix1);
654 s.replace(s.length() - suffix0.length(), string::npos, suffix1);
632655 return true;
633656 } else if (endsWith(s, suffix1)) {
634 s.replace(s.length() - suffix1.length(), string::npos,
635 suffix0);
657 s.replace(s.length() - suffix1.length(), string::npos, suffix0);
636658 return true;
637659 } else
638660 return false;
639661 }
640662
641663 /** Return true if the specified read ID is of a single-end read. */
642 static bool isSingleEnd(const string& id)
664 static bool
665 isSingleEnd(const string& id)
643666 {
644667 unsigned l = id.length();
645 return endsWith(id, ".fn")
646 || (l > 6 && id.substr(l-6, 5) == ".part");
668 return endsWith(id, ".fn") || (l > 6 && id.substr(l - 6, 5) == ".part");
647669 }
648670
649671 /** Return the mate ID of the specified read ID. */
650 static string makePairID(string id)
672 static string
673 makePairID(string id)
651674 {
652675 if (equal(id.begin(), id.begin() + 3, "SRR"))
653676 return id;
655678 assert(!id.empty());
656679 char& c = id[id.length() - 1];
657680 switch (c) {
658 case '1': c = '2'; return id;
659 case '2': c = '1'; return id;
660 case 'A': c = 'B'; return id;
661 case 'B': c = 'A'; return id;
662 case 'F': c = 'R'; return id;
663 case 'R': c = 'F'; return id;
664 case 'f': c = 'r'; return id;
665 case 'r': c = 'f'; return id;
666 }
667
668 if (replaceSuffix(id, "forward", "reverse")
669 || replaceSuffix(id, "F3", "R3"))
670 return id;
671
672 cerr << "error: read ID `" << id << "' must end in one of\n"
673 "\t1 and 2 or A and B or F and R or"
674 " F3 and R3 or forward and reverse\n";
681 case '1':
682 c = '2';
683 return id;
684 case '2':
685 c = '1';
686 return id;
687 case 'A':
688 c = 'B';
689 return id;
690 case 'B':
691 c = 'A';
692 return id;
693 case 'F':
694 c = 'R';
695 return id;
696 case 'R':
697 c = 'F';
698 return id;
699 case 'f':
700 c = 'r';
701 return id;
702 case 'r':
703 c = 'f';
704 return id;
705 }
706
707 if (replaceSuffix(id, "forward", "reverse") || replaceSuffix(id, "F3", "R3"))
708 return id;
709
710 cerr << "error: read ID `" << id
711 << "' must end in one of\n"
712 "\t1 and 2 or A and B or F and R or"
713 " F3 and R3 or forward and reverse\n";
675714 exit(EXIT_FAILURE);
676715 }
677716
678 static bool needsFlipping(const string& id)
717 static bool
718 needsFlipping(const string& id)
679719 {
680720 return endsWith(id, "F3");
681721 }
0 #include "ContigID.h"
1 #include "DataBase/DB.h"
2 #include "DataBase/Options.h"
03 #include "Histogram.h"
14 #include "IOUtil.h"
25 #include "MemoryUtil.h"
47 #include "StringUtil.h"
58 #include "Uncompress.h"
69 #include "UnorderedMap.h"
7 #include "ContigID.h"
810 #include <algorithm>
11 #include <boost/unordered_map.hpp>
912 #include <climits>
1013 #include <cstdlib>
1114 #include <fstream>
1316 #include <getopt.h>
1417 #include <iomanip>
1518 #include <iostream>
16 #include <boost/unordered_map.hpp>
17 #include "DataBase/Options.h"
18 #include "DataBase/DB.h"
1919
2020 using namespace std;
2121
2424 DB db;
2525
2626 static const char VERSION_MESSAGE[] =
27 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
28 "Written by Shaun Jackman.\n"
29 "\n"
30 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
27 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
28 "Written by Shaun Jackman.\n"
29 "\n"
30 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
3131
3232 static const char USAGE_MESSAGE[] =
33 "Usage: " PROGRAM " [OPTION]... [FILE]...\n"
34 "Write read pairs that map to the same contig to the file SAME.\n"
35 "Write read pairs that map to different contigs to stdout.\n"
36 "Alignments may be in FILE(s) or standard input.\n"
37 "\n"
38 " Options:\n"
39 "\n"
40 " --no-qname set the qname to * [default]\n"
41 " --qname do not alter the qname\n"
42 " --all print all alignments\n"
43 " --diff print alignments that align to different\n"
44 " contigs [default]\n"
45 " -l, --min-align=N the minimal alignment size [1]\n"
46 " -s, --same=SAME write properly-paired reads to this file\n"
47 " -h, --hist=FILE write the fragment size histogram to FILE\n"
48 " -c, --cov=FILE write the physical coverage to FILE\n"
49 " -v, --verbose display verbose output\n"
50 " --help display this help and exit\n"
51 " --version output version information and exit\n"
52 " --db=FILE specify path of database repository in FILE\n"
53 " --library=NAME specify library NAME for sqlite\n"
54 " --strain=NAME specify strain NAME for sqlite\n"
55 " --species=NAME specify species NAME for sqlite\n"
56 "\n"
57 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
33 "Usage: " PROGRAM " [OPTION]... [FILE]...\n"
34 "Write read pairs that map to the same contig to the file SAME.\n"
35 "Write read pairs that map to different contigs to stdout.\n"
36 "Alignments may be in FILE(s) or standard input.\n"
37 "\n"
38 " Options:\n"
39 "\n"
40 " --no-qname set the qname to * [default]\n"
41 " --qname do not alter the qname\n"
42 " --all print all alignments\n"
43 " --diff print alignments that align to different\n"
44 " contigs [default]\n"
45 " -l, --min-align=N the minimal alignment size [1]\n"
46 " -s, --same=SAME write properly-paired reads to this file\n"
47 " -h, --hist=FILE write the fragment size histogram to FILE\n"
48 " -c, --cov=FILE write the physical coverage to FILE\n"
49 " -v, --verbose display verbose output\n"
50 " --help display this help and exit\n"
51 " --version output version information and exit\n"
52 " --db=FILE specify path of database repository in FILE\n"
53 " --library=NAME specify library NAME for sqlite\n"
54 " --strain=NAME specify strain NAME for sqlite\n"
55 " --species=NAME specify species NAME for sqlite\n"
56 "\n"
57 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
5858
5959 namespace opt {
60 string db;
61 dbVars metaVars;
62 static string fragPath;
63 static string histPath;
64 static string covPath;
65 static int qname;
66 static int verbose;
67 static int print_all;
60 string db;
61 dbVars metaVars;
62 static string fragPath;
63 static string histPath;
64 static string covPath;
65 static int qname;
66 static int verbose;
67 static int print_all;
6868 }
6969
7070 // for sqlite params
7373
7474 static const char shortopts[] = "h:c:l:s:v";
7575
76 enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
77
78 static const struct option longopts[] = {
79 { "qname", no_argument, &opt::qname, 1 },
80 { "no-qname", no_argument, &opt::qname, 0 },
81 { "all", no_argument, &opt::print_all, 1 },
82 { "diff", no_argument, &opt::print_all, 0 },
83 { "min-align", required_argument, NULL, 'l' },
84 { "hist", required_argument, NULL, 'h' },
85 { "cov", required_argument, NULL, 'c' },
86 { "same", required_argument, NULL, 's' },
87 { "verbose", no_argument, NULL, 'v' },
88 { "help", no_argument, NULL, OPT_HELP },
89 { "version", no_argument, NULL, OPT_VERSION },
90 { "db", required_argument, NULL, OPT_DB },
91 { "library", required_argument, NULL, OPT_LIBRARY },
92 { "strain", required_argument, NULL, OPT_STRAIN },
93 { "species", required_argument, NULL, OPT_SPECIES },
94 { NULL, 0, NULL, 0 }
76 enum
77 {
78 OPT_HELP = 1,
79 OPT_VERSION,
80 OPT_DB,
81 OPT_LIBRARY,
82 OPT_STRAIN,
83 OPT_SPECIES
9584 };
9685
97 static struct {
86 static const struct option longopts[] = { { "qname", no_argument, &opt::qname, 1 },
87 { "no-qname", no_argument, &opt::qname, 0 },
88 { "all", no_argument, &opt::print_all, 1 },
89 { "diff", no_argument, &opt::print_all, 0 },
90 { "min-align", required_argument, NULL, 'l' },
91 { "hist", required_argument, NULL, 'h' },
92 { "cov", required_argument, NULL, 'c' },
93 { "same", required_argument, NULL, 's' },
94 { "verbose", no_argument, NULL, 'v' },
95 { "help", no_argument, NULL, OPT_HELP },
96 { "version", no_argument, NULL, OPT_VERSION },
97 { "db", required_argument, NULL, OPT_DB },
98 { "library", required_argument, NULL, OPT_LIBRARY },
99 { "strain", required_argument, NULL, OPT_STRAIN },
100 { "species", required_argument, NULL, OPT_SPECIES },
101 { NULL, 0, NULL, 0 } };
102
103 static struct
104 {
98105 size_t alignments;
99106 size_t bothUnaligned;
100107 size_t oneUnaligned;
105112 static ofstream g_fragFile;
106113 static Histogram g_histogram;
107114 static ofstream g_covFile;
108 static vector< vector<int> > g_contigCov;
109
110 static void incrementRange(SAMRecord& a)
115 static vector<vector<int>> g_contigCov;
116
117 static void
118 incrementRange(SAMRecord& a)
111119 {
112120 unsigned inx = get(g_contigNames, a.rname);
113121 g_contigCov[inx][a.pos]++;
118126 g_contigCov[inx][end]--;
119127 }
120128
121 static void handlePair(SAMRecord& a0, SAMRecord& a1)
122 {
123 if ((a0.isRead1() && a1.isRead1())
124 || (a0.isRead2() && a1.isRead2())) {
125 cerr << "error: duplicate read ID `" << a0.qname
126 << (a0.isRead1() ? "/1" : "")
127 << (a0.isRead2() ? "/2" : "")
128 << "'\n";
129 static void
130 handlePair(SAMRecord& a0, SAMRecord& a1)
131 {
132 if ((a0.isRead1() && a1.isRead1()) || (a0.isRead2() && a1.isRead2())) {
133 cerr << "error: duplicate read ID `" << a0.qname << (a0.isRead1() ? "/1" : "")
134 << (a0.isRead2() ? "/2" : "") << "'\n";
129135 exit(EXIT_FAILURE);
130136 }
131137
174180 typedef boost::unordered_map<string, SAMAlignment> Alignments;
175181 #endif
176182
177 static void printProgress(const Alignments& map)
183 static void
184 printProgress(const Alignments& map)
178185 {
179186 if (opt::verbose == 0)
180187 return;
188195 prevBuckets = buckets;
189196 size_t size = map.size();
190197 cerr << "Read " << stats.alignments << " alignments. "
191 << "Hash load: " << size << " / " << buckets
192 << " = " << (float)size / buckets
193 << " using " << toSI(getMemoryUsage()) << "B." << endl;
194 }
195 }
196
197 static void handleAlignment(SAMRecord& sam, Alignments& map)
198 {
199 pair<Alignments::iterator, bool> it = map.insert(
200 make_pair(sam.qname, sam));
198 << "Hash load: " << size << " / " << buckets << " = " << (float)size / buckets
199 << " using " << toSI(getMemoryUsage()) << "B." << endl;
200 }
201 }
202
203 static void
204 handleAlignment(SAMRecord& sam, Alignments& map)
205 {
206 pair<Alignments::iterator, bool> it = map.insert(make_pair(sam.qname, sam));
201207 if (!it.second) {
202208 #if SAM_SEQ_QUAL
203209 SAMRecord& a0 = it.first->second;
221227 printProgress(map);
222228 }
223229
224 static void assert_eof(istream& in)
230 static void
231 assert_eof(istream& in)
225232 {
226233 if (in.eof())
227234 return;
233240 }
234241
235242 /** Print physical coverage in wiggle format. */
236 static void printCov(string file)
243 static void
244 printCov(string file)
237245 {
238246 ofstream out(file.c_str());
239247 for (unsigned i = 0; i < g_contigCov.size(); i++) {
240248 out << "variableStep\tchrom=" << get(g_contigNames, i) << '\n';
241249 int prev = g_contigCov[i][0];
242 if (prev != 0) out << "0\t" << prev << '\n';
250 if (prev != 0)
251 out << "0\t" << prev << '\n';
243252 for (unsigned j = 1; j < g_contigCov[i].size(); j++) {
244253 prev += g_contigCov[i][j];
245 if (prev != 0) out << j << "\t" << prev << '\n';
246 }
247 }
248 }
249
250 void parseTag(string line) {
254 if (prev != 0)
255 out << j << "\t" << prev << '\n';
256 }
257 }
258 }
259
260 void
261 parseTag(string line)
262 {
251263 stringstream ss(line);
252264 string tag;
253265 char type[2];
268280 assert(length > 0);
269281 assert(id.size() > 0);
270282 put(g_contigNames, g_contigCov.size(), id);
271 g_contigCov.push_back(vector<int>(length));
272 }
273
274 static void readAlignments(istream& in, Alignments* pMap)
283 g_contigCov.push_back(vector<int>(length));
284 }
285
286 static void
287 readAlignments(istream& in, Alignments* pMap)
275288 {
276289 for (SAMRecord sam; in >> ws;) {
277290 if (in.peek() == '@') {
293306 assert_eof(in);
294307 }
295308
296 static void readAlignmentsFile(string path, Alignments* pMap)
309 static void
310 readAlignmentsFile(string path, Alignments* pMap)
297311 {
298312 if (opt::verbose > 0)
299313 cerr << "Reading `" << path << "'..." << endl;
304318 }
305319
306320 /** Return the specified number formatted as a percent. */
307 static string percent(size_t x, size_t n)
321 static string
322 percent(size_t x, size_t n)
308323 {
309324 ostringstream ss;
310325 ss << setw((int)log10(n) + 1) << x;
311326 if (x > 0)
312 ss << " " << setprecision(3) << (float)100*x/n << '%';
327 ss << " " << setprecision(3) << (float)100 * x / n << '%';
313328 return ss.str();
314329 }
315330
316331 /** Print statistics of the specified histogram. h not passed by
317 * reference because we want to make a copy
332 * reference because we want to make a copy
318333 **/
319 static void printHistogramStats(Histogram h)
334 static void
335 printHistogramStats(Histogram h)
320336 {
321337 unsigned n_orig = h.size();
322338 h.eraseNegative();
323339 h.removeNoise();
324340 h.removeOutliers();
325341 h = h.trimFraction(0.0001);
326 cerr << "Stats mean: " << setprecision(4) << h.mean() << " "
327 "median: " << setprecision(4) << h.median() << " "
328 "sd: " << setprecision(4) << h.sd() << " "
329 "n: " << h.size() << " "
330 "min: " << h.minimum() << " "
331 "max: " << h.maximum() << " "
332 "ignored: " << n_orig - h.size() << '\n'
333 << h.barplot() << endl;
342 cerr << "Stats mean: " << setprecision(4) << h.mean()
343 << " "
344 "median: "
345 << setprecision(4) << h.median()
346 << " "
347 "sd: "
348 << setprecision(4) << h.sd()
349 << " "
350 "n: "
351 << h.size()
352 << " "
353 "min: "
354 << h.minimum()
355 << " "
356 "max: "
357 << h.maximum()
358 << " "
359 "ignored: "
360 << n_orig - h.size() << '\n'
361 << h.barplot() << endl;
334362 if (!opt::db.empty()) {
335 vals = make_vector<int>()
336 << (int)round(h.mean())
337 << h.median()
338 << (int)round(h.sd())
339 << h.size()
340 << h.minimum()
341 << h.maximum()
342 << n_orig-h.size();
343
344 keys = make_vector<string>()
345 << "mean"
346 << "median"
347 << "sd"
348 << "n"
349 << "min"
350 << "max"
351 << "ignored";
352
353 for (unsigned i=0; i<vals.size(); i++)
363 vals = make_vector<int>() << (int)round(h.mean()) << h.median() << (int)round(h.sd())
364 << h.size() << h.minimum() << h.maximum() << n_orig - h.size();
365
366 keys = make_vector<string>() << "mean"
367 << "median"
368 << "sd"
369 << "n"
370 << "min"
371 << "max"
372 << "ignored";
373
374 for (unsigned i = 0; i < vals.size(); i++)
354375 addToDb(db, keys[i], vals[i]);
355376 }
356377 }
357378
358 int main(int argc, char* const* argv)
379 int
380 main(int argc, char* const* argv)
359381 {
360382 opt::metaVars.resize(3);
361383
362384 bool die = false;
363 for (int c; (c = getopt_long(argc, argv,
364 shortopts, longopts, NULL)) != -1;) {
385 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
365386 istringstream arg(optarg != NULL ? optarg : "");
366387 switch (c) {
367 case '?': die = true; break;
368 case 'l':
369 arg >> opt::minAlign;
370 break;
371 case 's': arg >> opt::fragPath; break;
372 case 'h': arg >> opt::histPath; break;
373 case 'c': arg >> opt::covPath; break;
374 case 'v': opt::verbose++; break;
375 case OPT_HELP:
376 cout << USAGE_MESSAGE;
377 exit(EXIT_SUCCESS);
378 case OPT_VERSION:
379 cout << VERSION_MESSAGE;
380 exit(EXIT_SUCCESS);
381 case OPT_DB:
382 arg >> opt::db;
383 break;
384 case OPT_LIBRARY:
385 arg >> opt::metaVars[0];
386 break;
387 case OPT_STRAIN:
388 arg >> opt::metaVars[1];
389 break;
390 case OPT_SPECIES:
391 arg >> opt::metaVars[2];
392 break;
388 case '?':
389 die = true;
390 break;
391 case 'l':
392 arg >> opt::minAlign;
393 break;
394 case 's':
395 arg >> opt::fragPath;
396 break;
397 case 'h':
398 arg >> opt::histPath;
399 break;
400 case 'c':
401 arg >> opt::covPath;
402 break;
403 case 'v':
404 opt::verbose++;
405 break;
406 case OPT_HELP:
407 cout << USAGE_MESSAGE;
408 exit(EXIT_SUCCESS);
409 case OPT_VERSION:
410 cout << VERSION_MESSAGE;
411 exit(EXIT_SUCCESS);
412 case OPT_DB:
413 arg >> opt::db;
414 break;
415 case OPT_LIBRARY:
416 arg >> opt::metaVars[0];
417 break;
418 case OPT_STRAIN:
419 arg >> opt::metaVars[1];
420 break;
421 case OPT_SPECIES:
422 arg >> opt::metaVars[2];
423 break;
393424 }
394425 if (optarg != NULL && !arg.eof()) {
395 cerr << PROGRAM ": invalid option: `-"
396 << (char)c << optarg << "'\n";
426 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
397427 exit(EXIT_FAILURE);
398428 }
399429 }
400430
401431 if (die) {
402 cerr << "Try `" << PROGRAM
403 << " --help' for more information.\n";
432 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
404433 exit(EXIT_FAILURE);
405434 }
406435
410439 }
411440
412441 if (!opt::db.empty())
413 init(db,
414 opt::db,
415 opt::verbose,
416 PROGRAM,
417 opt::getCommand(argc, argv),
418 opt::metaVars
419 );
442 init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
420443
421444 Alignments alignments(1);
422445 if (optind < argc) {
423 for_each(argv + optind, argv + argc,
424 bind2nd(ptr_fun(readAlignmentsFile), &alignments));
446 for_each(argv + optind, argv + argc, [&alignments](const std::string& s) {
447 readAlignmentsFile(s, &alignments);
448 });
425449 } else {
426450 if (opt::verbose > 0)
427451 cerr << "Reading from standard input..." << endl;
434458
435459 // Print the unpaired alignments.
436460 if (opt::print_all) {
437 for (Alignments::iterator it = alignments.begin();
438 it != alignments.end(); it++) {
461 for (Alignments::iterator it = alignments.begin(); it != alignments.end(); it++) {
439462 #if SAM_SEQ_QUAL
440463 SAMRecord& a0 = it->second;
441464 #else
449472
450473 unsigned numRF = g_histogram.count(INT_MIN, 0);
451474 unsigned numFR = g_histogram.count(1, INT_MAX);
452 size_t sum = alignments.size()
453 + stats.bothUnaligned + stats.oneUnaligned
454 + numFR + numRF + stats.numFF
455 + stats.numDifferent;
456 cerr <<
457 "Mateless " << percent(alignments.size(), sum) << "\n"
458 "Unaligned " << percent(stats.bothUnaligned, sum) << "\n"
459 "Singleton " << percent(stats.oneUnaligned, sum) << "\n"
460 "FR " << percent(numFR, sum) << "\n"
461 "RF " << percent(numRF, sum) << "\n"
462 "FF " << percent(stats.numFF, sum) << "\n"
463 "Different " << percent(stats.numDifferent, sum) << "\n"
464 "Total " << sum << endl;
475 size_t sum = alignments.size() + stats.bothUnaligned + stats.oneUnaligned + numFR + numRF +
476 stats.numFF + stats.numDifferent;
477 cerr << "Mateless " << percent(alignments.size(), sum)
478 << "\n"
479 "Unaligned "
480 << percent(stats.bothUnaligned, sum)
481 << "\n"
482 "Singleton "
483 << percent(stats.oneUnaligned, sum)
484 << "\n"
485 "FR "
486 << percent(numFR, sum)
487 << "\n"
488 "RF "
489 << percent(numRF, sum)
490 << "\n"
491 "FF "
492 << percent(stats.numFF, sum)
493 << "\n"
494 "Different "
495 << percent(stats.numDifferent, sum)
496 << "\n"
497 "Total "
498 << sum << endl;
465499
466500 if (!opt::db.empty()) {
467 vals = make_vector<int>()
468 << alignments.size()
469 << stats.bothUnaligned
470 << stats.oneUnaligned
471 << numFR
472 << numRF
473 << stats.numFF
474 << stats.numDifferent
475 << sum;
476
477 keys = make_vector<string>()
478 << "Mateless"
479 << "Unaligned"
480 << "Singleton"
481 << "FR"
482 << "RF"
483 << "FF"
484 << "Different"
485 << "Total";
486
487 for (unsigned i=0; i<vals.size(); i++)
501 vals = make_vector<int>() << alignments.size() << stats.bothUnaligned << stats.oneUnaligned
502 << numFR << numRF << stats.numFF << stats.numDifferent << sum;
503
504 keys = make_vector<string>() << "Mateless"
505 << "Unaligned"
506 << "Singleton"
507 << "FR"
508 << "RF"
509 << "FF"
510 << "Different"
511 << "Total";
512
513 for (unsigned i = 0; i < vals.size(); i++)
488514 addToDb(db, keys[i], vals[i]);
489515 }
490516
491517 if (alignments.size() == sum) {
492518 cerr << PROGRAM ": error: All reads are mateless. This "
493 "can happen when first and second read IDs do not match."
494 << endl;
519 "can happen when first and second read IDs do not match."
520 << endl;
495521 exit(EXIT_FAILURE);
496522 }
497523
524550
525551 if (stats.numFF > numFR && stats.numFF > numRF) {
526552 cerr << PROGRAM ": error: The mate pairs of this library are "
527 "oriented forward-forward (FF), which is not supported "
528 "by ABySS."
529 << endl;
553 "oriented forward-forward (FF), which is not supported "
554 "by ABySS."
555 << endl;
530556 exit(EXIT_FAILURE);
531557 }
532558
0 #include "config.h"
10 #include "ContigID.h"
21 #include "ContigPath.h"
32 #include "ContigProperties.h"
3 #include "DataBase/DB.h"
4 #include "DataBase/Options.h"
45 #include "Functional.h"
5 #include "IOUtil.h"
6 #include "Uncompress.h"
76 #include "Graph/ContigGraph.h"
87 #include "Graph/ContigGraphAlgorithms.h"
98 #include "Graph/DirectedGraph.h"
109 #include "Graph/GraphIO.h"
10 #include "IOUtil.h"
11 #include "Uncompress.h"
12 #include "config.h"
1113 #include <algorithm>
1214 #include <cassert>
1315 #include <cerrno>
16 #include <cstdlib>
1417 #include <cstring> // for strerror
15 #include <cstdlib>
18 #include <fstream>
1619 #include <functional>
20 #include <getopt.h>
1721 #include <iostream>
18 #include <fstream>
19 #include <getopt.h>
2022 #include <map>
2123 #include <vector>
22 #include "DataBase/Options.h"
23 #include "DataBase/DB.h"
2424
2525 using namespace std;
2626
2828
2929 DB db;
3030
31 static const char *VERSION_MESSAGE =
32 PROGRAM " (ABySS) " VERSION "\n"
33 "Written by Shaun Jackman and Tony Raymond.\n"
34 "\n"
35 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
36
37 static const char *USAGE_MESSAGE =
38 "Usage: " PROGRAM " -k<kmer> [OPTION]... ADJ PATH\n"
39 "Find paths that overlap. Either output the graph of overlapping\n"
40 "paths, assemble overlapping paths into larger paths, or trim the\n"
41 "overlapping paths.\n"
42 "\n"
43 " Arguments:\n"
44 "\n"
45 " ADJ contig adjacency graph\n"
46 " PATH sequences of contig IDs\n"
47 "\n"
48 " Options:\n"
49 "\n"
50 " -k, --kmer=N k-mer size\n"
51 " -g, --graph=FILE write the contig adjacency graph to FILE\n"
52 " -r, --repeats=FILE write repeat contigs to FILE\n"
53 " --overlap find overlapping paths [default]\n"
54 " --assemble assemble overlapping paths\n"
55 " --trim trim overlapping paths\n"
56 " --adj output the graph in ADJ format [default]\n"
57 " --asqg output the graph in ASQG format\n"
58 " --dot output the graph in GraphViz format\n"
59 " --gfa output the graph in GFA1 format\n"
60 " --gfa1 output the graph in GFA1 format\n"
61 " --gfa2 output the graph in GFA2 format\n"
62 " --gv output the graph in GraphViz format\n"
63 " --sam output the graph in SAM format\n"
64 " --SS expect contigs to be oriented correctly\n"
65 " --no-SS no assumption about contig orientation [default]\n"
66 " -v, --verbose display verbose output\n"
67 " --help display this help and exit\n"
68 " --version output version information and exit\n"
69 " --db=FILE specify path of database repository in FILE\n"
70 " --library=NAME specify library NAME for sqlite\n"
71 " --strain=NAME specify strain NAME for sqlite\n"
72 " --species=NAME specify species NAME for sqlite\n"
73 "\n"
74 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
31 static const char* VERSION_MESSAGE =
32 PROGRAM " (ABySS) " VERSION "\n"
33 "Written by Shaun Jackman and Tony Raymond.\n"
34 "\n"
35 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
36
37 static const char* USAGE_MESSAGE =
38 "Usage: " PROGRAM " -k<kmer> [OPTION]... ADJ PATH\n"
39 "Find paths that overlap. Either output the graph of overlapping\n"
40 "paths, assemble overlapping paths into larger paths, or trim the\n"
41 "overlapping paths.\n"
42 "\n"
43 " Arguments:\n"
44 "\n"
45 " ADJ contig adjacency graph\n"
46 " PATH sequences of contig IDs\n"
47 "\n"
48 " Options:\n"
49 "\n"
50 " -k, --kmer=N k-mer size\n"
51 " -g, --graph=FILE write the contig adjacency graph to FILE\n"
52 " -r, --repeats=FILE write repeat contigs to FILE\n"
53 " --overlap find overlapping paths [default]\n"
54 " --assemble assemble overlapping paths\n"
55 " --trim trim overlapping paths\n"
56 " --adj output the graph in ADJ format [default]\n"
57 " --asqg output the graph in ASQG format\n"
58 " --dot output the graph in GraphViz format\n"
59 " --gfa output the graph in GFA1 format\n"
60 " --gfa1 output the graph in GFA1 format\n"
61 " --gfa2 output the graph in GFA2 format\n"
62 " --gv output the graph in GraphViz format\n"
63 " --sam output the graph in SAM format\n"
64 " --SS expect contigs to be oriented correctly\n"
65 " --no-SS no assumption about contig orientation [default]\n"
66 " -v, --verbose display verbose output\n"
67 " --help display this help and exit\n"
68 " --version output version information and exit\n"
69 " --db=FILE specify path of database repository in FILE\n"
70 " --library=NAME specify library NAME for sqlite\n"
71 " --strain=NAME specify strain NAME for sqlite\n"
72 " --species=NAME specify species NAME for sqlite\n"
73 "\n"
74 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
7575
7676 namespace opt {
77 string db;
78 dbVars metaVars;
79 unsigned k;
80
81 /** Output format. */
82 int format; // used by ContigProperties
83
84 /** Write the contig adjacency graph to this file. */
85 static string graphPath;
86
87 /** Output the IDs of contigs in overlaps to this file. */
88 static string repeatContigs;
89
90 /** Run a strand-specific RNA-Seq assembly. */
91 static int ss;
92
93 /** Mode of operation. */
94 enum {
95 /** Find overlapping paths, do not assemble. */
96 OVERLAP,
97 /** Assemble overlapping paths. */
98 ASSEMBLE,
99 /** Trim overlapping paths. */
100 TRIM,
101 };
102 static int mode;
103
104 static int verbose;
77 string db;
78 dbVars metaVars;
79 unsigned k;
80
81 /** Output format. */
82 int format; // used by ContigProperties
83
84 /** Write the contig adjacency graph to this file. */
85 static string graphPath;
86
87 /** Output the IDs of contigs in overlaps to this file. */
88 static string repeatContigs;
89
90 /** Run a strand-specific RNA-Seq assembly. */
91 static int ss;
92
93 /** Mode of operation. */
94 enum
95 {
96 /** Find overlapping paths, do not assemble. */
97 OVERLAP,
98 /** Assemble overlapping paths. */
99 ASSEMBLE,
100 /** Trim overlapping paths. */
101 TRIM,
102 };
103 static int mode;
104
105 static int verbose;
105106 }
106107
107108 static const char* shortopts = "g:k:r:v";
108109
109 enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
110 //enum { OPT_HELP = 1, OPT_VERSION };
111
112 static const struct option longopts[] = {
113 { "graph", required_argument, NULL, 'g' },
114 { "kmer", required_argument, NULL, 'k' },
115 { "assemble", no_argument, &opt::mode, opt::ASSEMBLE },
116 { "overlap", no_argument, &opt::mode, opt::OVERLAP },
117 { "trim", no_argument, &opt::mode, opt::TRIM },
118 { "adj", no_argument, &opt::format, ADJ },
119 { "asqg", no_argument, &opt::format, ASQG },
120 { "dot", no_argument, &opt::format, DOT },
121 { "gfa", no_argument, &opt::format, GFA1 },
122 { "gfa1", no_argument, &opt::format, GFA1 },
123 { "gfa2", no_argument, &opt::format, GFA2 },
124 { "gv", no_argument, &opt::format, DOT },
125 { "sam", no_argument, &opt::format, SAM },
126 { "SS", no_argument, &opt::ss, 1 },
127 { "no-SS", no_argument, &opt::ss, 0 },
128 { "repeats", required_argument, NULL, 'r' },
129 { "verbose", no_argument, NULL, 'v' },
130 { "help", no_argument, NULL, OPT_HELP },
131 { "version", no_argument, NULL, OPT_VERSION },
132 { "db", required_argument, NULL, OPT_DB },
133 { "library", required_argument, NULL, OPT_LIBRARY },
134 { "strain", required_argument, NULL, OPT_STRAIN },
135 { "species", required_argument, NULL, OPT_SPECIES },
136 { NULL, 0, NULL, 0 }
110 enum
111 {
112 OPT_HELP = 1,
113 OPT_VERSION,
114 OPT_DB,
115 OPT_LIBRARY,
116 OPT_STRAIN,
117 OPT_SPECIES
137118 };
119 // enum { OPT_HELP = 1, OPT_VERSION };
120
121 static const struct option longopts[] = { { "graph", required_argument, NULL, 'g' },
122 { "kmer", required_argument, NULL, 'k' },
123 { "assemble", no_argument, &opt::mode, opt::ASSEMBLE },
124 { "overlap", no_argument, &opt::mode, opt::OVERLAP },
125 { "trim", no_argument, &opt::mode, opt::TRIM },
126 { "adj", no_argument, &opt::format, ADJ },
127 { "asqg", no_argument, &opt::format, ASQG },
128 { "dot", no_argument, &opt::format, DOT },
129 { "gfa", no_argument, &opt::format, GFA1 },
130 { "gfa1", no_argument, &opt::format, GFA1 },
131 { "gfa2", no_argument, &opt::format, GFA2 },
132 { "gv", no_argument, &opt::format, DOT },
133 { "sam", no_argument, &opt::format, SAM },
134 { "SS", no_argument, &opt::ss, 1 },
135 { "no-SS", no_argument, &opt::ss, 0 },
136 { "repeats", required_argument, NULL, 'r' },
137 { "verbose", no_argument, NULL, 'v' },
138 { "help", no_argument, NULL, OPT_HELP },
139 { "version", no_argument, NULL, OPT_VERSION },
140 { "db", required_argument, NULL, OPT_DB },
141 { "library", required_argument, NULL, OPT_LIBRARY },
142 { "strain", required_argument, NULL, OPT_STRAIN },
143 { "species", required_argument, NULL, OPT_SPECIES },
144 { NULL, 0, NULL, 0 } };
138145
139146 /** A vertex of the overlap graph. */
140 struct Vertex {
147 struct Vertex
148 {
141149 unsigned id;
142150 bool sense;
143151
145153 static unsigned s_offset;
146154
147155 Vertex(unsigned id, bool sense)
148 : id(id), sense(sense) { }
149
150 bool operator ==(const Vertex& v) const
151 {
152 return id == v.id && sense == v.sense;
153 }
154
155 ContigNode descriptor() const
156 {
157 return ContigNode(s_offset + id, sense);
158 }
156 : id(id)
157 , sense(sense)
158 {}
159
160 bool operator==(const Vertex& v) const { return id == v.id && sense == v.sense; }
161
162 ContigNode descriptor() const { return ContigNode(s_offset + id, sense); }
159163 };
160164
161165 unsigned Vertex::s_offset;
162166
163167 /** An alignment of two overlapping contigs. */
164 struct Overlap {
168 struct Overlap
169 {
165170 Vertex source;
166171 Vertex target;
167172
171176 /** Overlap measured in bp. */
172177 int distance;
173178
174 Overlap(const Vertex& source, const Vertex& target,
175 unsigned overlap, int distance)
176 : source(source), target(target),
177 overlap(overlap), distance(distance) { }
179 Overlap(const Vertex& source, const Vertex& target, unsigned overlap, int distance)
180 : source(source)
181 , target(target)
182 , overlap(overlap)
183 , distance(distance)
184 {}
178185 };
179186
180187 /** The contig IDs that have been removed from paths. */
187194 typedef vector<ContigPath> Paths;
188195
189196 /** Return whether this vertex is a path or a contig. */
190 static bool isPath(const ContigNode& u)
197 static bool
198 isPath(const ContigNode& u)
191199 {
192200 return u.id() >= Vertex::s_offset;
193201 }
194202
195203 /** Return a path, complemented if necessary. */
196 static ContigPath getPath(const Paths& paths, const ContigNode& u)
204 static ContigPath
205 getPath(const Paths& paths, const ContigNode& u)
197206 {
198207 if (isPath(u)) {
199208 unsigned i = u.id() - Vertex::s_offset;
208217 * @param[out] pathIDs the path IDs
209218 * @return the paths
210219 */
211 static Paths readPaths(Graph& g,
212 const string& inPath, vector<string>& pathIDs)
220 static Paths
221 readPaths(Graph& g, const string& inPath, vector<string>& pathIDs)
213222 {
214223 typedef graph_traits<Graph>::vertex_descriptor V;
215224
244253
245254 /** Index the first and last contig of each path to facilitate finding
246255 * overlaps between paths. */
247 static SeedMap makeSeedMap(const Paths& paths)
256 static SeedMap
257 makeSeedMap(const Paths& paths)
248258 {
249259 SeedMap seedMap;
250 for (Paths::const_iterator it = paths.begin();
251 it != paths.end(); ++it) {
260 for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
252261 if (it->empty())
253262 continue;
254263 assert(!it->front().ambiguous());
255 seedMap.insert(make_pair(it->front(),
256 Vertex(it - paths.begin(), false)));
264 seedMap.insert(make_pair(it->front(), Vertex(it - paths.begin(), false)));
257265 assert(!it->back().ambiguous());
258 seedMap.insert(make_pair(it->back() ^ 1,
259 Vertex(it - paths.begin(), true)));
266 seedMap.insert(make_pair(it->back() ^ 1, Vertex(it - paths.begin(), true)));
260267 }
261268 return seedMap;
262269 }
263270
264271 /** Check whether path starts with the sequence [first, last). */
265 static bool startsWith(ContigPath path, bool rc,
266 ContigPath::const_iterator first,
267 ContigPath::const_iterator last)
272 static bool
273 startsWith(
274 ContigPath path,
275 bool rc,
276 ContigPath::const_iterator first,
277 ContigPath::const_iterator last)
268278 {
269279 if (rc)
270280 reverseComplement(path.begin(), path.end());
271281 assert(*first == path.front());
272282 assert(first < last);
273 return unsigned(last - first) > path.size() ? false
274 : equal(first, last, path.begin());
283 return unsigned(last - first) > path.size() ? false : equal(first, last, path.begin());
275284 }
276285
277286 /** Check whether path starts with the sequence [first, last). */
278 static unsigned findOverlap(const Graph& g,
279 const Paths& paths,
280 ContigPath::const_iterator first,
281 ContigPath::const_iterator last,
282 const Vertex& v, int &distance)
287 static unsigned
288 findOverlap(
289 const Graph& g,
290 const Paths& paths,
291 ContigPath::const_iterator first,
292 ContigPath::const_iterator last,
293 const Vertex& v,
294 int& distance)
283295 {
284296 if (!startsWith(paths[v.id], v.sense, first, last))
285297 return 0;
290302 typedef vector<Overlap> Overlaps;
291303
292304 /** Find every path that overlaps with the specified path. */
293 static void findOverlaps(const Graph& g,
294 const Paths& paths, const SeedMap& seedMap,
295 const Vertex& v, Overlaps& overlaps)
305 static void
306 findOverlaps(
307 const Graph& g,
308 const Paths& paths,
309 const SeedMap& seedMap,
310 const Vertex& v,
311 Overlaps& overlaps)
296312 {
297313 ContigPath rc;
298314 if (v.sense) {
301317 }
302318 const ContigPath& path = v.sense ? rc : paths[v.id];
303319
304 for (ContigPath::const_iterator it = path.begin();
305 it != path.end(); ++it) {
320 for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) {
306321 if (it->ambiguous())
307322 continue;
308323
309 pair<SeedMap::const_iterator, SeedMap::const_iterator>
310 range = seedMap.equal_range(*it);
311 for (SeedMap::const_iterator seed = range.first;
312 seed != range.second; ++seed) {
324 pair<SeedMap::const_iterator, SeedMap::const_iterator> range = seedMap.equal_range(*it);
325 for (SeedMap::const_iterator seed = range.first; seed != range.second; ++seed) {
313326 if (v == seed->second)
314327 continue;
315328 int distance = 0;
316 unsigned overlap = findOverlap(g, paths, it, path.end(),
317 seed->second, distance);
329 unsigned overlap = findOverlap(g, paths, it, path.end(), seed->second, distance);
318330 if (overlap > 0)
319 overlaps.push_back(Overlap(v, seed->second,
320 overlap, distance));
321
331 overlaps.push_back(Overlap(v, seed->second, overlap, distance));
322332 }
323333 }
324334 }
325335
326336 /** Find every pair of overlapping paths. */
327 static Overlaps findOverlaps(const Graph& g, const Paths& paths)
337 static Overlaps
338 findOverlaps(const Graph& g, const Paths& paths)
328339 {
329340 SeedMap seedMap = makeSeedMap(paths);
330341
331342 Overlaps overlaps;
332 for (Paths::const_iterator it = paths.begin();
333 it != paths.end(); ++it) {
343 for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
334344 unsigned i = it - paths.begin();
335345 findOverlaps(g, paths, seedMap, Vertex(i, false), overlaps);
336346 findOverlaps(g, paths, seedMap, Vertex(i, true), overlaps);
339349 }
340350
341351 /** Record the trimmed contigs. */
342 static void recordTrimmedContigs(
343 ContigPath::const_iterator first,
344 ContigPath::const_iterator last)
352 static void
353 recordTrimmedContigs(ContigPath::const_iterator first, ContigPath::const_iterator last)
345354 {
346355 for (ContigPath::const_iterator it = first; it != last; ++it)
347356 if (!it->ambiguous())
349358 }
350359
351360 /** Remove ambiguous contigs from the ends of the path. */
352 static void removeAmbiguousContigs(ContigPath& path)
361 static void
362 removeAmbiguousContigs(ContigPath& path)
353363 {
354364 if (!path.empty() && path.back().ambiguous())
355365 path.erase(path.end() - 1);
358368 }
359369
360370 /** Remove the overlapping portion of the specified contig. */
361 static void removeContigs(ContigPath& path,
362 unsigned first, unsigned last)
371 static void
372 removeContigs(ContigPath& path, unsigned first, unsigned last)
363373 {
364374 assert(first <= path.size());
365375 assert(last <= path.size());
376386 }
377387
378388 /** Find the largest overlap for each contig and remove it. */
379 static void trimOverlaps(Paths& paths, const Overlaps& overlaps)
389 static void
390 trimOverlaps(Paths& paths, const Overlaps& overlaps)
380391 {
381392 vector<unsigned> removed[2];
382393 removed[0].resize(paths.size());
383394 removed[1].resize(paths.size());
384395
385 for (Overlaps::const_iterator it = overlaps.begin();
386 it != overlaps.end(); ++it) {
396 for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) {
387397 unsigned& a = removed[!it->source.sense][it->source.id];
388398 unsigned& b = removed[it->target.sense][it->target.id];
389399 a = max(a, it->overlap);
391401 }
392402
393403 for (Paths::iterator it = paths.begin(); it != paths.end(); ++it)
394 removeContigs(*it, removed[0][it - paths.begin()],
395 it->size() - removed[1][it - paths.begin()]);
404 removeContigs(
405 *it, removed[0][it - paths.begin()], it->size() - removed[1][it - paths.begin()]);
396406 }
397407
398408 /** Trim the ends of paths that overlap another path. */
399 static void trimOverlaps(const Graph& g, Paths& paths)
400 {
401 for (Overlaps overlaps = findOverlaps(g, paths);
402 !overlaps.empty();
403 overlaps = findOverlaps(g, paths)) {
409 static void
410 trimOverlaps(const Graph& g, Paths& paths)
411 {
412 for (Overlaps overlaps = findOverlaps(g, paths); !overlaps.empty();
413 overlaps = findOverlaps(g, paths)) {
404414 cerr << "Found " << overlaps.size() / 2 << " overlaps.\n";
405415 trimOverlaps(paths, overlaps);
406416 }
407417 }
408418
409 static inline
410 ContigProperties get(vertex_bundle_t, const Graph& g, ContigNode u)
411 {
412 return u.ambiguous()
413 ? ContigProperties(u.length() + opt::k - 1, 0)
414 : g[u];
419 static inline ContigProperties
420 get(vertex_bundle_t, const Graph& g, ContigNode u)
421 {
422 return u.ambiguous() ? ContigProperties(u.length() + opt::k - 1, 0) : g[u];
415423 }
416424
417425 /** Add the path overlap edges to the specified graph. */
418 static void addPathOverlapEdges(Graph& g,
419 const Paths& paths, const vector<string>& pathIDs,
420 const Overlaps& overlaps)
426 static void
427 addPathOverlapEdges(
428 Graph& g,
429 const Paths& paths,
430 const vector<string>& pathIDs,
431 const Overlaps& overlaps)
421432 {
422433 typedef graph_traits<Graph>::vertex_descriptor V;
423434 const bool allowParallelEdge = opt::mode == opt::ASSEMBLE;
424435
425436 // Add the path vertices.
426437 g_contigNames.unlock();
427 for (Paths::const_iterator it = paths.begin();
428 it != paths.end(); ++it) {
438 for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
429439 const ContigPath& path = *it;
430440 const string& id = pathIDs[it - paths.begin()];
431441 if (!path.empty()) {
436446 g_contigNames.lock();
437447
438448 // Remove the single-end contigs that are in paths.
439 for (Paths::const_iterator it = paths.begin();
440 it != paths.end(); ++it)
441 remove_vertex_if(g, it->begin(), it->end(),
442 not1(std::mem_fun_ref(&ContigNode::ambiguous)));
449 for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it)
450 remove_vertex_if(
451 g, it->begin(), it->end(), [](const ContigNode& c) { return !c.ambiguous(); });
443452
444453 // Add the path edges.
445 for (Overlaps::const_iterator it = overlaps.begin();
446 it != overlaps.end(); ++it) {
454 for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) {
447455 V u = it->source.descriptor();
448456 V v = it->target.descriptor();
449457 if (allowParallelEdge || !edge(u, v, g).second)
450458 add_edge(u, v, it->distance, static_cast<DG&>(g));
451459 else if (opt::verbose > 0)
452 cerr << "ambiguous overlap: " << get(vertex_name, g, u)
453 << " -> " << get(vertex_name, g, v) << '\n';
460 cerr << "ambiguous overlap: " << get(vertex_name, g, u) << " -> "
461 << get(vertex_name, g, v) << '\n';
454462 }
455463 }
456464
461469 typedef map<edge_descriptor, unsigned> OverlapMap;
462470
463471 /** Return the number of contigs by which the two paths overlap. */
464 static unsigned getOverlap(const OverlapMap& pmap,
465 graph_traits<Graph>::vertex_descriptor u,
466 graph_traits<Graph>::vertex_descriptor v)
472 static unsigned
473 getOverlap(
474 const OverlapMap& pmap,
475 graph_traits<Graph>::vertex_descriptor u,
476 graph_traits<Graph>::vertex_descriptor v)
467477 {
468478 if (isPath(u) && isPath(v)) {
469479 // Both vertices are paths.
470 OverlapMap::const_iterator it = pmap.find(
471 edge_descriptor(u, v));
480 OverlapMap::const_iterator it = pmap.find(edge_descriptor(u, v));
472481 return it == pmap.end() ? 0 : it->second;
473482 } else {
474483 // One of the two vertices is a contig.
477486 }
478487
479488 /** Merge a sequence of overlapping paths. */
480 static ContigPath mergePaths(const Paths& paths,
481 const OverlapMap& overlaps, const ContigPath& merge)
489 static ContigPath
490 mergePaths(const Paths& paths, const OverlapMap& overlaps, const ContigPath& merge)
482491 {
483492 assert(!merge.empty());
484493 ContigNode u = merge.front();
485494 ContigPath path(getPath(paths, u));
486 for (ContigPath::const_iterator it = merge.begin() + 1;
487 it != merge.end(); ++it) {
495 for (ContigPath::const_iterator it = merge.begin() + 1; it != merge.end(); ++it) {
488496 ContigNode v = *it;
489497 ContigPath vpath(getPath(paths, v));
490498 unsigned overlap = getOverlap(overlaps, u, v);
491499 assert(path.size() > overlap);
492500 assert(vpath.size() > overlap);
493 assert(equal(path.end() - overlap, path.end(),
494 vpath.begin()));
501 assert(equal(path.end() - overlap, path.end(), vpath.begin()));
495502 path.insert(path.end(), vpath.begin() + overlap, vpath.end());
496503 u = v;
497504 }
499506 }
500507
501508 /** Return true if the edge e is a path overlap. */
502 struct IsPathOverlap : unary_function<edge_descriptor, bool> {
503 IsPathOverlap(const Graph& g, const OverlapMap& pmap,
504 const IsPositive<Graph>& pred)
505 : m_g(g), m_pmap(pmap), m_isPositive(pred) { }
509 struct IsPathOverlap : unary_function<edge_descriptor, bool>
510 {
511 IsPathOverlap(const Graph& g, const OverlapMap& pmap, const IsPositive<Graph>& pred)
512 : m_g(g)
513 , m_pmap(pmap)
514 , m_isPositive(pred)
515 {}
506516 bool operator()(edge_descriptor e) const
507517 {
508518 bool stranded = true;
509519 if (opt::ss)
510520 stranded = m_isPositive(e);
511 return stranded &&
512 getOverlap(m_pmap, source(e, m_g), target(e, m_g));
513 }
521 return stranded && getOverlap(m_pmap, source(e, m_g), target(e, m_g));
522 }
523
514524 private:
515525 const Graph& m_g;
516526 const OverlapMap& m_pmap;
518528 };
519529
520530 /** Assemble overlapping paths. */
521 static void assembleOverlappingPaths(Graph& g,
522 Paths& paths, vector<string>& pathIDs)
531 static void
532 assembleOverlappingPaths(Graph& g, Paths& paths, vector<string>& pathIDs)
523533 {
524534 if (paths.empty())
525535 return;
530540
531541 // Create a property map of path overlaps.
532542 OverlapMap overlapMap;
533 for (Overlaps::const_iterator it = overlaps.begin();
534 it != overlaps.end(); ++it)
543 for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it)
535544 overlapMap.insert(OverlapMap::value_type(
536 OverlapMap::key_type(
537 it->source.descriptor(),
538 it->target.descriptor()),
539 it->overlap));
545 OverlapMap::key_type(it->source.descriptor(), it->target.descriptor()), it->overlap));
540546
541547 // Assemble unambiguously overlapping paths.
542548 Paths merges;
543 assemble_if(g, back_inserter(merges),
544 IsPathOverlap(g, overlapMap, IsPositive<Graph>(g)));
549 assemble_if(g, back_inserter(merges), IsPathOverlap(g, overlapMap, IsPositive<Graph>(g)));
545550
546551 // Merge overlapping paths.
547552 g_contigNames.unlock();
548553 assert(!pathIDs.empty());
549554 setNextContigName(pathIDs.back());
550 for (Paths::const_iterator it = merges.begin();
551 it != merges.end(); ++it) {
555 for (Paths::const_iterator it = merges.begin(); it != merges.end(); ++it) {
552556 string name = createContigName();
553557 if (opt::verbose > 0)
554558 cerr << name << '\t' << *it << '\n';
558562 paths.push_back(mergePaths(paths, overlapMap, *it));
559563
560564 // Remove the merged paths.
561 for (ContigPath::const_iterator it2 = it->begin();
562 it2 != it->end(); ++it2) {
565 for (ContigPath::const_iterator it2 = it->begin(); it2 != it->end(); ++it2) {
563566 if (isPath(*it2))
564567 paths[it2->id() - Vertex::s_offset].clear();
565568 }
567570 g_contigNames.lock();
568571 }
569572
570 int main(int argc, char** argv)
573 int
574 main(int argc, char** argv)
571575 {
572576 string commandLine;
573577 {
574578 ostringstream ss;
575579 char** last = argv + argc - 1;
576 copy(argv, last, ostream_iterator<const char *>(ss, " "));
580 copy(argv, last, ostream_iterator<const char*>(ss, " "));
577581 ss << *last;
578582 commandLine = ss.str();
579583 }
582586 opt::metaVars.resize(3);
583587
584588 bool die = false;
585 for (int c; (c = getopt_long(argc, argv,
586 shortopts, longopts, NULL)) != -1;) {
589 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
587590 istringstream arg(optarg != NULL ? optarg : "");
588591 switch (c) {
589 case '?': die = true; break;
590 case 'g': arg >> opt::graphPath; break;
591 case 'k': arg >> opt::k; break;
592 case 'r': arg >> opt::repeatContigs; break;
593 case 'v': opt::verbose++; break;
594 case OPT_HELP:
595 cout << USAGE_MESSAGE;
596 exit(EXIT_SUCCESS);
597 case OPT_VERSION:
598 cout << VERSION_MESSAGE;
599 exit(EXIT_SUCCESS);
600 case OPT_DB:
601 arg >> opt::db; break;
602 case OPT_LIBRARY:
603 arg >> opt::metaVars[0]; break;
604 case OPT_STRAIN:
605 arg >> opt::metaVars[1]; break;
606 case OPT_SPECIES:
607 arg >> opt::metaVars[2]; break;
592 case '?':
593 die = true;
594 break;
595 case 'g':
596 arg >> opt::graphPath;
597 break;
598 case 'k':
599 arg >> opt::k;
600 break;
601 case 'r':
602 arg >> opt::repeatContigs;
603 break;
604 case 'v':
605 opt::verbose++;
606 break;
607 case OPT_HELP:
608 cout << USAGE_MESSAGE;
609 exit(EXIT_SUCCESS);
610 case OPT_VERSION:
611 cout << VERSION_MESSAGE;
612 exit(EXIT_SUCCESS);
613 case OPT_DB:
614 arg >> opt::db;
615 break;
616 case OPT_LIBRARY:
617 arg >> opt::metaVars[0];
618 break;
619 case OPT_STRAIN:
620 arg >> opt::metaVars[1];
621 break;
622 case OPT_SPECIES:
623 arg >> opt::metaVars[2];
624 break;
608625 }
609626 if (optarg != NULL && !arg.eof()) {
610 cerr << PROGRAM ": invalid option: `-"
611 << (char)c << optarg << "'\n";
627 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
612628 exit(EXIT_FAILURE);
613629 }
614630 }
627643 }
628644
629645 if (die) {
630 cerr << "Try `" << PROGRAM
631 << " --help' for more information.\n";
646 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
632647 exit(EXIT_FAILURE);
633648 }
634649
635 const char *adjPath = argv[optind++];
650 const char* adjPath = argv[optind++];
636651 if (opt::verbose > 0)
637652 cerr << "Reading `" << adjPath << "'..." << endl;
638653 ifstream fin(adjPath);
646661 Paths paths = readPaths(g, pathsFile, pathIDs);
647662
648663 switch (opt::mode) {
649 case opt::OVERLAP:
664 case opt::OVERLAP:
650665 // Find overlapping paths, do not assemble.
651 addPathOverlapEdges(g, paths, pathIDs,
652 findOverlaps(g, paths));
666 addPathOverlapEdges(g, paths, pathIDs, findOverlaps(g, paths));
653667 paths.clear();
654668 if (opt::graphPath.empty())
655669 opt::graphPath = "-";
656670 break;
657671
658 case opt::ASSEMBLE:
672 case opt::ASSEMBLE:
659673 // Assemble overlapping paths.
660674 assembleOverlappingPaths(g, paths, pathIDs);
661675 break;
662676
663 case opt::TRIM:
677 case opt::TRIM:
664678 // Trim overlapping paths.
665679 trimOverlaps(g, paths);
666680 // Remove paths consisting of a single contig.
667 for_each_if(paths.begin(), paths.end(),
668 mem_fun_ref(&ContigPath::clear),
669 compose1(
670 bind2nd(equal_to<ContigPath::size_type>(), 1),
671 mem_fun_ref(&ContigPath::size)));
681 for_each_if(
682 paths.begin(),
683 paths.end(),
684 [](ContigPath& c) { return c.clear(); },
685 [](const ContigPath& c) { return c.size() == 1; });
672686 // Add the paths to the graph.
673687 addPathOverlapEdges(g, paths, pathIDs, Overlaps());
674688 break;
675689 }
676690
677691 // Output the paths.
678 for (Paths::const_iterator it = paths.begin();
679 it != paths.end(); ++it) {
692 for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
680693 if (it->empty())
681694 continue;
682695 assert(it->size() != 1);
687700 // Output the graph.
688701 if (!opt::graphPath.empty()) {
689702 ofstream fout;
690 ostream& out = opt::graphPath == "-" ? cout
691 : (fout.open(opt::graphPath.c_str()), fout);
703 ostream& out = opt::graphPath == "-" ? cout : (fout.open(opt::graphPath.c_str()), fout);
692704 assert_good(out, opt::graphPath);
693705 write_graph(out, g, PROGRAM, commandLine);
694706 assert_good(out, opt::graphPath);
698710 if (!opt::repeatContigs.empty()) {
699711 sort(s_trimmedContigs.begin(), s_trimmedContigs.end());
700712 s_trimmedContigs.erase(
701 unique(s_trimmedContigs.begin(),
702 s_trimmedContigs.end()), s_trimmedContigs.end());
713 unique(s_trimmedContigs.begin(), s_trimmedContigs.end()), s_trimmedContigs.end());
703714 ofstream out(opt::repeatContigs.c_str());
704715 assert_good(out, opt::repeatContigs);
705 for (vector<ContigID>::const_iterator it
706 = s_trimmedContigs.begin();
707 it != s_trimmedContigs.end(); ++it)
716 for (vector<ContigID>::const_iterator it = s_trimmedContigs.begin();
717 it != s_trimmedContigs.end();
718 ++it)
708719 out << get(g_contigNames, *it) << '\n';
709720 assert_good(out, opt::repeatContigs);
710721 }
711722
712723 if (!opt::db.empty()) {
713 init(db,
714 opt::db,
715 opt::verbose,
716 PROGRAM,
717 opt::getCommand(argc, argv),
718 opt::metaVars);
724 init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
719725 addToDb(db, "SS", opt::ss);
720726 addToDb(db, "K", opt::k);
721727 }
22 * Written by Shaun Jackman <sjackman@bcgsc.ca>.
33 */
44
5 #include "config.h"
5 #include "Graph/PopBubbles.h"
66 #include "Common/Options.h"
77 #include "ConstString.h"
88 #include "ContigPath.h"
99 #include "ContigProperties.h"
1010 #include "FastaReader.h"
11 #include "IOUtil.h"
12 #include "Sequence.h"
13 #include "Uncompress.h"
14 #include "alignGlobal.h"
1511 #include "Graph/ContigGraph.h"
1612 #include "Graph/ContigGraphAlgorithms.h"
1713 #include "Graph/DepthFirstSearch.h"
1814 #include "Graph/DirectedGraph.h"
1915 #include "Graph/GraphIO.h"
2016 #include "Graph/GraphUtil.h"
21 #include "Graph/PopBubbles.h"
17 #include "IOUtil.h"
18 #include "Sequence.h"
19 #include "Uncompress.h"
20 #include "alignGlobal.h"
21 #include "config.h"
22 #include <algorithm>
2223 #include <boost/lambda/bind.hpp>
2324 #include <boost/lambda/lambda.hpp>
24 #include <algorithm>
2525 #include <climits> // for UINT_MAX
2626 #include <fstream>
2727 #include <functional>
2828 #include <getopt.h>
29 #include <map>
3029 #include <iostream>
3130 #include <iterator>
31 #include <map>
3232 #include <set>
3333 #include <sstream>
3434 #include <string>
3535 #include <utility>
3636 #include <vector>
3737 #if _OPENMP
38 # include <omp.h>
38 #include <omp.h>
3939 #endif
4040
4141 using namespace std;
4545 #define PROGRAM "PopBubbles"
4646
4747 static const char VERSION_MESSAGE[] =
48 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
49 "Written by Shaun Jackman.\n"
50 "\n"
51 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
48 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
49 "Written by Shaun Jackman.\n"
50 "\n"
51 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
5252
5353 static const char USAGE_MESSAGE[] =
54 "Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA ADJ\n"
55 "Identify and pop simple bubbles.\n"
56 "\n"
57 " Arguments:\n"
58 "\n"
59 " FASTA contigs in FASTA format\n"
60 " ADJ contig adjacency graph\n"
61 "\n"
62 " Options:\n"
63 "\n"
64 " -k, --kmer=N k-mer size\n"
65 " -a, --branches=N maximum number of branches, default: 2\n"
66 " -b, --bubble-length=N pop bubbles shorter than N bp\n"
67 " default is 10000\n"
68 " -p, --identity=REAL minimum identity, default: 0.9\n"
69 " -c, --coverage=REAL remove contigs with mean k-mer coverage\n"
70 " less than this threshold [0]\n"
71 " --scaffold scaffold over bubbles that have\n"
72 " insufficient identity\n"
73 " --no-scaffold disable scaffolding [default]\n"
74 " --SS expect contigs to be oriented correctly\n"
75 " --no-SS no assumption about contig orientation [default]\n"
76 " -g, --graph=FILE write the contig adjacency graph to FILE\n"
77 " --adj output the graph in ADJ format [default]\n"
78 " --asqg output the graph in ASQG format\n"
79 " --dot output the graph in GraphViz format\n"
80 " --gfa output the graph in GFA1 format\n"
81 " --gfa1 output the graph in GFA1 format\n"
82 " --gfa2 output the graph in GFA2 format\n"
83 " --gv output the graph in GraphViz format\n"
84 " --sam output the graph in SAM format\n"
85 " --bubble-graph output a graph of the bubbles\n"
86 " -j, --threads=N use N parallel threads [1]\n"
87 " -v, --verbose display verbose output\n"
88 " --help display this help and exit\n"
89 " --version output version information and exit\n"
90 "\n"
91 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
54 "Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA ADJ\n"
55 "Identify and pop simple bubbles.\n"
56 "\n"
57 " Arguments:\n"
58 "\n"
59 " FASTA contigs in FASTA format\n"
60 " ADJ contig adjacency graph\n"
61 "\n"
62 " Options:\n"
63 "\n"
64 " -k, --kmer=N k-mer size\n"
65 " -a, --branches=N maximum number of branches, default: 2\n"
66 " -b, --bubble-length=N pop bubbles shorter than N bp\n"
67 " default is 10000\n"
68 " -p, --identity=REAL minimum identity, default: 0.9\n"
69 " -c, --coverage=REAL remove contigs with mean k-mer coverage\n"
70 " less than this threshold [0]\n"
71 " --scaffold scaffold over bubbles that have\n"
72 " insufficient identity\n"
73 " --no-scaffold disable scaffolding [default]\n"
74 " --SS expect contigs to be oriented correctly\n"
75 " --no-SS no assumption about contig orientation [default]\n"
76 " -g, --graph=FILE write the contig adjacency graph to FILE\n"
77 " --adj output the graph in ADJ format [default]\n"
78 " --asqg output the graph in ASQG format\n"
79 " --dot output the graph in GraphViz format\n"
80 " --gfa output the graph in GFA1 format\n"
81 " --gfa1 output the graph in GFA1 format\n"
82 " --gfa2 output the graph in GFA2 format\n"
83 " --gv output the graph in GraphViz format\n"
84 " --sam output the graph in SAM format\n"
85 " --bubble-graph output a graph of the bubbles\n"
86 " -j, --threads=N use N parallel threads [1]\n"
87 " -v, --verbose display verbose output\n"
88 " --help display this help and exit\n"
89 " --version output version information and exit\n"
90 "\n"
91 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
9292
9393 namespace opt {
94 unsigned k; // used by ContigProperties
95
96 /** Maximum number of branches. */
97 static unsigned maxBranches = 2;
98
99 /** Pop bubbles shorter than this threshold. */
100 static unsigned maxLength = 10000;
101
102 /** Minimum identity. */
103 static float identity = 0.9;
104
105 /** Minimum mean k-mer coverage. */
106 static float minCoverage;
107
108 /** Scaffold over bubbles that have insufficient identity. */
109 static int scaffold;
110
111 /** Write the contig adjacency graph to this file. */
112 static string graphPath;
113
114 /** Output a graph of the bubbles. */
115 static int bubbleGraph;
116
117 int format; // used by ContigProperties
118
119 /** Run a strand-specific RNA-Seq assembly. */
120 static int ss;
121
122 /** Number of threads. */
123 static int threads = 1;
94 unsigned k; // used by ContigProperties
95
96 /** Maximum number of branches. */
97 static unsigned maxBranches = 2;
98
99 /** Pop bubbles shorter than this threshold. */
100 static unsigned maxLength = 10000;
101
102 /** Minimum identity. */
103 static float identity = 0.9;
104
105 /** Minimum mean k-mer coverage. */
106 static float minCoverage;
107
108 /** Scaffold over bubbles that have insufficient identity. */
109 static int scaffold;
110
111 /** Write the contig adjacency graph to this file. */
112 static string graphPath;
113
114 /** Output a graph of the bubbles. */
115 static int bubbleGraph;
116
117 int format; // used by ContigProperties
118
119 /** Run a strand-specific RNA-Seq assembly. */
120 static int ss;
121
122 /** Number of threads. */
123 static int threads = 1;
124124 }
125125
126126 static const char shortopts[] = "a:b:c:g:j:k:p:v";
127127
128 enum { OPT_HELP = 1, OPT_VERSION };
129
130 static const struct option longopts[] = {
131 { "branches", required_argument, NULL, 'a' },
132 { "bubble-length", required_argument, NULL, 'b' },
133 { "coverage", required_argument, NULL, 'c' },
134 { "bubble-graph", no_argument, &opt::bubbleGraph, 1, },
135 { "graph", required_argument, NULL, 'g' },
136 { "adj", no_argument, &opt::format, ADJ },
137 { "asqg", no_argument, &opt::format, ASQG },
138 { "dot", no_argument, &opt::format, DOT },
139 { "gfa", no_argument, &opt::format, GFA1 },
140 { "gfa1", no_argument, &opt::format, GFA1 },
141 { "gfa2", no_argument, &opt::format, GFA2 },
142 { "gv", no_argument, &opt::format, DOT },
143 { "sam", no_argument, &opt::format, SAM },
144 { "kmer", required_argument, NULL, 'k' },
145 { "identity", required_argument, NULL, 'p' },
146 { "scaffold", no_argument, &opt::scaffold, 1},
147 { "no-scaffold", no_argument, &opt::scaffold, 0},
148 { "SS", no_argument, &opt::ss, 1 },
149 { "no-SS", no_argument, &opt::ss, 0 },
150 { "threads", required_argument, NULL, 'j' },
151 { "verbose", no_argument, NULL, 'v' },
152 { "help", no_argument, NULL, OPT_HELP },
153 { "version", no_argument, NULL, OPT_VERSION },
154 { NULL, 0, NULL, 0 }
128 enum
129 {
130 OPT_HELP = 1,
131 OPT_VERSION
155132 };
133
134 static const struct option longopts[] = { { "branches", required_argument, NULL, 'a' },
135 { "bubble-length", required_argument, NULL, 'b' },
136 { "coverage", required_argument, NULL, 'c' },
137 {
138 "bubble-graph",
139 no_argument,
140 &opt::bubbleGraph,
141 1,
142 },
143 { "graph", required_argument, NULL, 'g' },
144 { "adj", no_argument, &opt::format, ADJ },
145 { "asqg", no_argument, &opt::format, ASQG },
146 { "dot", no_argument, &opt::format, DOT },
147 { "gfa", no_argument, &opt::format, GFA1 },
148 { "gfa1", no_argument, &opt::format, GFA1 },
149 { "gfa2", no_argument, &opt::format, GFA2 },
150 { "gv", no_argument, &opt::format, DOT },
151 { "sam", no_argument, &opt::format, SAM },
152 { "kmer", required_argument, NULL, 'k' },
153 { "identity", required_argument, NULL, 'p' },
154 { "scaffold", no_argument, &opt::scaffold, 1 },
155 { "no-scaffold", no_argument, &opt::scaffold, 0 },
156 { "SS", no_argument, &opt::ss, 1 },
157 { "no-SS", no_argument, &opt::ss, 0 },
158 { "threads", required_argument, NULL, 'j' },
159 { "verbose", no_argument, NULL, 'v' },
160 { "help", no_argument, NULL, OPT_HELP },
161 { "version", no_argument, NULL, OPT_VERSION },
162 { NULL, 0, NULL, 0 } };
156163
157164 /** Popped branches. */
158165 static vector<ContigID> g_popped;
159166
160167 /** Contig adjacency graph. */
161 typedef ContigGraph<DirectedGraph<ContigProperties, Distance> > Graph;
168 typedef ContigGraph<DirectedGraph<ContigProperties, Distance>> Graph;
162169 typedef Graph::vertex_descriptor vertex_descriptor;
163170 typedef Graph::adjacency_iterator adjacency_iterator;
164171
165172 /** Return the distance from vertex u to v. */
166 static int getDistance(const Graph& g,
167 vertex_descriptor u, vertex_descriptor v)
173 static int
174 getDistance(const Graph& g, vertex_descriptor u, vertex_descriptor v)
168175 {
169176 typedef graph_traits<Graph>::edge_descriptor edge_descriptor;
170177 pair<edge_descriptor, bool> e = edge(u, v, g);
172179 return g[e.first].distance;
173180 }
174181
175 struct CompareCoverage {
182 struct CompareCoverage
183 {
176184 const Graph& g;
177 CompareCoverage(const Graph& g) : g(g) { }
185 CompareCoverage(const Graph& g)
186 : g(g)
187 {}
178188 bool operator()(vertex_descriptor u, vertex_descriptor v)
179189 {
180190 return g[u].coverage > g[v].coverage;
182192 };
183193
184194 /** Pop the bubble between vertices v and tail. */
185 static void popBubble(Graph& g,
186 vertex_descriptor v, vertex_descriptor tail)
195 static void
196 popBubble(Graph& g, vertex_descriptor v, vertex_descriptor tail)
187197 {
188198 unsigned nbranches = g.out_degree(v);
189199 assert(nbranches > 1);
190200 assert(nbranches == g.in_degree(tail));
191201 vector<vertex_descriptor> sorted(nbranches);
192 pair<adjacency_iterator, adjacency_iterator>
193 adj = g.adjacent_vertices(v);
202 pair<adjacency_iterator, adjacency_iterator> adj = g.adjacent_vertices(v);
194203 copy(adj.first, adj.second, sorted.begin());
195204 sort(sorted.begin(), sorted.end(), CompareCoverage(g));
196205 if (opt::bubbleGraph)
197206 #pragma omp critical(cout)
198207 {
199208 cout << '"' << get(vertex_name, g, v) << "\" -> {";
200 for (vector<vertex_descriptor>::const_iterator
201 it = sorted.begin(); it != sorted.end(); ++it)
209 for (vector<vertex_descriptor>::const_iterator it = sorted.begin(); it != sorted.end();
210 ++it)
202211 cout << " \"" << get(vertex_name, g, *it) << '"';
203212 cout << " } -> \"" << get(vertex_name, g, tail) << "\"\n";
204213 }
205214 #pragma omp critical(g_popped)
206 transform(sorted.begin() + 1, sorted.end(),
207 back_inserter(g_popped),
208 mem_fun_ref(&ContigNode::contigIndex));
209 }
210
211 static struct {
215 transform(sorted.begin() + 1, sorted.end(), back_inserter(g_popped), [](const ContigNode& c) {
216 return c.contigIndex();
217 });
218 }
219
220 static struct
221 {
212222 unsigned bubbles;
213223 unsigned popped;
214224 unsigned scaffold;
223233 static Contigs g_contigs;
224234
225235 /** Return the sequence of vertex u. */
226 static string getSequence(const Graph* g, vertex_descriptor u)
236 static string
237 getSequence(const Graph* g, vertex_descriptor u)
227238 {
228239 size_t i = get(vertex_contig_index, *g, u);
229240 assert(i < g_contigs.size());
232243 }
233244
234245 /** Return the length of vertex v. */
235 static unsigned getLength(const Graph* g, vertex_descriptor v)
246 static unsigned
247 getLength(const Graph* g, vertex_descriptor v)
236248 {
237249 return (*g)[v].length;
238250 }
242254 * @param v the vertex to the right of the bubble
243255 * @return the identity of the global alignment
244256 */
245 template <typename It>
246 static float getAlignmentIdentity(const Graph& g,
247 vertex_descriptor t, vertex_descriptor v,
248 It first, It last)
257 template<typename It>
258 static float
259 getAlignmentIdentity(const Graph& g, vertex_descriptor t, vertex_descriptor v, It first, It last)
249260 {
250261 unsigned nbranches = distance(first, last);
251262 vector<int> inDists(nbranches);
252 transform(first, last, inDists.begin(),
253 boost::lambda::bind(getDistance, boost::cref(g), t, _1));
263 transform(
264 first, last, inDists.begin(), boost::lambda::bind(getDistance, boost::cref(g), t, _1));
254265 vector<int> outDists(nbranches);
255 transform(first, last, outDists.begin(),
256 boost::lambda::bind(getDistance, boost::cref(g), _1, v));
266 transform(
267 first, last, outDists.begin(), boost::lambda::bind(getDistance, boost::cref(g), _1, v));
257268 vector<int> insertLens(nbranches);
258 transform(first, last, insertLens.begin(),
259 boost::lambda::bind(getDistance, boost::cref(g), t, _1)
260 + boost::lambda::bind(getLength, &g, _1)
261 + boost::lambda::bind(getDistance, boost::cref(g), _1, v));
262
263 int max_in_overlap = -(*min_element(inDists.begin(),
264 inDists.end()));
269 transform(
270 first,
271 last,
272 insertLens.begin(),
273 boost::lambda::bind(getDistance, boost::cref(g), t, _1) +
274 boost::lambda::bind(getLength, &g, _1) +
275 boost::lambda::bind(getDistance, boost::cref(g), _1, v));
276
277 int max_in_overlap = -(*min_element(inDists.begin(), inDists.end()));
265278 assert(max_in_overlap >= 0);
266 int max_out_overlap = -(*min_element(outDists.begin(),
267 outDists.end()));
279 int max_out_overlap = -(*min_element(outDists.begin(), outDists.end()));
268280 assert(max_out_overlap >= 0);
269 int min_insert_len = *min_element(insertLens.begin(),
270 insertLens.end());
271 int max_insert_len = *max_element(insertLens.begin(),
272 insertLens.end());
273
274 float max_identity =
275 (float)(min_insert_len + max_in_overlap + max_out_overlap) /
276 (max_insert_len + max_in_overlap + max_out_overlap);
281 int min_insert_len = *min_element(insertLens.begin(), insertLens.end());
282 int max_insert_len = *max_element(insertLens.begin(), insertLens.end());
283
284 float max_identity = (float)(min_insert_len + max_in_overlap + max_out_overlap) /
285 (max_insert_len + max_in_overlap + max_out_overlap);
277286 if (min_insert_len <= 0 || max_identity < opt::identity)
278287 return max_identity;
279288
290299 unsigned matches, consensusSize;
291300 tie(matches, consensusSize) = align(seqs);
292301 return (float)(matches + max_in_overlap + max_out_overlap) /
293 (consensusSize + max_in_overlap + max_out_overlap);
302 (consensusSize + max_in_overlap + max_out_overlap);
294303 }
295304
296305 /** Pop the specified bubble if it is a simple bubble.
297306 * @return whether the bubble is popped
298307 */
299 static bool popSimpleBubble(Graph* pg, vertex_descriptor v)
308 static bool
309 popSimpleBubble(Graph* pg, vertex_descriptor v)
300310 {
301311 Graph& g = *pg;
302312 unsigned nbranches = g.out_degree(v);
309319 }
310320 vertex_descriptor tail = *g.adjacent_vertices(v1).first;
311321 if (v == get(vertex_complement, g, tail) // Palindrome
312 || g.in_degree(tail) != nbranches) {
322 || g.in_degree(tail) != nbranches) {
313323 #pragma omp atomic
314324 g_count.notSimple++;
315325 return false;
316326 }
317327
318328 // Check that every branch is simple and ends at the same node.
319 pair<adjacency_iterator, adjacency_iterator>
320 adj = g.adjacent_vertices(v);
329 pair<adjacency_iterator, adjacency_iterator> adj = g.adjacent_vertices(v);
321330 for (adjacency_iterator it = adj.first; it != adj.second; ++it) {
322331 if (g.out_degree(*it) != 1 || g.in_degree(*it) != 1) {
323332 #pragma omp atomic
336345 #pragma omp critical(cerr)
337346 {
338347 cerr << "\n* " << get(vertex_name, g, v) << " ->";
339 for (adjacency_iterator it = adj.first;
340 it != adj.second; ++it)
348 for (adjacency_iterator it = adj.first; it != adj.second; ++it)
341349 cerr << ' ' << get(vertex_name, g, *it);
342350 cerr << " -> " << get(vertex_name, g, tail) << '\n';
343351 }
353361 }
354362
355363 vector<unsigned> lengths(nbranches);
356 transform(adj.first, adj.second, lengths.begin(),
357 bind1st(ptr_fun(getLength), &g));
364 transform(adj.first, adj.second, lengths.begin(), [&g](const ContigNode& c) {
365 return getLength(&g, c);
366 });
358367 unsigned minLength = *min_element(lengths.begin(), lengths.end());
359368 unsigned maxLength = *max_element(lengths.begin(), lengths.end());
360369 if (maxLength >= opt::maxLength) {
363372 g_count.tooLong++;
364373 if (opt::verbose > 1)
365374 #pragma omp critical(cerr)
366 cerr << minLength << '\t' << maxLength
367 << "\t0\t(too long)\n";
375 cerr << minLength << '\t' << maxLength << "\t0\t(too long)\n";
368376 return false;
369377 }
370378
371 float identity = opt::identity == 0 ? 0
372 : getAlignmentIdentity(g, v, tail, adj.first, adj.second);
379 float identity =
380 opt::identity == 0 ? 0 : getAlignmentIdentity(g, v, tail, adj.first, adj.second);
373381 bool dissimilar = identity < opt::identity;
374382 if (opt::verbose > 1)
375383 #pragma omp critical(cerr)
376384 cerr << minLength << '\t' << maxLength << '\t' << identity
377 << (dissimilar ? "\t(dissimilar)" : "") << '\n';
385 << (dissimilar ? "\t(dissimilar)" : "") << '\n';
378386 if (dissimilar) {
379387 // Insufficient identity.
380388 #pragma omp atomic
389397 }
390398
391399 /** Add distances to a path. */
392 static ContigPath addDistance(const Graph& g, const ContigPath& path)
400 static ContigPath
401 addDistance(const Graph& g, const ContigPath& path)
393402 {
394403 ContigPath out;
395404 out.reserve(path.size());
396405 ContigNode u = path.front();
397406 out.push_back(u);
398 for (ContigPath::const_iterator it = path.begin() + 1;
399 it != path.end(); ++it) {
407 for (ContigPath::const_iterator it = path.begin() + 1; it != path.end(); ++it) {
400408 ContigNode v = *it;
401409 int distance = getDistance(g, u, v);
402410 if (distance >= 0) {
412420 }
413421
414422 /** Return the length of the longest path through the bubble. */
415 static int longestPath(const Graph& g, const Bubble& topo)
423 static int
424 longestPath(const Graph& g, const Bubble& topo)
416425 {
417426 typedef graph_traits<Graph>::edge_descriptor E;
418427 typedef graph_traits<Graph>::out_edge_iterator Eit;
421430 EdgeWeightMap<Graph> weight(g);
422431 map<ContigNode, int> distance;
423432 distance[topo.front()] = 0;
424 for (Bubble::const_iterator it = topo.begin();
425 it != topo.end(); ++it) {
433 for (Bubble::const_iterator it = topo.begin(); it != topo.end(); ++it) {
426434 V u = *it;
427435 Eit eit, elast;
428436 for (tie(eit, elast) = out_edges(u, g); eit != elast; ++eit) {
439447 * Add an edge (u,w) with the distance property set to the length of
440448 * the largest branch of the bubble.
441449 */
442 static void scaffoldBubble(Graph& g, const Bubble& bubble)
450 static void
451 scaffoldBubble(Graph& g, const Bubble& bubble)
443452 {
444453 typedef graph_traits<Graph>::vertex_descriptor V;
445454 assert(opt::scaffold);
455464 assert(bubble.size() > 2);
456465 size_t n = bubble.size() - 2;
457466 g_popped.reserve(g_popped.size() + n);
458 for (Bubble::const_iterator it = bubble.begin() + 1;
459 it != bubble.end() - 1; ++it)
467 for (Bubble::const_iterator it = bubble.begin() + 1; it != bubble.end() - 1; ++it)
460468 g_popped.push_back(it->contigIndex());
461469
462470 add_edge(u, w, max(longestPath(g, bubble), 1), g);
463471 }
464472
465473 /** Pop the specified bubble if it is simple, otherwise scaffold. */
466 static void popOrScaffoldBubble(Graph& g, const Bubble& bubble)
474 static void
475 popOrScaffoldBubble(Graph& g, const Bubble& bubble)
467476 {
468477 #pragma omp atomic
469478 g_count.bubbles++;
475484 }
476485
477486 /** Return the length of the specified vertex in k-mer. */
478 static unsigned getKmerLength(const ContigProperties& vp)
487 static unsigned
488 getKmerLength(const ContigProperties& vp)
479489 {
480490 assert(vp.length >= opt::k);
481491 return vp.length - opt::k + 1;
482492 }
483493
484494 /** Return the mean k-mer coverage of the specified vertex. */
485 static float getMeanCoverage(const ContigProperties& vp)
495 static float
496 getMeanCoverage(const ContigProperties& vp)
486497 {
487498 return (float)vp.coverage / getKmerLength(vp);
488499 }
489500
490501 /** Remove contigs with insufficient coverage. */
491 static void filterGraph(Graph& g)
502 static void
503 filterGraph(Graph& g)
492504 {
493505 typedef graph_traits<Graph> GTraits;
494506 typedef GTraits::vertex_descriptor V;
510522 }
511523 }
512524 if (opt::verbose > 0) {
513 cerr << "Removed " << removedKmer << " k-mer in "
514 << removedContigs << " contigs with mean k-mer coverage "
515 "less than " << opt::minCoverage << ".\n";
525 cerr << "Removed " << removedKmer << " k-mer in " << removedContigs
526 << " contigs with mean k-mer coverage "
527 "less than "
528 << opt::minCoverage << ".\n";
516529 printGraphStats(cerr, g);
517530 }
518531 }
519532
520533 /** Remove the specified contig from the adjacency graph. */
521 static void removeContig(Graph* g, ContigID id)
534 static void
535 removeContig(Graph* g, ContigID id)
522536 {
523537 ContigNode v(id, false);
524538 g->clear_vertex(v);
525539 g->remove_vertex(v);
526540 }
527541
528 int main(int argc, char** argv)
542 int
543 main(int argc, char** argv)
529544 {
530545 string commandLine;
531546 {
532547 ostringstream ss;
533548 char** last = argv + argc - 1;
534 copy(argv, last, ostream_iterator<const char *>(ss, " "));
549 copy(argv, last, ostream_iterator<const char*>(ss, " "));
535550 ss << *last;
536551 commandLine = ss.str();
537552 }
538553
539554 bool die = false;
540 for (int c; (c = getopt_long(argc, argv,
541 shortopts, longopts, NULL)) != -1;) {
555 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
542556 istringstream arg(optarg != NULL ? optarg : "");
543557 switch (c) {
544 case '?': die = true; break;
545 case 'a': arg >> opt::maxBranches; break;
546 case 'b': arg >> opt::maxLength; break;
547 case 'c': arg >> opt::minCoverage; break;
548 case 'g': arg >> opt::graphPath; break;
549 case 'j': arg >> opt::threads; break;
550 case 'k': arg >> opt::k; break;
551 case 'p': arg >> opt::identity; break;
552 case 'v': opt::verbose++; break;
553 case OPT_HELP:
554 cout << USAGE_MESSAGE;
555 exit(EXIT_SUCCESS);
556 case OPT_VERSION:
557 cout << VERSION_MESSAGE;
558 exit(EXIT_SUCCESS);
558 case '?':
559 die = true;
560 break;
561 case 'a':
562 arg >> opt::maxBranches;
563 break;
564 case 'b':
565 arg >> opt::maxLength;
566 break;
567 case 'c':
568 arg >> opt::minCoverage;
569 break;
570 case 'g':
571 arg >> opt::graphPath;
572 break;
573 case 'j':
574 arg >> opt::threads;
575 break;
576 case 'k':
577 arg >> opt::k;
578 break;
579 case 'p':
580 arg >> opt::identity;
581 break;
582 case 'v':
583 opt::verbose++;
584 break;
585 case OPT_HELP:
586 cout << USAGE_MESSAGE;
587 exit(EXIT_SUCCESS);
588 case OPT_VERSION:
589 cout << VERSION_MESSAGE;
590 exit(EXIT_SUCCESS);
559591 }
560592 if (optarg != NULL && !arg.eof()) {
561 cerr << PROGRAM ": invalid option: `-"
562 << (char)c << optarg << "'\n";
593 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
563594 exit(EXIT_FAILURE);
564595 }
565596 }
566597
567598 if (opt::k <= 0) {
568 cerr << PROGRAM ": " << "missing -k,--kmer option\n";
599 cerr << PROGRAM ": "
600 << "missing -k,--kmer option\n";
569601 die = true;
570602 }
571603
580612 }
581613
582614 if (die) {
583 cerr << "Try `" << PROGRAM
584 << " --help' for more information.\n";
615 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
585616 exit(EXIT_FAILURE);
586617 }
587618
625656 cout << "digraph bubbles {\n";
626657
627658 Bubbles bubbles = discoverBubbles(g);
628 for (Bubbles::const_iterator it = bubbles.begin();
629 it != bubbles.end(); ++it)
659 for (Bubbles::const_iterator it = bubbles.begin(); it != bubbles.end(); ++it)
630660 popOrScaffoldBubble(g, *it);
631661
632662 // Each bubble should be identified twice. Remove the duplicate.
633663 sort(g_popped.begin(), g_popped.end());
634 g_popped.erase(unique(g_popped.begin(), g_popped.end()),
635 g_popped.end());
664 g_popped.erase(unique(g_popped.begin(), g_popped.end()), g_popped.end());
636665
637666 if (opt::bubbleGraph) {
638667 cout << "}\n";
639668 } else {
640 for (vector<ContigID>::const_iterator it = g_popped.begin();
641 it != g_popped.end(); ++it)
669 for (vector<ContigID>::const_iterator it = g_popped.begin(); it != g_popped.end(); ++it)
642670 cout << get(g_contigNames, *it) << '\n';
643671 }
644672
645673 if (opt::verbose > 0)
646 cerr << "Bubbles: " << (g_count.bubbles + 1) / 2
647 << " Popped: " << (g_count.popped + 1) / 2
648 << " Scaffolds: " << (g_count.scaffold + 1) / 2
649 << " Complex: " << (g_count.notSimple + 1) / 2
650 << " Too long: " << (g_count.tooLong + 1) / 2
651 << " Too many: " << (g_count.tooMany + 1) / 2
652 << " Dissimilar: " << (g_count.dissimilar + 1) / 2
653 << '\n';
674 cerr << "Bubbles: " << (g_count.bubbles + 1) / 2 << " Popped: " << (g_count.popped + 1) / 2
675 << " Scaffolds: " << (g_count.scaffold + 1) / 2
676 << " Complex: " << (g_count.notSimple + 1) / 2
677 << " Too long: " << (g_count.tooLong + 1) / 2
678 << " Too many: " << (g_count.tooMany + 1) / 2
679 << " Dissimilar: " << (g_count.dissimilar + 1) / 2 << '\n';
654680
655681 if (!opt::graphPath.empty()) {
656682 // Remove the popped contigs from the adjacency graph.
657 for_each(g_popped.begin(), g_popped.end(),
658 bind1st(ptr_fun(removeContig), &g));
683 for_each(g_popped.begin(), g_popped.end(), [&g](const ContigID& c) {
684 return removeContig(&g, c);
685 });
659686
660687 // Assemble unambiguous paths.
661688 g_contigNames.unlock();
668695 assemble_stranded(g, back_inserter(paths));
669696 else
670697 assemble(g, back_inserter(paths));
671 for (ContigPaths::const_iterator it = paths.begin();
672 it != paths.end(); ++it) {
698 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
673699 ContigNode u(numContigs + it - paths.begin(), false);
674700 string name = createContigName();
675701 put(vertex_name, g, u, name);
676 cout << name << '\t'
677 << addDistance(gorig, *it) << '\n';
702 cout << name << '\t' << addDistance(gorig, *it) << '\n';
678703 }
679704 } else {
680705 if (opt::ss)
681706 assemble_stranded(g, back_inserter(paths));
682707 else
683708 assemble(g, back_inserter(paths));
684 for (ContigPaths::const_iterator it = paths.begin();
685 it != paths.end(); ++it) {
709 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
686710 ContigNode u(numContigs + it - paths.begin(), false);
687711 string name = createContigName();
688712 put(vertex_name, g, u, name);
0 [![Release](https://img.shields.io/github/release/bcgsc/abyss.svg)](https://github.com/bcgsc/abyss/releases)
1 [![Downloads](https://img.shields.io/github/downloads/bcgsc/abyss/total?logo=github)](https://github.com/bcgsc/abyss/releases/download/2.2.3/abyss-2.2.3.tar.gz)
2 [![Conda](https://img.shields.io/conda/dn/bioconda/abyss?label=Conda)](https://anaconda.org/bioconda/abyss)
3 [![Issues](https://img.shields.io/github/issues/bcgsc/abyss.svg)](https://github.com/bcgsc/abyss/issues)
4
05 ABySS
16 =====
27
0 #include "config.h"
0 #include "Common/UnorderedMap.h"
11 #include "ContigNode.h"
22 #include "ContigPath.h"
33 #include "ContigProperties.h"
4 #include "DataBase/DB.h"
5 #include "DataBase/Options.h"
46 #include "Estimate.h"
5 #include "IOUtil.h"
6 #include "Iterator.h"
7 #include "Uncompress.h"
8 #include "Common/UnorderedMap.h"
97 #include "Graph/Assemble.h"
108 #include "Graph/ContigGraph.h"
119 #include "Graph/ContigGraphAlgorithms.h"
1412 #include "Graph/GraphIO.h"
1513 #include "Graph/GraphUtil.h"
1614 #include "Graph/PopBubbles.h"
15 #include "IOUtil.h"
16 #include "Iterator.h"
17 #include "Uncompress.h"
18 #include "config.h"
1719 #include <cassert>
1820 #include <climits>
1921 #include <cmath>
2325 #include <getopt.h>
2426 #include <iostream>
2527 #include <utility>
26 #include "DataBase/Options.h"
27 #include "DataBase/DB.h"
2828
2929 using namespace std;
3030 using namespace std::rel_ops;
3636 DB db;
3737
3838 static const char VERSION_MESSAGE[] =
39 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
40 "Written by Shaun Jackman.\n"
41 "\n"
42 "Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
39 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
40 "Written by Shaun Jackman.\n"
41 "\n"
42 "Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
4343
4444 static const char USAGE_MESSAGE[] =
45 "Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA|OVERLAP DIST...\n"
46 "Scaffold contigs using the distance estimate graph.\n"
47 "\n"
48 " Arguments:\n"
49 "\n"
50 " FASTA contigs in FASTA format\n"
51 " OVERLAP the contig overlap graph\n"
52 " DIST estimates of the distance between contigs\n"
53 "\n"
54 " Options:\n"
55 "\n"
56 " -n, --npairs=N minimum number of pairs [0]\n"
57 " or -n A-B:S Find the value of n in [A,B] with step size S\n"
58 " that maximizes the scaffold N50.\n"
59 " Default value for the step size is 1, if unspecified.\n"
60 " -s, --seed-length=N minimum contig length [1000]\n"
61 " or -s A-B Find the value of s in [A,B]\n"
62 " that maximizes the scaffold N50.\n"
63 " --grid optimize using a grid search [default]\n"
64 " --line optimize using a line search\n"
65 " -k, --kmer=N length of a k-mer\n"
66 " -G, --genome-size=N expected genome size. Used to calculate NG50\n"
67 " and associated stats [disabled]\n"
68 " --min-gap=N minimum scaffold gap length to output [50]\n"
69 " --max-gap=N maximum scaffold gap length to output [inf]\n"
70 " --complex remove complex transitive edges\n"
71 " --no-complex don't remove complex transitive edges [default]\n"
72 " --SS expect contigs to be oriented correctly\n"
73 " --no-SS no assumption about contig orientation [default]\n"
74 " -o, --out=FILE write the paths to FILE\n"
75 " -g, --graph=FILE write the graph to FILE\n"
76 " -v, --verbose display verbose output\n"
77 " --help display this help and exit\n"
78 " --version output version information and exit\n"
79 " --db=FILE specify path of database repository in FILE\n"
80 " --library=NAME specify library NAME for sqlite\n"
81 " --strain=NAME specify strain NAME for sqlite\n"
82 " --species=NAME specify species NAME for sqlite\n"
83 "\n"
84 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
45 "Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA|OVERLAP DIST...\n"
46 "Scaffold contigs using the distance estimate graph.\n"
47 "\n"
48 " Arguments:\n"
49 "\n"
50 " FASTA contigs in FASTA format\n"
51 " OVERLAP the contig overlap graph\n"
52 " DIST estimates of the distance between contigs\n"
53 "\n"
54 " Options:\n"
55 "\n"
56 " -n, --npairs=N minimum number of pairs [0]\n"
57 " or -n A-B:S Find the value of n in [A,B] with step size S\n"
58 " that maximizes the scaffold N50.\n"
59 " Default value for the step size is 1, if unspecified.\n"
60 " -s, --seed-length=N minimum contig length [1000]\n"
61 " or -s A-B Find the value of s in [A,B]\n"
62 " that maximizes the scaffold N50.\n"
63 " --grid optimize using a grid search [default]\n"
64 " --line optimize using a line search\n"
65 " -k, --kmer=N length of a k-mer\n"
66 " -G, --genome-size=N expected genome size. Used to calculate NG50\n"
67 " and associated stats [disabled]\n"
68 " --min-gap=N minimum scaffold gap length to output [50]\n"
69 " --max-gap=N maximum scaffold gap length to output [inf]\n"
70 " --complex remove complex transitive edges\n"
71 " --no-complex don't remove complex transitive edges [default]\n"
72 " --SS expect contigs to be oriented correctly\n"
73 " --no-SS no assumption about contig orientation [default]\n"
74 " -o, --out=FILE write the paths to FILE\n"
75 " -g, --graph=FILE write the graph to FILE\n"
76 " -v, --verbose display verbose output\n"
77 " --help display this help and exit\n"
78 " --version output version information and exit\n"
79 " --db=FILE specify path of database repository in FILE\n"
80 " --library=NAME specify library NAME for sqlite\n"
81 " --strain=NAME specify strain NAME for sqlite\n"
82 " --species=NAME specify species NAME for sqlite\n"
83 "\n"
84 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
8585
8686 namespace opt {
87 string db;
88 dbVars metaVars;
89
90 unsigned k; // used by ContigProperties
91
92 /** Optimization search strategy. */
93 static int searchStrategy;
94
95 /** Minimum number of pairs. */
96 static unsigned minEdgeWeight;
97 static unsigned minEdgeWeightEnd;
98 static unsigned minEdgeWeightStep;
99
100 /** Minimum contig length. */
101 static unsigned minContigLength = 1000;
102 static unsigned minContigLengthEnd = 1000;
103
104 /** Genome size. Used to calculate NG50. */
105 static long long unsigned genomeSize;
106
107 /** Minimum scaffold gap length to output. */
108 static int minGap = 50;
109
110 /** Maximum scaffold gap length to output.
111 * -ve value means no maximum. */
112 static int maxGap = -1;
113
114 /** Write the paths to this file. */
115 static string out;
116
117 /** Write the graph to this file. */
118 static string graphPath;
119
120 /** Run a strand-specific RNA-Seq assembly. */
121 static int ss;
122
123 /** Verbose output. */
124 int verbose; // used by PopBubbles
125
126 /** Output format */
127 int format = DOT; // used by DistanceEst
128
129 /** Remove complex transitive edges */
130 static int comp_trans;
87 string db;
88 dbVars metaVars;
89
90 unsigned k; // used by ContigProperties
91
92 /** Optimization search strategy. */
93 static int searchStrategy;
94
95 /** Minimum number of pairs. */
96 static unsigned minEdgeWeight;
97 static unsigned minEdgeWeightEnd;
98 static unsigned minEdgeWeightStep;
99
100 /** Minimum contig length. */
101 static unsigned minContigLength = 1000;
102 static unsigned minContigLengthEnd = 1000;
103
104 /** Genome size. Used to calculate NG50. */
105 static long long unsigned genomeSize;
106
107 /** Minimum scaffold gap length to output. */
108 static int minGap = 50;
109
110 /** Maximum scaffold gap length to output.
111 * -ve value means no maximum. */
112 static int maxGap = -1;
113
114 /** Write the paths to this file. */
115 static string out;
116
117 /** Write the graph to this file. */
118 static string graphPath;
119
120 /** Run a strand-specific RNA-Seq assembly. */
121 static int ss;
122
123 /** Verbose output. */
124 int verbose; // used by PopBubbles
125
126 /** Output format */
127 int format = DOT; // used by DistanceEst
128
129 /** Remove complex transitive edges */
130 static int comp_trans;
131131 }
132132
133133 static const char shortopts[] = "G:g:k:n:o:s:v";
134134
135 enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP,
136 OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
135 enum
136 {
137 OPT_HELP = 1,
138 OPT_VERSION,
139 OPT_MIN_GAP,
140 OPT_MAX_GAP,
141 OPT_COMP,
142 OPT_DB,
143 OPT_LIBRARY,
144 OPT_STRAIN,
145 OPT_SPECIES
146 };
137147
138148 /** Optimization search strategy. */
139 enum { GRID_SEARCH, LINE_SEARCH };
149 enum
150 {
151 GRID_SEARCH,
152 LINE_SEARCH
153 };
140154
141155 static const struct option longopts[] = {
142 { "graph", no_argument, NULL, 'g' },
143 { "kmer", required_argument, NULL, 'k' },
156 { "graph", no_argument, NULL, 'g' },
157 { "kmer", required_argument, NULL, 'k' },
144158 { "genome-size", required_argument, NULL, 'G' },
145 { "min-gap", required_argument, NULL, OPT_MIN_GAP },
146 { "max-gap", required_argument, NULL, OPT_MAX_GAP },
147 { "npairs", required_argument, NULL, 'n' },
148 { "grid", no_argument, &opt::searchStrategy, GRID_SEARCH },
149 { "line", no_argument, &opt::searchStrategy, LINE_SEARCH },
150 { "out", required_argument, NULL, 'o' },
159 { "min-gap", required_argument, NULL, OPT_MIN_GAP },
160 { "max-gap", required_argument, NULL, OPT_MAX_GAP },
161 { "npairs", required_argument, NULL, 'n' },
162 { "grid", no_argument, &opt::searchStrategy, GRID_SEARCH },
163 { "line", no_argument, &opt::searchStrategy, LINE_SEARCH },
164 { "out", required_argument, NULL, 'o' },
151165 { "seed-length", required_argument, NULL, 's' },
152 { "complex", no_argument, &opt::comp_trans, 1 },
153 { "no-complex", no_argument, &opt::comp_trans, 0 },
154 { "SS", no_argument, &opt::ss, 1 },
155 { "no-SS", no_argument, &opt::ss, 0 },
156 { "verbose", no_argument, NULL, 'v' },
157 { "help", no_argument, NULL, OPT_HELP },
158 { "version", no_argument, NULL, OPT_VERSION },
159 { "db", required_argument, NULL, OPT_DB },
160 { "library", required_argument, NULL, OPT_LIBRARY },
161 { "strain", required_argument, NULL, OPT_STRAIN },
162 { "species", required_argument, NULL, OPT_SPECIES },
166 { "complex", no_argument, &opt::comp_trans, 1 },
167 { "no-complex", no_argument, &opt::comp_trans, 0 },
168 { "SS", no_argument, &opt::ss, 1 },
169 { "no-SS", no_argument, &opt::ss, 0 },
170 { "verbose", no_argument, NULL, 'v' },
171 { "help", no_argument, NULL, OPT_HELP },
172 { "version", no_argument, NULL, OPT_VERSION },
173 { "db", required_argument, NULL, OPT_DB },
174 { "library", required_argument, NULL, OPT_LIBRARY },
175 { "strain", required_argument, NULL, OPT_STRAIN },
176 { "species", required_argument, NULL, OPT_SPECIES },
163177 { NULL, 0, NULL, 0 }
164178 };
165179
171185 * An edge is invalid when the overlap is larger than the length of
172186 * either of its incident sequences.
173187 */
174 struct InvalidEdge {
175 InvalidEdge(Graph& g) : m_g(g) { }
188 struct InvalidEdge
189 {
190 InvalidEdge(Graph& g)
191 : m_g(g)
192 {}
176193 bool operator()(graph_traits<Graph>::edge_descriptor e) const
177194 {
178195 int d = m_g[e].distance;
184201 };
185202
186203 /** Return whether the specified edges has sufficient support. */
187 struct PoorSupport {
188 PoorSupport(Graph& g, unsigned minEdgeWeight) : m_g(g), m_minEdgeWeight(minEdgeWeight) { }
204 struct PoorSupport
205 {
206 PoorSupport(Graph& g, unsigned minEdgeWeight)
207 : m_g(g)
208 , m_minEdgeWeight(minEdgeWeight)
209 {}
189210 bool operator()(graph_traits<Graph>::edge_descriptor e) const
190211 {
191212 return m_g[e].numPairs < m_minEdgeWeight;
195216 };
196217
197218 /** Remove short vertices and unsupported edges from the graph. */
198 static void filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength)
219 static void
220 filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength)
199221 {
200222 typedef graph_traits<Graph> GTraits;
201223 typedef GTraits::vertex_descriptor V;
229251 }
230252
231253 /** Return true if the specified edge is a cycle. */
232 static bool isCycle(Graph& g, graph_traits<Graph>::edge_descriptor e)
254 static bool
255 isCycle(Graph& g, graph_traits<Graph>::edge_descriptor e)
233256 {
234257 return edge(target(e, g), source(e, g), g).second;
235258 }
236259
237260 /** Remove simple cycles of length two from the graph. */
238 static void removeCycles(Graph& g)
261 static void
262 removeCycles(Graph& g)
239263 {
240264 typedef graph_traits<Graph>::edge_descriptor E;
241265 typedef graph_traits<Graph>::edge_iterator Eit;
264288 * For a pair of edges (u,v1) and (u,v2) in g, if exactly one of the
265289 * edges (v1,v2) or (v2,v1) exists in g0, add that edge to g.
266290 */
267 static void resolveForks(Graph& g, const Graph& g0)
291 static void
292 resolveForks(Graph& g, const Graph& g0)
268293 {
269294 typedef graph_traits<Graph>::adjacency_iterator Vit;
270295 typedef graph_traits<Graph>::edge_descriptor E;
292317 pair<E, bool> e21 = edge(v2, v1, g0);
293318 if (e12.second && e21.second) {
294319 if (opt::verbose > 1)
295 cerr << "cycle: " << get(vertex_name, g, v1)
296 << ' ' << get(vertex_name, g, v2) << '\n';
320 cerr << "cycle: " << get(vertex_name, g, v1) << ' '
321 << get(vertex_name, g, v2) << '\n';
297322 } else if (e12.second || e21.second) {
298323 E e = e12.second ? e12.first : e21.first;
299324 V v = source(e, g0), w = target(e, g0);
300325 add_edge(v, w, g0[e], g);
301326 numEdges++;
302327 if (opt::verbose > 1)
303 cerr << get(vertex_name, g, u)
304 << " -> " << get(vertex_name, g, v)
305 << " -> " << get(vertex_name, g, w)
306 << " [" << g0[e] << "]\n";
328 cerr << get(vertex_name, g, u) << " -> " << get(vertex_name, g, v) << " -> "
329 << get(vertex_name, g, w) << " [" << g0[e] << "]\n";
307330 }
308331 }
309332 }
310333 }
311334 if (opt::verbose > 0)
312 cerr << "Added " << numEdges
313 << " edges to ambiguous vertices.\n";
335 cerr << "Added " << numEdges << " edges to ambiguous vertices.\n";
314336 if (!opt::db.empty())
315337 addToDb(db, "E_added_ambig", numEdges);
316338 }
319341 * For an edge (u,v), remove the vertex v if deg+(u) > 1
320342 * and deg-(v) = 1 and deg+(v) = 0.
321343 */
322 static void pruneTips(Graph& g)
344 static void
345 pruneTips(Graph& g)
323346 {
324347 /** Identify the tips. */
325348 size_t n = 0;
339362 * operation: remove vertex u
340363 * output: digraph g { t1->v1 t2->v2 }
341364 */
342 static void removeRepeats(Graph& g)
365 static void
366 removeRepeats(Graph& g)
343367 {
344368 typedef graph_traits<Graph>::adjacency_iterator Ait;
345369 typedef graph_traits<Graph>::edge_descriptor E;
348372 vector<V> repeats;
349373 vector<E> transitive;
350374 find_transitive_edges(g, back_inserter(transitive));
351 for (vector<E>::const_iterator it = transitive.begin();
352 it != transitive.end(); ++it) {
375 for (vector<E>::const_iterator it = transitive.begin(); it != transitive.end(); ++it) {
353376 // Iterate through the transitive edges, u->w1.
354377 V u = source(*it, g), w1 = target(*it, g);
355378 Ait vit, vlast;
356 for (tie(vit, vlast) = adjacent_vertices(u, g);
357 vit != vlast; ++vit) {
379 for (tie(vit, vlast) = adjacent_vertices(u, g); vit != vlast; ++vit) {
358380 V v = *vit;
359381 assert(u != v); // no self loops
360382 if (!edge(v, w1, g).second)
361383 continue;
362384 // u->w1 is a transitive edge spanning u->v->w1.
363385 Ait wit, wlast;
364 for (tie(wit, wlast) = adjacent_vertices(v, g);
365 wit != wlast; ++wit) {
386 for (tie(wit, wlast) = adjacent_vertices(v, g); wit != wlast; ++wit) {
366387 // For each edge v->w2, check that an edge
367388 // w1->w2 or w2->w1 exists. If not, v is a repeat.
368389 V w2 = *wit;
369390 assert(v != w2); // no self loops
370 if (w1 != w2
371 && !edge(w1, w2, g).second
372 && !edge(w2, w1, g).second) {
391 if (w1 != w2 && !edge(w1, w2, g).second && !edge(w2, w1, g).second) {
373392 repeats.push_back(v);
374393 break;
375394 }
378397 }
379398
380399 sort(repeats.begin(), repeats.end());
381 repeats.erase(unique(repeats.begin(), repeats.end()),
382 repeats.end());
400 repeats.erase(unique(repeats.begin(), repeats.end()), repeats.end());
383401 if (opt::verbose > 1) {
384402 cerr << "Ambiguous:";
385 for (vector<V>::const_iterator it = repeats.begin();
386 it != repeats.end(); ++it)
403 for (vector<V>::const_iterator it = repeats.begin(); it != repeats.end(); ++it)
387404 cerr << ' ' << get(vertex_name, g, *it);
388405 cerr << '\n';
389406 }
390407
391408 // Remove the repetitive vertices.
392409 unsigned numRemoved = 0;
393 for (vector<V>::const_iterator it = repeats.begin();
394 it != repeats.end(); ++it) {
410 for (vector<V>::const_iterator it = repeats.begin(); it != repeats.end(); ++it) {
395411 V u = *it;
396412 V uc = get(vertex_complement, g, u);
397413 clear_out_edges(u, g);
402418 }
403419
404420 if (opt::verbose > 0) {
405 cerr << "Cleared "
406 << repeats.size() << " ambiguous vertices.\n"
407 << "Removed "
408 << numRemoved << " ambiguous vertices.\n";
421 cerr << "Cleared " << repeats.size() << " ambiguous vertices.\n"
422 << "Removed " << numRemoved << " ambiguous vertices.\n";
409423 printGraphStats(cerr, g);
410424 }
411425 if (!opt::db.empty()) {
420434 * operation: remove edge u1->v2
421435 * output: digraph g {u1->v1 u2->v2 }
422436 */
423 static void removeWeakEdges(Graph& g)
437 static void
438 removeWeakEdges(Graph& g)
424439 {
425440 typedef graph_traits<Graph>::edge_descriptor E;
426441 typedef graph_traits<Graph>::edge_iterator Eit;
477492
478493 if (opt::verbose > 1) {
479494 cerr << "Weak edges:\n";
480 for (vector<E>::const_iterator it = weak.begin();
481 it != weak.end(); ++it) {
495 for (vector<E>::const_iterator it = weak.begin(); it != weak.end(); ++it) {
482496 E e = *it;
483 cerr << '\t' << get(edge_name, g, e)
484 << " [" << g[e] << "]\n";
497 cerr << '\t' << get(edge_name, g, e) << " [" << g[e] << "]\n";
485498 }
486499 }
487500
495508 addToDb(db, "E_removed_weak", weak.size());
496509 }
497510
498 static void removeLongEdges(Graph& g)
511 static void
512 removeLongEdges(Graph& g)
499513 {
500514 typedef graph_traits<Graph>::edge_descriptor E;
501515 typedef graph_traits<Graph>::edge_iterator Eit;
513527 /** Return whether the specified distance estimate is an exact
514528 * overlap.
515529 */
516 static bool isOverlap(const DistanceEst& d)
530 static bool
531 isOverlap(const DistanceEst& d)
517532 {
518533 if (d.stdDev == 0) {
519534 assert(d.distance < 0);
526541 * @param g0 the original graph
527542 * @param g1 the transformed graph
528543 */
529 static ContigPath addDistEst(const Graph& g0, const Graph& g1,
530 const ContigPath& path)
544 static ContigPath
545 addDistEst(const Graph& g0, const Graph& g1, const ContigPath& path)
531546 {
532547 typedef graph_traits<Graph>::edge_descriptor E;
533548 typedef edge_bundle_type<Graph>::type EP;
536551 out.reserve(2 * path.size());
537552 ContigNode u = path.front();
538553 out.push_back(u);
539 for (ContigPath::const_iterator it = path.begin() + 1;
540 it != path.end(); ++it) {
554 for (ContigPath::const_iterator it = path.begin() + 1; it != path.end(); ++it) {
541555 ContigNode v = *it;
542556 assert(!v.ambiguous());
543557 pair<E, bool> e0 = edge(u, v, g0);
544558 pair<E, bool> e1 = edge(u, v, g1);
545559 if (!e0.second && !e1.second)
546 std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> " << get(vertex_name, g0, v) << '\n';
560 std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> "
561 << get(vertex_name, g0, v) << '\n';
547562 assert(e0.second || e1.second);
548563 const EP& ep = e0.second ? g0[e0.first] : g1[e1.first];
549564 if (!isOverlap(ep)) {
560575 }
561576
562577 /** Read a graph from the specified file. */
563 static void readGraph(const string& path, Graph& g)
578 static void
579 readGraph(const string& path, Graph& g)
564580 {
565581 if (opt::verbose > 0)
566582 cerr << "Reading `" << path << "'...\n";
573589 printGraphStats(cerr, g);
574590
575591 vector<int> vals = passGraphStatsVal(g);
576 vector<string> keys = make_vector<string>()
577 << "V_readGraph"
578 << "E_readGraph"
579 << "degree0_readGraph"
580 << "degree1_readGraph"
581 << "degree234_readGraph"
582 << "degree5_readGraph"
583 << "max_readGraph";
592 vector<string> keys = make_vector<string>() << "V_readGraph"
593 << "E_readGraph"
594 << "degree0_readGraph"
595 << "degree1_readGraph"
596 << "degree234_readGraph"
597 << "degree5_readGraph"
598 << "max_readGraph";
584599
585600 if (!opt::db.empty()) {
586 for(unsigned i=0; i<vals.size(); i++)
601 for (unsigned i = 0; i < vals.size(); i++)
587602 addToDb(db, keys[i], vals[i]);
588603 }
589604 g_contigNames.lock();
591606
592607 /** Return the scaffold length of [first, last), not counting gaps. */
593608 template<typename It>
594 unsigned addLength(const Graph& g, It first, It last)
609 unsigned
610 addLength(const Graph& g, It first, It last)
595611 {
596612 typedef typename graph_traits<Graph>::vertex_descriptor V;
597613 assert(first != last);
609625 typedef vector<ContigPath> ContigPaths;
610626
611627 /**
612 * Build the scaffold length histogram.
613 * @param g The graph g is destroyed.
614 */
615 static Histogram buildScaffoldLengthHistogram(
616 Graph& g, const ContigPaths& paths)
628 * Build the scaffold length histogram.
629 * @param g The graph g is destroyed.
630 */
631 static Histogram
632 buildScaffoldLengthHistogram(Graph& g, const ContigPaths& paths)
617633 {
618634 Histogram h;
619635
625641
626642 // Remove the vertices that are used in paths
627643 // and add the lengths of the scaffolds.
628 for (ContigPaths::const_iterator it = paths.begin();
629 it != paths.end(); ++it) {
644 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
630645 h.insert(addLength(g, it->begin(), it->end()));
631 remove_vertex_if(g, it->begin(), it->end(),
632 not1(std::mem_fun_ref(&ContigNode::ambiguous)));
646 remove_vertex_if(
647 g, it->begin(), it->end(), [](const ContigNode& c) { return !c.ambiguous(); });
633648 }
634649
635650 // Add the contigs that were not used in paths.
644659 }
645660
646661 /** Add contiguity stats to database */
647 static void addCntgStatsToDb(
648 const Histogram h, const unsigned min)
662 static void
663 addCntgStatsToDb(const Histogram h, const unsigned min)
649664 {
650665 vector<int> vals = passContiguityStatsVal(h, min);
651 vector<string> keys = make_vector<string>()
652 << "n"
653 << "n200"
654 << "nN50"
655 << "min"
656 << "N75"
657 << "N50"
658 << "N25"
659 << "Esize"
660 << "max"
661 << "sum"
662 << "nNG50"
663 << "NG50";
666 vector<string> keys = make_vector<string>() << "n"
667 << "n200"
668 << "nN50"
669 << "min"
670 << "N75"
671 << "N50"
672 << "N25"
673 << "Esize"
674 << "max"
675 << "sum"
676 << "nNG50"
677 << "NG50";
664678 if (!opt::db.empty()) {
665 for(unsigned i=0; i<vals.size(); i++)
679 for (unsigned i = 0; i < vals.size(); i++)
666680 addToDb(db, keys[i], vals[i]);
667681 }
668682 }
669683
670684 /** Parameters of scaffolding. */
671 struct ScaffoldParam {
672 ScaffoldParam(unsigned n, unsigned s) : n(n), s(s) { }
685 struct ScaffoldParam
686 {
687 ScaffoldParam(unsigned n, unsigned s)
688 : n(n)
689 , s(s)
690 {}
673691 bool operator==(const ScaffoldParam& o) const { return n == o.n && s == o.s; }
674692 unsigned n;
675693 unsigned s;
676694 };
677695
678696 NAMESPACE_STD_HASH_BEGIN
679 template <> struct hash<ScaffoldParam> {
680 size_t operator()(const ScaffoldParam& param) const
681 {
682 return hash<unsigned>()(param.n) ^ hash<unsigned>()(param.s);
683 }
684 };
697 template<>
698 struct hash<ScaffoldParam>
699 {
700 size_t operator()(const ScaffoldParam& param) const
701 {
702 return hash<unsigned>()(param.n) ^ hash<unsigned>()(param.s);
703 }
704 };
685705 NAMESPACE_STD_HASH_END
686706
687707 /** Result of scaffolding. */
688 struct ScaffoldResult : ScaffoldParam{
689 ScaffoldResult() : ScaffoldParam(0, 0), n50(0) { }
708 struct ScaffoldResult : ScaffoldParam
709 {
710 ScaffoldResult()
711 : ScaffoldParam(0, 0)
712 , n50(0)
713 {}
690714 ScaffoldResult(unsigned n, unsigned s, unsigned n50, std::string metrics)
691 : ScaffoldParam(n, s), n50(n50), metrics(metrics) { }
715 : ScaffoldParam(n, s)
716 , n50(n50)
717 , metrics(metrics)
718 {}
692719 unsigned n50;
693720 std::string metrics;
694721 };
698725 * @return the scaffold N50
699726 */
700727 ScaffoldResult
701 scaffold(const Graph& g0,
702 unsigned minEdgeWeight, unsigned minContigLength,
703 bool output)
728 scaffold(const Graph& g0, unsigned minEdgeWeight, unsigned minContigLength, bool output)
704729 {
705730 Graph g(g0);
706731
743768 typedef graph_traits<Graph>::vertex_descriptor V;
744769 vector<V> popped = popBubbles(g);
745770 if (opt::verbose > 0) {
746 cerr << "Removed " << popped.size()
747 << " vertices in bubbles.\n";
771 cerr << "Removed " << popped.size() << " vertices in bubbles.\n";
748772 printGraphStats(cerr, g);
749773 }
750774
753777
754778 if (opt::verbose > 1) {
755779 cerr << "Popped:";
756 for (vector<V>::const_iterator it = popped.begin();
757 it != popped.end(); ++it)
780 for (vector<V>::const_iterator it = popped.begin(); it != popped.end(); ++it)
758781 cerr << ' ' << get(vertex_name, g, *it);
759782 cerr << '\n';
760783 }
772795 sort(paths.begin(), paths.end());
773796 unsigned n = 0;
774797 if (opt::verbose > 0) {
775 for (ContigPaths::const_iterator it = paths.begin();
776 it != paths.end(); ++it)
798 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it)
777799 n += it->size();
778 cerr << "Assembled " << n << " contigs in "
779 << paths.size() << " scaffolds.\n";
800 cerr << "Assembled " << n << " contigs in " << paths.size() << " scaffolds.\n";
780801 printGraphStats(cerr, g);
781802 }
782803
791812 ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
792813 assert_good(out, opt::out);
793814 g_contigNames.unlock();
794 for (vector<ContigPath>::const_iterator it = paths.begin();
795 it != paths.end(); ++it)
796 out << createContigName() << '\t'
797 << addDistEst(g0, g, *it) << '\n';
815 for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it)
816 out << createContigName() << '\t' << addDistEst(g0, g, *it) << '\n';
798817 assert_good(out, opt::out);
799818
800819 // Output the graph.
810829 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
811830 std::ostringstream ss;
812831 Histogram scaffold_histogram = buildScaffoldLengthHistogram(g, paths);
813 printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH,
814 false, "\t", opt::genomeSize)
815 << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n';
832 printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH, false, "\t", opt::genomeSize)
833 << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n';
816834 std::string metrics = ss.str();
817835 addCntgStatsToDb(scaffold_histogram, STATS_MIN_LENGTH);
818836
819 return ScaffoldResult(minEdgeWeight, minContigLength,
820 scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(),
821 metrics);
837 return ScaffoldResult(
838 minEdgeWeight,
839 minContigLength,
840 scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(),
841 metrics);
822842 }
823843
824844 /** Memoize the optimization results so far. */
857877
858878 /** Find the value of n that maximizes the scaffold N50. */
859879 static ScaffoldResult
860 optimize_n(const Graph& g,
861 std::pair<unsigned, unsigned> minEdgeWeight,
862 unsigned minContigLength,
863 ScaffoldMemo& memo)
880 optimize_n(
881 const Graph& g,
882 std::pair<unsigned, unsigned> minEdgeWeight,
883 unsigned minContigLength,
884 ScaffoldMemo& memo)
864885 {
865886 std::string metrics_table;
866887 unsigned bestn = 0, bestN50 = 0;
878899
879900 /** Find the value of s that maximizes the scaffold N50. */
880901 static ScaffoldResult
881 optimize_s(const Graph& g,
882 unsigned minEdgeWeight,
883 std::pair<unsigned, unsigned> minContigLength,
884 ScaffoldMemo& memo)
902 optimize_s(
903 const Graph& g,
904 unsigned minEdgeWeight,
905 std::pair<unsigned, unsigned> minContigLength,
906 ScaffoldMemo& memo)
885907 {
886908 std::string metrics_table;
887909 unsigned bests = 0, bestN50 = 0;
888910 const double STEP = cbrt(10); // Three steps per decade.
889 unsigned ilast = (unsigned)round(
890 log(minContigLength.second) / log(STEP));
891 for (unsigned i = (unsigned)round(
892 log(minContigLength.first) / log(STEP));
893 i <= ilast; ++i) {
911 unsigned ilast = (unsigned)round(log(minContigLength.second) / log(STEP));
912 for (unsigned i = (unsigned)round(log(minContigLength.first) / log(STEP)); i <= ilast; ++i) {
894913 unsigned s = (unsigned)pow(STEP, (int)i);
895914
896915 // Round to 1 figure.
910929
911930 /** Find the values of n and s that maximize the scaffold N50. */
912931 static ScaffoldResult
913 optimize_grid_search(const Graph& g,
914 std::pair<unsigned, unsigned> minEdgeWeight,
915 std::pair<unsigned, unsigned> minContigLength)
932 optimize_grid_search(
933 const Graph& g,
934 std::pair<unsigned, unsigned> minEdgeWeight,
935 std::pair<unsigned, unsigned> minContigLength)
916936 {
917937 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
918938 if (opt::verbose == 0)
934954
935955 /** Find the values of n and s that maximize the scaffold N50. */
936956 static ScaffoldResult
937 optimize_line_search(const Graph& g,
938 std::pair<unsigned, unsigned> minEdgeWeight,
939 std::pair<unsigned, unsigned> minContigLength)
957 optimize_line_search(
958 const Graph& g,
959 std::pair<unsigned, unsigned> minEdgeWeight,
960 std::pair<unsigned, unsigned> minContigLength)
940961 {
941962 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
942963 if (opt::verbose == 0)
945966 ScaffoldMemo memo;
946967 std::string metrics_table;
947968 ScaffoldResult best(
948 (minEdgeWeight.first + minEdgeWeight.second) / 2,
949 minContigLength.second, 0, "");
969 (minEdgeWeight.first + minEdgeWeight.second) / 2, minContigLength.second, 0, "");
950970 // An upper limit on the number of iterations.
951 const unsigned MAX_ITERATIONS = 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep;
971 const unsigned MAX_ITERATIONS =
972 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep;
952973 for (unsigned i = 0; i < MAX_ITERATIONS; ++i) {
953974 // Optimize s.
954975 if (opt::verbose > 0) {
9801001 }
9811002
9821003 /** Run abyss-scaffold. */
983 int main(int argc, char** argv)
1004 int
1005 main(int argc, char** argv)
9841006 {
9851007 if (!opt::db.empty())
9861008 opt::metaVars.resize(3);
9871009
9881010 bool die = false;
989 for (int c; (c = getopt_long(argc, argv,
990 shortopts, longopts, NULL)) != -1;) {
1011 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
9911012 istringstream arg(optarg != NULL ? optarg : "");
9921013 switch (c) {
993 case '?':
1014 case '?':
9941015 die = true;
9951016 break;
996 case 'k':
1017 case 'k':
9971018 arg >> opt::k;
9981019 break;
999 case 'G':
1000 {
1001 double x;
1002 arg >> x;
1003 opt::genomeSize = x;
1004 break;
1005 }
1006 case 'g':
1020 case 'G': {
1021 double x;
1022 arg >> x;
1023 opt::genomeSize = x;
1024 break;
1025 }
1026 case 'g':
10071027 arg >> opt::graphPath;
10081028 break;
1009 case 'n':
1029 case 'n':
10101030 arg >> opt::minEdgeWeight;
10111031 if (arg.peek() == '-') {
10121032 arg >> expect("-") >> opt::minEdgeWeightEnd;
10181038 else
10191039 opt::minEdgeWeightStep = 1;
10201040 break;
1021 case 'o':
1041 case 'o':
10221042 arg >> opt::out;
10231043 break;
1024 case 's':
1044 case 's':
10251045 arg >> opt::minContigLength;
10261046 if (arg.peek() == '-') {
10271047 opt::minContigLengthEnd = 100 * opt::minContigLength;
10281048 arg >> expect("-") >> opt::minContigLengthEnd;
1029 assert(opt::minContigLength
1030 <= opt::minContigLengthEnd);
1049 assert(opt::minContigLength <= opt::minContigLengthEnd);
10311050 } else
10321051 opt::minContigLengthEnd = opt::minContigLength;
10331052 break;
1034 case 'v':
1053 case 'v':
10351054 opt::verbose++;
10361055 break;
1037 case OPT_MIN_GAP:
1056 case OPT_MIN_GAP:
10381057 arg >> opt::minGap;
10391058 break;
1040 case OPT_MAX_GAP:
1059 case OPT_MAX_GAP:
10411060 arg >> opt::maxGap;
10421061 break;
1043 case OPT_HELP:
1062 case OPT_HELP:
10441063 cout << USAGE_MESSAGE;
10451064 exit(EXIT_SUCCESS);
1046 case OPT_VERSION:
1065 case OPT_VERSION:
10471066 cout << VERSION_MESSAGE;
10481067 exit(EXIT_SUCCESS);
1049 case OPT_DB:
1068 case OPT_DB:
10501069 arg >> opt::db;
10511070 break;
1052 case OPT_LIBRARY:
1071 case OPT_LIBRARY:
10531072 arg >> opt::metaVars[0];
10541073 break;
1055 case OPT_STRAIN:
1074 case OPT_STRAIN:
10561075 arg >> opt::metaVars[1];
10571076 break;
1058 case OPT_SPECIES:
1077 case OPT_SPECIES:
10591078 arg >> opt::metaVars[2];
10601079 break;
10611080 }
10621081 if (optarg != NULL && !arg.eof()) {
1063 cerr << PROGRAM ": invalid option: `-"
1064 << (char)c << optarg << "'\n";
1082 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
10651083 exit(EXIT_FAILURE);
10661084 }
10671085 }
10681086
10691087 if (opt::k <= 0) {
1070 cerr << PROGRAM ": " << "missing -k,--kmer option\n";
1088 cerr << PROGRAM ": "
1089 << "missing -k,--kmer option\n";
10711090 die = true;
10721091 }
10731092
10771096 }
10781097
10791098 if (die) {
1080 cerr << "Try `" << PROGRAM
1081 << " --help' for more information.\n";
1099 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
10821100 exit(EXIT_FAILURE);
10831101 }
10841102 if (!opt::db.empty()) {
1085 init(db,
1086 opt::db,
1087 opt::verbose,
1088 PROGRAM,
1089 opt::getCommand(argc, argv),
1090 opt::metaVars);
1103 init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
10911104 addToDb(db, "K", opt::k);
10921105 }
10931106
11131126 remove_edge_if(InvalidEdge(g), static_cast<DG&>(g));
11141127 unsigned numRemoved = numBefore - num_edges(g);
11151128 if (numRemoved > 0)
1116 cerr << "warning: Removed "
1117 << numRemoved << " invalid edges.\n";
1129 cerr << "warning: Removed " << numRemoved << " invalid edges.\n";
11181130
11191131 if (!opt::db.empty())
11201132 addToDb(db, "Edges_invalid", numRemoved);
11211133
11221134 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
1123 if (opt::minEdgeWeight == opt::minEdgeWeightEnd
1124 && opt::minContigLength == opt::minContigLengthEnd) {
1135 if (opt::minEdgeWeight == opt::minEdgeWeightEnd &&
1136 opt::minContigLength == opt::minContigLengthEnd) {
11251137 ScaffoldResult result = scaffold(g, opt::minEdgeWeight, opt::minContigLength, true);
11261138 // Print assembly contiguity statistics.
11271139 if (opt::verbose > 0)
11311143 } else {
11321144 ScaffoldResult best(0, 0, 0, "");
11331145 switch (opt::searchStrategy) {
1134 case GRID_SEARCH:
1135 best = optimize_grid_search(g,
1136 std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
1137 std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
1138 break;
1139 case LINE_SEARCH:
1140 best = optimize_line_search(g,
1141 std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
1142 std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
1143 break;
1144 default:
1145 abort();
1146 break;
1146 case GRID_SEARCH:
1147 best = optimize_grid_search(
1148 g,
1149 std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
1150 std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
1151 break;
1152 case LINE_SEARCH:
1153 best = optimize_line_search(
1154 g,
1155 std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
1156 std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
1157 break;
1158 default:
1159 abort();
1160 break;
11471161 }
11481162
11491163 if (opt::verbose > 0)
11571171 std::cerr << best.metrics;
11581172 }
11591173
1160 std::cerr << '\n' << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s << ".\n";
1174 std::cerr << '\n'
1175 << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s
1176 << ".\n";
11611177
11621178 // Print assembly contiguity statistics.
11631179 std::cerr << '\n';
2323 Description
2424 ===========
2525
26 Sealer is an application of Konnector that closes intra-scaffold gaps. It performs three sequential functions. First, regions with Ns are identified from an input scaffold. Flanking nucleotues (2 x 100bp) are extracted from those regions while respecting the strand (5' to 3') direction on the sequence immediately downstream of each gap. In the second step, flanking sequence pairs are used as input to Konnector along with a set of reads with a high level of coverage redundancy. Ideally, the reads should represent the original dataset from which the draft assembly is generated, or further whole genome shotgun (WGS) sequencing data generated from the same sample. Within Konnector, the input WGS reads are used to populate a Bloom filter, tiling the reads with a sliding window of length *k*, thus generating a probabilistic representation of all the *k*-mers in the reads. Konnector also uses crude error removal and correctional algorithms, eliminating singletons (*k*-mers that are observed only once) and fixing base mismatches in the flanking sequence pairs. Sealer launches Konnector processes using a user-input range of *k*-mer lengths. In the third and final operation, succesfully merged sequences are inserted into the gaps of the original scaffolds, and Sealer outputs a new gap-filled scaffold file.
26 Sealer is an application of Konnector that closes intra-scaffold gaps. It performs three sequential functions. First, regions with Ns are identified from an input scaffold. Flanking nucleotides (2 x 100bp) are extracted from those regions while respecting the strand (5' to 3') direction on the sequence immediately downstream of each gap. In the second step, flanking sequence pairs are used as input to Konnector along with a set of reads with a high level of coverage redundancy. Ideally, the reads should represent the original dataset from which the draft assembly is generated, or further whole genome shotgun (WGS) sequencing data generated from the same sample. Within Konnector, the input WGS reads are used to populate a Bloom filter, tiling the reads with a sliding window of length *k*, thus generating a probabilistic representation of all the *k*-mers in the reads. Konnector also uses crude error removal and correctional algorithms, eliminating singletons (*k*-mers that are observed only once) and fixing base mismatches in the flanking sequence pairs. Sealer launches Konnector processes using a user-input range of *k*-mer lengths. In the third and final operation, successfully merged sequences are inserted into the gaps of the original scaffolds, and Sealer outputs a new gap-filled scaffold file.
2727
2828 Installation
2929 ============
158158 * `--help`: display this help and exit
159159 * `--version`: output version information and exit
160160
161 *k* is the size of *k*-mer for the de Bruijn graph. You may specify multiple values of *k*, which will increase the nubmer of gaps closed at the cost of increased run time. Multiple values of *k* ought to be specified in increasing order, as lower values of *k* have fewer coverage gaps and are less likely to misassemble.
161 *k* is the size of *k*-mer for the de Bruijn graph. You may specify multiple values of *k*, which will increase the number of gaps closed at the cost of increased run time. Multiple values of *k* ought to be specified in increasing order, as lower values of *k* have fewer coverage gaps and are less likely to misassemble.
162162
163163 *P* is the threshold for number of paths allowed to be traversed. When set to 10, Konnector will attempt to close gaps even when there are 10 different paths found. It would attempt to create a consensus sequence between these paths. The default setting is 2.
7070 " --print-flanks outputs flank files\n"
7171 " -S, --input-scaffold=FILE load scaffold from FILE\n"
7272 " -L, --flank-length=N length of flanks to be used as pseudoreads [100]\n"
73 " -D, --flank-distance=N distance of flank from gap [0]\n"
7473 " -G, --max-gap-length=N max gap size to fill in bp [800]; runtime increases\n"
7574 " exponentially with respect to this parameter\n"
7675 " -j, --threads=N use N parallel threads [1]\n"
397397 @echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users@bcgsc.ca.'
398398
399399 version:
400 @echo "abyss-pe (ABySS) 2.2.3"
400 @echo "abyss-pe (ABySS) 2.2.4"
401401 @echo "Written by Shaun Jackman and Anthony Raymond."
402402 @echo
403403 @echo "Copyright 2012 Canada's Michael Smith Genome Science Centre"
00 AC_PREREQ(2.62)
1 AC_INIT(ABySS, 2.2.3, abyss-users@bcgsc.ca, abyss,
1 AC_INIT(ABySS, 2.2.4, abyss-users@bcgsc.ca, abyss,
22 http://www.bcgsc.ca/platform/bioinfo/software/abyss)
33
44 AC_CONFIG_MACRO_DIR([m4])
0 .TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.2.3" "User Commands"
0 .TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.2.4" "User Commands"
11 .SH NAME
22 ABYSS \- assemble short reads into contigs
33 .SH SYNOPSIS
0 .TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.2.3" "User Commands"
0 .TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.2.4" "User Commands"
11 .SH NAME
22 abyss-pe - assemble reads into contigs
33 .SH SYNOPSIS
112112 SS=--SS to assemble in strand-specific mode
113113 .br
114114 Requires that all libraries are strand-specific RNA-Seq libraries.
115 Assumes that the first read in a read pair is reveresed WRT the
115 Assumes that the first read in a read pair is reversed WRT the
116116 transcripts sequenced.
117117 .TP
118118 .B t
241241
242242 .SS "MPI COMPATIBILITY"
243243 Due to its use of multi-threading, DIDA has known deadlocking issues
244 with OpenMPI. Using the MPICH MPI library is strongly recommmended
244 with OpenMPI. Using the MPICH MPI library is strongly recommended
245245 when running assemblies with DIDA. Testing was done with MPICH 3.1.3,
246246 compiled with --enable-threads=funneled.
247247
0 .TH abyss-tofastq "1" "2015-May" "ABySS 2.2.3" "User Commands"
0 .TH abyss-tofastq "1" "2015-May" "ABySS 2.2.4" "User Commands"
11 .SH NAME
22 abyss-tofastq \- convert various file formats to FASTQ format
33 .br