Codebase list mash / de05b89
Merge tag 'upstream/2.2+dfsg' Upstream version 2.2+dfsg Sascha Steinbiss 4 years ago
13 changed file(s) with 137 addition(s) and 64 deletion(s). Raw diff Collapse all Expand all
00 Mash is normally distributed as a dependency-free binary for Linux or OSX (see
11 https://github.com/marbl/Mash/releases). This source distribution is intended
2 for other operating systems or for development. Mash requires c++11 to build,
3 which is available in and GCC >= 4.8 and OSX >= 10.7.
2 for other operating systems or for development. Mash requires c++14 to build,
3 which is available in and GCC >= 5 and XCode >= 6.
44
55 See http://mash.readthedocs.org for more information.
4242 master_doc = 'index'
4343
4444 # General information about the project.
45 project = u'mash'
45 project = u'Mash'
4646 copyright = u'2015, Brian Ondov, Todd Treangen, Adam Phillippy'
4747
4848 # The version info for the project you're documenting, acts as replacement for
3636 * `fig5.html <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.html>`_: Interactive version
3737 * `fig5.tsv <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.tsv>`_: Source data
3838
39 Public data sources
40 ~~~~~~~~~~~~~~~~~~~
41
42 The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``.
43
44 HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory
45 and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in
46 ``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``.
47
48 SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_.
49
50 RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``.
51
52 Public data products
53 ~~~~~~~~~~~~~~~~~~~~
54
55 Quebec Polyomavirus is submitted to GenBank as BK010702.
56
5739 Screen of SRA metagenomes vs. RefSeq
5840 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5941
8264 GCF_000001215.4 SRR3401361 SRR3540373
8365 GCF_000001405.36 SRR5127794 ERR1539652 SRR413753 ERR206081
8466 GCF_000001405.38 SRR5127794 ERR1539652 ERR1711677 SRR413753 ERR206081
67
68 We also provide simple scripts for searching these files: `search.tar <https://obj.umiacs.umd.edu/mash/screen/search.tar>`_
69
70 Public data sources
71 ~~~~~~~~~~~~~~~~~~~
72
73 The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``.
74
75 HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory
76 and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in
77 ``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``.
78
79 SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_.
80
81 RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``.
82
83 Public data products
84 ~~~~~~~~~~~~~~~~~~~~
85
86 Quebec Polyomavirus is submitted to GenBank as BK010702.
87
1515 Publication
1616 ===========
1717 `Mash: fast genome and metagenome distance estimation using MinHash. Ondov BD, Treangen TJ, Melsted P, Mallonee AB, Bergman NH, Koren S, Phillippy AM. Genome Biol. 2016 Jun 20;17(1):132. doi: 10.1186/s13059-016-0997-x. <http://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x>`_
18
19 `Mash Screen: High-throughput sequence containment estimation for genome discovery. Ondov BD, Starrett GJ, Sappington A, Kostic A, Koren S, Buck CB, Phillippy AM. BioRxiv. 2019 Mar. doi: 10.1101/557314 <https://doi.org/10.1101/557314>`_
1820
1921 .. toctree::
2022 :maxdepth: 1
8484
8585 if ( cont )
8686 {
87 m2j = exp(-k * dists[j]);
87 //m2j = exp(-k * dists[j]);
88 m2j = pow(1.0 - dists[j], k); // binomial model
8889 }
8990 else
9091 {
113114
114115 if ( cont )
115116 {
116 j2m = -1.0 / k * log(je);
117 //j2m = -1.0 / k * log(je);
118 j2m = 1.0 - pow(je, 1. / k);
117119 }
118120 else
119121 {
3636 //addOption("log", Option(Option::Boolean, "L", "Output", "Log scale distances and divide by k-mer size to provide a better analog to phylogenetic distance. The special case of zero shared min-hashes will result in a distance of 1.", ""));
3737 addOption("pvalue", Option(Option::Number, "v", "Output", "Maximum p-value to report.", "1.0", 0., 1.));
3838 addOption("distance", Option(Option::Number, "d", "Output", "Maximum distance to report.", "1.0", 0., 1.));
39 addOption("comment", Option(Option::Boolean, "C", "Output", "Show comment fields with reference/query names (denoted with ':').", "1.0", 0., 1.));
3940 useSketchOptions();
4041 }
4142
5051 int threads = options.at("threads").getArgumentAsNumber();
5152 bool list = options.at("list").active;
5253 bool table = options.at("table").active;
54 bool comment = options.at("comment").active;
5355 //bool log = options.at("log").active;
5456 double pValueMax = options.at("pvalue").getArgumentAsNumber();
5557 double distanceMax = options.at("distance").getArgumentAsNumber();
224226
225227 while ( threadPool.outputAvailable() )
226228 {
227 writeOutput(threadPool.popOutputWhenAvailable(), table);
229 writeOutput(threadPool.popOutputWhenAvailable(), table, comment);
228230 }
229231 }
230232
231233 while ( threadPool.running() )
232234 {
233 writeOutput(threadPool.popOutputWhenAvailable(), table);
235 writeOutput(threadPool.popOutputWhenAvailable(), table, comment);
234236 }
235237
236238 if ( warningCount > 0 && ! parameters.reads )
241243 return 0;
242244 }
243245
244 void CommandDistance::writeOutput(CompareOutput * output, bool table) const
246 void CommandDistance::writeOutput(CompareOutput * output, bool table, bool comment) const
245247 {
246248 uint64_t i = output->indexQuery;
247249 uint64_t j = output->indexRef;
266268 }
267269 else if ( pair->pass )
268270 {
269 cout << output->sketchRef.getReference(j).name << '\t' << output->sketchQuery.getReference(i).name << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl;
271 cout << output->sketchRef.getReference(j).name;
272
273 if ( comment )
274 {
275 cout << ':' << output->sketchRef.getReference(j).comment;
276 }
277
278 cout << '\t' << output->sketchQuery.getReference(i).name;
279
280 if ( comment )
281 {
282 cout << ':' << output->sketchQuery.getReference(i).comment;
283 }
284
285 cout << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl;
270286 }
271287
272288 j++;
8484
8585 private:
8686
87 void writeOutput(CompareOutput * output, bool table) const;
87 void writeOutput(CompareOutput * output, bool table, bool comment) const;
8888 };
8989
9090 CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * input);
3838 : Command()
3939 {
4040 name = "screen";
41 summary = "Determine whether query sequences are within a larger pool of sequences.";
42 description = "Determine how well query sequences are contained within a pool of sequences. The queries must be formatted as a single Mash sketch file (.msh), created with the `mash sketch` command. The <pool> files can be contigs or reads, in fasta or fastq, gzipped or not, and \"-\" can be given for <pool> to read from standard input. The <pool> sequences are assumed to be nucleotides, and will be 6-frame translated if the <queries> are amino acids. The output fields are [identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment], where median-multiplicity is computed for shared hashes, based on the number of observations of those hashes within the pool.";
43 argumentString = "<queries>.msh <pool> [<pool>] ...";
41 summary = "Determine whether query sequences are within a larger mixture of sequences.";
42 description = "Determine how well query sequences are contained within a mixture of sequences. The queries must be formatted as a single Mash sketch file (.msh), created with the `mash sketch` command. The <mixture> files can be contigs or reads, in fasta or fastq, gzipped or not, and \"-\" can be given for <mixture> to read from standard input. The <mixture> sequences are assumed to be nucleotides, and will be 6-frame translated if the <queries> are amino acids. The output fields are [identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment], where median-multiplicity is computed for shared hashes, based on the number of observations of those hashes within the mixture.";
43 argumentString = "<queries>.msh <mixture> [<mixture>] ...";
4444
4545 useOption("help");
4646 useOption("threads");
321321 */
322322
323323 uint64_t setSize = minHashHeap.estimateSetSize();
324 cerr << " Estimated distinct" << (trans ? " (translated)" : "") << " k-mers in pool: " << setSize << endl;
324 cerr << " Estimated distinct" << (trans ? " (translated)" : "") << " k-mers in mixture: " << setSize << endl;
325325
326326 if ( setSize == 0 )
327327 {
605605 return 1.;
606606 }
607607
608 double r = 1. / (1. + kmerSpace / setSize);
608 double r = double(setSize) / kmerSpace;
609609
610610 #ifdef USE_BOOST
611611 return cdf(complement(binomial(sketchSize, r), x - 1));
1818
1919 namespace mash {
2020
21 struct HashTableEntry
22 {
23 HashTableEntry() : count(0) {}
24
25 uint32_t count;
26 std::unordered_set<uint64_t> indices;
27 };
28
29 //typedef std::unordered_map< uint64_t, HashTableEntry > HashTable;
2130 typedef std::unordered_map< uint64_t, std::unordered_set<uint64_t> > HashTable;
2231
2332 static const std::unordered_map< std::string, char > codons =
3434 useOption("help");
3535 addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <query> specify paths to sequence files, one per line. The reference file is not affected.", ""));
3636 addOption("comment", Option(Option::Boolean, "C", "Output", "Use comment fields for sequence names instead of IDs.", ""));
37 addOption("edge", Option(Option::Boolean, "E", "Output", "Output edge list instead of Phylip matrix, with fields [seq1, seq2, dist, p-val, shared-hashes].", ""));
38 addOption("pvalue", Option(Option::Number, "v", "Output", "Maximum p-value to report in edge list. Implies -" + getOption("edge").identifier + ".", "1.0", 0., 1.));
39 addOption("distance", Option(Option::Number, "d", "Output", "Maximum distance to report in edge list. Implies -" + getOption("edge").identifier + ".", "1.0", 0., 1.));
3740 //addOption("log", Option(Option::Boolean, "L", "Output", "Log scale distances and divide by k-mer size to provide a better analog to phylogenetic distance. The special case of zero shared min-hashes will result in a distance of 1.", ""));
3841 useSketchOptions();
3942 }
4952 int threads = options.at("threads").getArgumentAsNumber();
5053 bool list = options.at("list").active;
5154 //bool log = options.at("log").active;
52 double pValueMax = 0;
5355 bool comment = options.at("comment").active;
56 bool edge = options.at("edge").active;
57 double pValueMax = options.at("pvalue").getArgumentAsNumber();
58 double distanceMax = options.at("distance").getArgumentAsNumber();
59 double pValuePeakToSet = 0;
60
61 if ( options.at("pvalue").active || options.at("distance").active )
62 {
63 edge = true;
64 }
5465
5566 Sketch::Parameters parameters;
5667
108119 }
109120 }
110121
111 cout << '\t' << sketch.getReferenceCount() << endl;
112 cout << (comment ? sketch.getReference(0).comment : sketch.getReference(0).name) << endl;
122 if ( !edge )
123 {
124 cout << '\t' << sketch.getReferenceCount() << endl;
125 cout << (comment ? sketch.getReference(0).comment : sketch.getReference(0).name) << endl;
126 }
113127
114128 ThreadPool<TriangleInput, TriangleOutput> threadPool(compare, threads);
115129
116130 for ( uint64_t i = 1; i < sketch.getReferenceCount(); i++ )
117131 {
118 threadPool.runWhenThreadAvailable(new TriangleInput(sketch, i, parameters));
132 threadPool.runWhenThreadAvailable(new TriangleInput(sketch, i, parameters, distanceMax, pValueMax));
119133
120134 while ( threadPool.outputAvailable() )
121135 {
122 writeOutput(threadPool.popOutputWhenAvailable(), comment, pValueMax);
136 writeOutput(threadPool.popOutputWhenAvailable(), comment, edge, pValuePeakToSet);
123137 }
124138 }
125139
126140 while ( threadPool.running() )
127141 {
128 writeOutput(threadPool.popOutputWhenAvailable(), comment, pValueMax);
129 }
130
131 cerr << "Max p-value: " << pValueMax << endl;
142 writeOutput(threadPool.popOutputWhenAvailable(), comment, edge, pValuePeakToSet);
143 }
144
145 if ( !edge )
146 {
147 cerr << "Max p-value: " << pValuePeakToSet << endl;
148 }
132149
133150 if ( warningCount > 0 && ! parameters.reads )
134151 {
138155 return 0;
139156 }
140157
141 void CommandTriangle::writeOutput(TriangleOutput * output, bool comment, double & pValueMax) const
142 {
143 const Sketch & sketch = output->sketch;
144 const Sketch::Reference & ref = sketch.getReference(output->index);
145
146 cout << (comment ? ref.comment : ref.name);
147
158 void CommandTriangle::writeOutput(TriangleOutput * output, bool comment, bool edge, double & pValuePeakToSet) const
159 {
160 const Sketch & sketch = output->sketch;
161 const Sketch::Reference & ref = sketch.getReference(output->index);
162
163 if ( !edge )
164 {
165 cout << (comment ? ref.comment : ref.name);
166 }
167
148168 for ( uint64_t i = 0; i < output->index; i++ )
149169 {
150170 const CommandDistance::CompareOutput::PairOutput * pair = &output->pairs[i];
151 cout << '\t' << pair->distance;
152 if ( pair->pValue > pValueMax )
153 {
154 pValueMax = pair->pValue;
155 }
156 }
157 cout << endl;
171
172 if ( edge )
173 {
174 if ( pair->pass )
175 {
176 const Sketch::Reference & qry = sketch.getReference(i);
177 cout << (comment ? ref.comment : ref.name) << '\t'<< (comment ? qry.comment : qry.name) << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl;
178 }
179 }
180 else
181 {
182 cout << '\t' << pair->distance;
183 }
184
185 if ( pair->pValue > pValuePeakToSet )
186 {
187 pValuePeakToSet = pair->pValue;
188 }
189 }
190
191 if ( !edge )
192 {
193 cout << endl;
194 }
158195
159196 delete output;
160197 }
169206
170207 for ( uint64_t i = 0; i < input->index; i++ )
171208 {
172 compareSketches(&output->pairs[i], sketch.getReference(input->index), sketch.getReference(i), sketchSize, sketch.getKmerSize(), sketch.getKmerSpace(), -1., -1.);
209 compareSketches(&output->pairs[i], sketch.getReference(input->index), sketch.getReference(i), sketchSize, sketch.getKmerSize(), sketch.getKmerSpace(), input->maxDistance, input->maxPValue);
173210 }
174211
175212 return output;
1818
1919 struct TriangleInput
2020 {
21 TriangleInput(const Sketch & sketchNew, uint64_t indexNew, const Sketch::Parameters & parametersNew)
21 TriangleInput(const Sketch & sketchNew, uint64_t indexNew, const Sketch::Parameters & parametersNew, double maxDistanceNew, double maxPValueNew)
2222 :
2323 sketch(sketchNew),
2424 index(indexNew),
25 parameters(parametersNew)
25 parameters(parametersNew),
26 maxDistance(maxDistanceNew),
27 maxPValue(maxPValueNew)
2628 {}
2729
2830 const Sketch & sketch;
2931 uint64_t index;
3032 const Sketch::Parameters & parameters;
33 double maxDistance;
34 double maxPValue;
3135 };
3236
3337 struct TriangleOutput
6064 double pValueMax;
6165 bool comment;
6266
63 void writeOutput(TriangleOutput * output, bool comment, double & pValueMax) const;
67 void writeOutput(TriangleOutput * output, bool comment, bool edge, double & pValuePeakToSet) const;
6468 };
6569
6670 CommandTriangle::TriangleOutput * compare(CommandTriangle::TriangleInput * input);
33 //
44 // See the LICENSE.txt file included with this software for license information.
55
6 static const char * version = "2.1.1";
6 static const char * version = "2.2";
0 0.861792 44/1000 1 5.00739e-229 genome1.fna gi|49175990|ref|NC_000913.2| Escherichia coli str. K-12 substr. MG1655, complete genome
1 0.853596 36/1000 1 1.70479e-184 genome2.fna gi|47118301|dbj|BA000007.2| Escherichia coli O157:H7 str. Sakai DNA, complete genome
2 0.861792 44/1000 1 5.00739e-229 genome3.fna gi|682117612|gb|CP009273.1| Escherichia coli BW25113, complete genome
0 0.861792 44/1000 1 5.00742e-229 genome1.fna gi|49175990|ref|NC_000913.2| Escherichia coli str. K-12 substr. MG1655, complete genome
1 0.853596 36/1000 1 1.7048e-184 genome2.fna gi|47118301|dbj|BA000007.2| Escherichia coli O157:H7 str. Sakai DNA, complete genome
2 0.861792 44/1000 1 5.00742e-229 genome3.fna gi|682117612|gb|CP009273.1| Escherichia coli BW25113, complete genome