Merge tag 'upstream/2.2+dfsg'
Upstream version 2.2+dfsg
Sascha Steinbiss
4 years ago
0 | 0 | Mash is normally distributed as a dependency-free binary for Linux or OSX (see |
1 | 1 | https://github.com/marbl/Mash/releases). This source distribution is intended |
2 | for other operating systems or for development. Mash requires c++11 to build, | |
3 | which is available in and GCC >= 4.8 and OSX >= 10.7. | |
2 | for other operating systems or for development. Mash requires c++14 to build, | |
3 | which is available in GCC >= 5 and XCode >= 6. | |
4 | 4 | |
5 | 5 | See http://mash.readthedocs.org for more information. |
42 | 42 | master_doc = 'index' |
43 | 43 | |
44 | 44 | # General information about the project. |
45 | project = u'mash' | |
45 | project = u'Mash' | |
46 | 46 | copyright = u'2015, Brian Ondov, Todd Treangen, Adam Phillippy' |
47 | 47 | |
48 | 48 | # The version info for the project you're documenting, acts as replacement for |
36 | 36 | * `fig5.html <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.html>`_: Interactive version |
37 | 37 | * `fig5.tsv <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.tsv>`_: Source data |
38 | 38 | |
39 | Public data sources | |
40 | ~~~~~~~~~~~~~~~~~~~ | |
41 | ||
42 | The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``. | |
43 | ||
44 | HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory | |
45 | and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in | |
46 | ``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``. | |
47 | ||
48 | SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_. | |
49 | ||
50 | RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``. | |
51 | ||
52 | Public data products | |
53 | ~~~~~~~~~~~~~~~~~~~~ | |
54 | ||
55 | Quebec Polyomavirus is submitted to GenBank as BK010702. | |
56 | ||
57 | 39 | Screen of SRA metagenomes vs. RefSeq |
58 | 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
59 | 41 | |
82 | 64 | GCF_000001215.4 SRR3401361 SRR3540373 |
83 | 65 | GCF_000001405.36 SRR5127794 ERR1539652 SRR413753 ERR206081 |
84 | 66 | GCF_000001405.38 SRR5127794 ERR1539652 ERR1711677 SRR413753 ERR206081 |
67 | ||
68 | We also provide simple scripts for searching these files: `search.tar <https://obj.umiacs.umd.edu/mash/screen/search.tar>`_ | |
69 | ||
70 | Public data sources | |
71 | ~~~~~~~~~~~~~~~~~~~ | |
72 | ||
73 | The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``. | |
74 | ||
75 | HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory | |
76 | and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in | |
77 | ``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``. | |
78 | ||
79 | SRA runs were downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_. | |
80 | ||
81 | RefSeq genomes were downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``. | |
82 | ||
83 | Public data products | |
84 | ~~~~~~~~~~~~~~~~~~~~ | |
85 | ||
86 | Quebec Polyomavirus is submitted to GenBank as BK010702. | |
87 |
15 | 15 | Publication |
16 | 16 | =========== |
17 | 17 | `Mash: fast genome and metagenome distance estimation using MinHash. Ondov BD, Treangen TJ, Melsted P, Mallonee AB, Bergman NH, Koren S, Phillippy AM. Genome Biol. 2016 Jun 20;17(1):132. doi: 10.1186/s13059-016-0997-x. <http://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x>`_ |
18 | ||
19 | `Mash Screen: High-throughput sequence containment estimation for genome discovery. Ondov BD, Starrett GJ, Sappington A, Kostic A, Koren S, Buck CB, Phillippy AM. BioRxiv. 2019 Mar. doi: 10.1101/557314 <https://doi.org/10.1101/557314>`_ | |
18 | 20 | |
19 | 21 | .. toctree:: |
20 | 22 | :maxdepth: 1 |
84 | 84 | |
85 | 85 | if ( cont ) |
86 | 86 | { |
87 | m2j = exp(-k * dists[j]); | |
87 | //m2j = exp(-k * dists[j]); | |
88 | m2j = pow(1.0 - dists[j], k); // binomial model | |
88 | 89 | } |
89 | 90 | else |
90 | 91 | { |
113 | 114 | |
114 | 115 | if ( cont ) |
115 | 116 | { |
116 | j2m = -1.0 / k * log(je); | |
117 | //j2m = -1.0 / k * log(je); | |
118 | j2m = 1.0 - pow(je, 1. / k); | |
117 | 119 | } |
118 | 120 | else |
119 | 121 | { |
36 | 36 | //addOption("log", Option(Option::Boolean, "L", "Output", "Log scale distances and divide by k-mer size to provide a better analog to phylogenetic distance. The special case of zero shared min-hashes will result in a distance of 1.", "")); |
37 | 37 | addOption("pvalue", Option(Option::Number, "v", "Output", "Maximum p-value to report.", "1.0", 0., 1.)); |
38 | 38 | addOption("distance", Option(Option::Number, "d", "Output", "Maximum distance to report.", "1.0", 0., 1.)); |
39 | addOption("comment", Option(Option::Boolean, "C", "Output", "Show comment fields with reference/query names (denoted with ':').", "1.0", 0., 1.)); | |
39 | 40 | useSketchOptions(); |
40 | 41 | } |
41 | 42 | |
50 | 51 | int threads = options.at("threads").getArgumentAsNumber(); |
51 | 52 | bool list = options.at("list").active; |
52 | 53 | bool table = options.at("table").active; |
54 | bool comment = options.at("comment").active; | |
53 | 55 | //bool log = options.at("log").active; |
54 | 56 | double pValueMax = options.at("pvalue").getArgumentAsNumber(); |
55 | 57 | double distanceMax = options.at("distance").getArgumentAsNumber(); |
224 | 226 | |
225 | 227 | while ( threadPool.outputAvailable() ) |
226 | 228 | { |
227 | writeOutput(threadPool.popOutputWhenAvailable(), table); | |
229 | writeOutput(threadPool.popOutputWhenAvailable(), table, comment); | |
228 | 230 | } |
229 | 231 | } |
230 | 232 | |
231 | 233 | while ( threadPool.running() ) |
232 | 234 | { |
233 | writeOutput(threadPool.popOutputWhenAvailable(), table); | |
235 | writeOutput(threadPool.popOutputWhenAvailable(), table, comment); | |
234 | 236 | } |
235 | 237 | |
236 | 238 | if ( warningCount > 0 && ! parameters.reads ) |
241 | 243 | return 0; |
242 | 244 | } |
243 | 245 | |
244 | void CommandDistance::writeOutput(CompareOutput * output, bool table) const | |
246 | void CommandDistance::writeOutput(CompareOutput * output, bool table, bool comment) const | |
245 | 247 | { |
246 | 248 | uint64_t i = output->indexQuery; |
247 | 249 | uint64_t j = output->indexRef; |
266 | 268 | } |
267 | 269 | else if ( pair->pass ) |
268 | 270 | { |
269 | cout << output->sketchRef.getReference(j).name << '\t' << output->sketchQuery.getReference(i).name << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl; | |
271 | cout << output->sketchRef.getReference(j).name; | |
272 | ||
273 | if ( comment ) | |
274 | { | |
275 | cout << ':' << output->sketchRef.getReference(j).comment; | |
276 | } | |
277 | ||
278 | cout << '\t' << output->sketchQuery.getReference(i).name; | |
279 | ||
280 | if ( comment ) | |
281 | { | |
282 | cout << ':' << output->sketchQuery.getReference(i).comment; | |
283 | } | |
284 | ||
285 | cout << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl; | |
270 | 286 | } |
271 | 287 | |
272 | 288 | j++; |
84 | 84 | |
85 | 85 | private: |
86 | 86 | |
87 | void writeOutput(CompareOutput * output, bool table) const; | |
87 | void writeOutput(CompareOutput * output, bool table, bool comment) const; | |
88 | 88 | }; |
89 | 89 | |
90 | 90 | CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * input); |
38 | 38 | : Command() |
39 | 39 | { |
40 | 40 | name = "screen"; |
41 | summary = "Determine whether query sequences are within a larger pool of sequences."; | |
42 | description = "Determine how well query sequences are contained within a pool of sequences. The queries must be formatted as a single Mash sketch file (.msh), created with the `mash sketch` command. The <pool> files can be contigs or reads, in fasta or fastq, gzipped or not, and \"-\" can be given for <pool> to read from standard input. The <pool> sequences are assumed to be nucleotides, and will be 6-frame translated if the <queries> are amino acids. The output fields are [identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment], where median-multiplicity is computed for shared hashes, based on the number of observations of those hashes within the pool."; | |
43 | argumentString = "<queries>.msh <pool> [<pool>] ..."; | |
41 | summary = "Determine whether query sequences are within a larger mixture of sequences."; | |
42 | description = "Determine how well query sequences are contained within a mixture of sequences. The queries must be formatted as a single Mash sketch file (.msh), created with the `mash sketch` command. The <mixture> files can be contigs or reads, in fasta or fastq, gzipped or not, and \"-\" can be given for <mixture> to read from standard input. The <mixture> sequences are assumed to be nucleotides, and will be 6-frame translated if the <queries> are amino acids. The output fields are [identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment], where median-multiplicity is computed for shared hashes, based on the number of observations of those hashes within the mixture."; | |
43 | argumentString = "<queries>.msh <mixture> [<mixture>] ..."; | |
44 | 44 | |
45 | 45 | useOption("help"); |
46 | 46 | useOption("threads"); |
321 | 321 | */ |
322 | 322 | |
323 | 323 | uint64_t setSize = minHashHeap.estimateSetSize(); |
324 | cerr << " Estimated distinct" << (trans ? " (translated)" : "") << " k-mers in pool: " << setSize << endl; | |
324 | cerr << " Estimated distinct" << (trans ? " (translated)" : "") << " k-mers in mixture: " << setSize << endl; | |
325 | 325 | |
326 | 326 | if ( setSize == 0 ) |
327 | 327 | { |
605 | 605 | return 1.; |
606 | 606 | } |
607 | 607 | |
608 | double r = 1. / (1. + kmerSpace / setSize); | |
608 | double r = double(setSize) / kmerSpace; | |
609 | 609 | |
610 | 610 | #ifdef USE_BOOST |
611 | 611 | return cdf(complement(binomial(sketchSize, r), x - 1)); |
18 | 18 | |
19 | 19 | namespace mash { |
20 | 20 | |
21 | struct HashTableEntry | |
22 | { | |
23 | HashTableEntry() : count(0) {} | |
24 | ||
25 | uint32_t count; | |
26 | std::unordered_set<uint64_t> indices; | |
27 | }; | |
28 | ||
29 | //typedef std::unordered_map< uint64_t, HashTableEntry > HashTable; | |
21 | 30 | typedef std::unordered_map< uint64_t, std::unordered_set<uint64_t> > HashTable; |
22 | 31 | |
23 | 32 | static const std::unordered_map< std::string, char > codons = |
34 | 34 | useOption("help"); |
35 | 35 | addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <query> specify paths to sequence files, one per line. The reference file is not affected.", "")); |
36 | 36 | addOption("comment", Option(Option::Boolean, "C", "Output", "Use comment fields for sequence names instead of IDs.", "")); |
37 | addOption("edge", Option(Option::Boolean, "E", "Output", "Output edge list instead of Phylip matrix, with fields [seq1, seq2, dist, p-val, shared-hashes].", "")); | |
38 | addOption("pvalue", Option(Option::Number, "v", "Output", "Maximum p-value to report in edge list. Implies -" + getOption("edge").identifier + ".", "1.0", 0., 1.)); | |
39 | addOption("distance", Option(Option::Number, "d", "Output", "Maximum distance to report in edge list. Implies -" + getOption("edge").identifier + ".", "1.0", 0., 1.)); | |
37 | 40 | //addOption("log", Option(Option::Boolean, "L", "Output", "Log scale distances and divide by k-mer size to provide a better analog to phylogenetic distance. The special case of zero shared min-hashes will result in a distance of 1.", "")); |
38 | 41 | useSketchOptions(); |
39 | 42 | } |
49 | 52 | int threads = options.at("threads").getArgumentAsNumber(); |
50 | 53 | bool list = options.at("list").active; |
51 | 54 | //bool log = options.at("log").active; |
52 | double pValueMax = 0; | |
53 | 55 | bool comment = options.at("comment").active; |
56 | bool edge = options.at("edge").active; | |
57 | double pValueMax = options.at("pvalue").getArgumentAsNumber(); | |
58 | double distanceMax = options.at("distance").getArgumentAsNumber(); | |
59 | double pValuePeakToSet = 0; | |
60 | ||
61 | if ( options.at("pvalue").active || options.at("distance").active ) | |
62 | { | |
63 | edge = true; | |
64 | } | |
54 | 65 | |
55 | 66 | Sketch::Parameters parameters; |
56 | 67 | |
108 | 119 | } |
109 | 120 | } |
110 | 121 | |
111 | cout << '\t' << sketch.getReferenceCount() << endl; | |
112 | cout << (comment ? sketch.getReference(0).comment : sketch.getReference(0).name) << endl; | |
122 | if ( !edge ) | |
123 | { | |
124 | cout << '\t' << sketch.getReferenceCount() << endl; | |
125 | cout << (comment ? sketch.getReference(0).comment : sketch.getReference(0).name) << endl; | |
126 | } | |
113 | 127 | |
114 | 128 | ThreadPool<TriangleInput, TriangleOutput> threadPool(compare, threads); |
115 | 129 | |
116 | 130 | for ( uint64_t i = 1; i < sketch.getReferenceCount(); i++ ) |
117 | 131 | { |
118 | threadPool.runWhenThreadAvailable(new TriangleInput(sketch, i, parameters)); | |
132 | threadPool.runWhenThreadAvailable(new TriangleInput(sketch, i, parameters, distanceMax, pValueMax)); | |
119 | 133 | |
120 | 134 | while ( threadPool.outputAvailable() ) |
121 | 135 | { |
122 | writeOutput(threadPool.popOutputWhenAvailable(), comment, pValueMax); | |
136 | writeOutput(threadPool.popOutputWhenAvailable(), comment, edge, pValuePeakToSet); | |
123 | 137 | } |
124 | 138 | } |
125 | 139 | |
126 | 140 | while ( threadPool.running() ) |
127 | 141 | { |
128 | writeOutput(threadPool.popOutputWhenAvailable(), comment, pValueMax); | |
129 | } | |
130 | ||
131 | cerr << "Max p-value: " << pValueMax << endl; | |
142 | writeOutput(threadPool.popOutputWhenAvailable(), comment, edge, pValuePeakToSet); | |
143 | } | |
144 | ||
145 | if ( !edge ) | |
146 | { | |
147 | cerr << "Max p-value: " << pValuePeakToSet << endl; | |
148 | } | |
132 | 149 | |
133 | 150 | if ( warningCount > 0 && ! parameters.reads ) |
134 | 151 | { |
138 | 155 | return 0; |
139 | 156 | } |
140 | 157 | |
141 | void CommandTriangle::writeOutput(TriangleOutput * output, bool comment, double & pValueMax) const | |
142 | { | |
143 | const Sketch & sketch = output->sketch; | |
144 | const Sketch::Reference & ref = sketch.getReference(output->index); | |
145 | ||
146 | cout << (comment ? ref.comment : ref.name); | |
147 | ||
158 | void CommandTriangle::writeOutput(TriangleOutput * output, bool comment, bool edge, double & pValuePeakToSet) const | |
159 | { | |
160 | const Sketch & sketch = output->sketch; | |
161 | const Sketch::Reference & ref = sketch.getReference(output->index); | |
162 | ||
163 | if ( !edge ) | |
164 | { | |
165 | cout << (comment ? ref.comment : ref.name); | |
166 | } | |
167 | ||
148 | 168 | for ( uint64_t i = 0; i < output->index; i++ ) |
149 | 169 | { |
150 | 170 | const CommandDistance::CompareOutput::PairOutput * pair = &output->pairs[i]; |
151 | cout << '\t' << pair->distance; | |
152 | if ( pair->pValue > pValueMax ) | |
153 | { | |
154 | pValueMax = pair->pValue; | |
155 | } | |
156 | } | |
157 | cout << endl; | |
171 | ||
172 | if ( edge ) | |
173 | { | |
174 | if ( pair->pass ) | |
175 | { | |
176 | const Sketch::Reference & qry = sketch.getReference(i); | |
177 | cout << (comment ? ref.comment : ref.name) << '\t'<< (comment ? qry.comment : qry.name) << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl; | |
178 | } | |
179 | } | |
180 | else | |
181 | { | |
182 | cout << '\t' << pair->distance; | |
183 | } | |
184 | ||
185 | if ( pair->pValue > pValuePeakToSet ) | |
186 | { | |
187 | pValuePeakToSet = pair->pValue; | |
188 | } | |
189 | } | |
190 | ||
191 | if ( !edge ) | |
192 | { | |
193 | cout << endl; | |
194 | } | |
158 | 195 | |
159 | 196 | delete output; |
160 | 197 | } |
169 | 206 | |
170 | 207 | for ( uint64_t i = 0; i < input->index; i++ ) |
171 | 208 | { |
172 | compareSketches(&output->pairs[i], sketch.getReference(input->index), sketch.getReference(i), sketchSize, sketch.getKmerSize(), sketch.getKmerSpace(), -1., -1.); | |
209 | compareSketches(&output->pairs[i], sketch.getReference(input->index), sketch.getReference(i), sketchSize, sketch.getKmerSize(), sketch.getKmerSpace(), input->maxDistance, input->maxPValue); | |
173 | 210 | } |
174 | 211 | |
175 | 212 | return output; |
18 | 18 | |
19 | 19 | struct TriangleInput |
20 | 20 | { |
21 | TriangleInput(const Sketch & sketchNew, uint64_t indexNew, const Sketch::Parameters & parametersNew) | |
21 | TriangleInput(const Sketch & sketchNew, uint64_t indexNew, const Sketch::Parameters & parametersNew, double maxDistanceNew, double maxPValueNew) | |
22 | 22 | : |
23 | 23 | sketch(sketchNew), |
24 | 24 | index(indexNew), |
25 | parameters(parametersNew) | |
25 | parameters(parametersNew), | |
26 | maxDistance(maxDistanceNew), | |
27 | maxPValue(maxPValueNew) | |
26 | 28 | {} |
27 | 29 | |
28 | 30 | const Sketch & sketch; |
29 | 31 | uint64_t index; |
30 | 32 | const Sketch::Parameters & parameters; |
33 | double maxDistance; | |
34 | double maxPValue; | |
31 | 35 | }; |
32 | 36 | |
33 | 37 | struct TriangleOutput |
60 | 64 | double pValueMax; |
61 | 65 | bool comment; |
62 | 66 | |
63 | void writeOutput(TriangleOutput * output, bool comment, double & pValueMax) const; | |
67 | void writeOutput(TriangleOutput * output, bool comment, bool edge, double & pValuePeakToSet) const; | |
64 | 68 | }; |
65 | 69 | |
66 | 70 | CommandTriangle::TriangleOutput * compare(CommandTriangle::TriangleInput * input); |
3 | 3 | // |
4 | 4 | // See the LICENSE.txt file included with this software for license information. |
5 | 5 | |
6 | static const char * version = "2.1.1"; | |
6 | static const char * version = "2.2"; |
0 | 0.861792 44/1000 1 5.00739e-229 genome1.fna gi|49175990|ref|NC_000913.2| Escherichia coli str. K-12 substr. MG1655, complete genome | |
1 | 0.853596 36/1000 1 1.70479e-184 genome2.fna gi|47118301|dbj|BA000007.2| Escherichia coli O157:H7 str. Sakai DNA, complete genome | |
2 | 0.861792 44/1000 1 5.00739e-229 genome3.fna gi|682117612|gb|CP009273.1| Escherichia coli BW25113, complete genome | |
0 | 0.861792 44/1000 1 5.00742e-229 genome1.fna gi|49175990|ref|NC_000913.2| Escherichia coli str. K-12 substr. MG1655, complete genome | |
1 | 0.853596 36/1000 1 1.7048e-184 genome2.fna gi|47118301|dbj|BA000007.2| Escherichia coli O157:H7 str. Sakai DNA, complete genome | |
2 | 0.861792 44/1000 1 5.00742e-229 genome3.fna gi|682117612|gb|CP009273.1| Escherichia coli BW25113, complete genome |