Codebase list mash / 583a8da
Merge tag 'upstream/2.1.1+dfsg' Upstream version 2.1.1+dfsg Sascha Steinbiss 4 years ago
9 changed file(s) with 83 addition(s) and 81 deletion(s). Raw diff Collapse all Expand all
0 CXXFLAGS += -O3 -std=c++11 -Isrc -I@capnp@/include -I@mathinc@
0 CXXFLAGS += -O3 -std=c++14 -Isrc -I@capnp@/include -I@mathinc@
11 CPPFLAGS += @amcppflags@
22
33 UNAME_S=$(shell uname -s)
3232 AC_MSG_ERROR([Cap'n Proto compiler (capnp) not found.])
3333 fi
3434
35 CPPFLAGS="-I$with_capnp/include -std=c++11"
35 CPPFLAGS="-I$with_capnp/include -std=c++14"
3636
3737 AC_CHECK_HEADER(capnp/common.h, [result=1], [result=0])
3838
1414 `SRR2671867.BaAmes.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671867.BaAmes.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (157MB)
1515
1616 `SRR2671868.Bc10987.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671868.Bc10987.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (250MB)
17
18 Mash Screen: High-throughput sequence containment estimation for genome discovery
19 ---------------------------------------------------------------------------------
20
21 Custom scripts and intermediate data:
22 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
23
24 `MashScreen_supp.tar.gz <https://obj.umiacs.umd.edu/mash/screen/MashScreen_supp.tar.gz>`_
25
26 Data files:
27 ~~~~~~~~~~~
28
29 Mash Sketch databases for RefSeq release 88:
30 * `RefSeq88n.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88n.msh.gz>`_: Genomes (k=21, s=1000), 1.2Gb uncompressed
31 * `RefSeq88p.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88p.msh.gz>`_: Proteomes (k=9, s=1000), 1.1Gb uncompressed
32
33 `art.fastq.gz <https://obj.umiacs.umd.edu/mash/screen/art.fastq.gz>`_: Simulated reads for Shakya experiment
34
35 Figure 5:
36 * `fig5.html <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.html>`_: Interactive version
37 * `fig5.tsv <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.tsv>`_: Source data
38
39 Public data sources
40 ~~~~~~~~~~~~~~~~~~~
41
42 The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``.
43
44 HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Illumina/`` directory
45 and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in
46 ``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``.
47
48 SRA runs were downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_.
49
50 RefSeq genomes were downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``.
51
52 Public data products
53 ~~~~~~~~~~~~~~~~~~~~
54
55 Quebec Polyomavirus is submitted to GenBank as BK010702.
56
57 Screen of SRA metagenomes vs. RefSeq
58 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
59
60 * `sra_meta_nucl_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_95idy.tsv.gz>`_ (2.3Gb uncompressed)
61 * `sra_meta_nucl_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_80idy_3x.tsv.gz>`_ (6.7Gb uncompressed)
62 * `sra_meta_prot_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_95idy.tsv.gz>`_ (2.1Gb uncompressed)
63 * `sra_meta_prot_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_80idy_3x.tsv.gz>`_ (8.3Gb uncompressed)
64
65 These files have a line for each RefSeq genome listing all metagenomic SRA runs
66 (as of August 2018) with Mash Containment Scores above the specified threshold.
67 They are provided for two screen modes:
68
69 * ``nucl``: Genomic RefSeq sequences
70 * ``prot``: Proteomic RefSeq sequences (combined amino acid sequences per organism). **NOTE:** Protein tables above are not p-value filtered and thus large (> ~50Gb) runs may have spurious hits. They also do not contain plasmids. Updates coming soon!
71
72 ...and at two thresholds:
73
74 * ``95idy``: 95% Mash Containment Score, any coverage. Useful for finding runs containing a specific genome.
75 * ``80idy_3x``: 80% Mash Containment Score, at least 3x median k-mer multiplicity.
76 Useful for finding related, but novel, sequences.
77
78 The files are tab-separated, with each line beginning with a RefSeq assembly accession, followed by SRA accessions, for example:
79
80 ::
81
82 GCF_000001215.4 SRR3401361 SRR3540373
83 GCF_000001405.36 SRR5127794 ERR1539652 SRR413753 ERR206081
84 GCF_000001405.38 SRR5127794 ERR1539652 ERR1711677 SRR413753 ERR206081
290290
291291 for ( unordered_set<MinHashHeap *>::const_iterator i = minHashHeaps.begin(); i != minHashHeaps.end(); i++ )
292292 {
293 HashList hashList(parameters.kmerSize);
293 HashList hashList(parameters.use64);
294294
295295 (*i)->toHashList(hashList);
296296
549549 break;
550550 }
551551
552 //kmersTotal++; TODO
553
554 if ( ! noncanonical )
555 {
556 bool debug = false;
557 useRevComp = true;
558 bool prefixEqual = true;
559
560 if ( debug ) {for ( uint64_t k = j; k < j + kmerSize; k++ ) { cout << *(seq + k); } cout << endl;}
561
562 for ( uint64_t k = 0; k < kmerSize; k++ )
563 {
564 char base = seq[j + k];
565 char baseMinus = seqRev[l - j - kmerSize + k];
566
567 if ( debug ) cout << baseMinus;
568
569 if ( prefixEqual && baseMinus > base )
570 {
571 useRevComp = false;
572 break;
573 }
574
575 if ( prefixEqual && baseMinus < base )
576 {
577 prefixEqual = false;
578 }
579 }
580
581 if ( debug ) cout << endl;
582 }
583
584552 const char * kmer;
585553
586554 if ( trans )
589557 }
590558 else
591559 {
592 kmer = useRevComp ? seqRev + l - j - kmerSize : seq + j;
560 const char *kmer_fwd = seq + j;
561 const char *kmer_rev = seqRev + length - j - kmerSize;
562 kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev;
593563 }
594564
595565 //cout << kmer << '\t' << kmerSize << endl;
596566 hash_u hash = getHash(kmer, kmerSize, seed, use64);
597567 //cout << kmer << '\t' << hash.hash64 << endl;
598568 input->minHashHeap->tryInsert(hash);
599
600569 uint64_t key = use64 ? hash.hash64 : hash.hash32;
601570
602571 if ( input->hashCounts.count(key) == 1 )
2727 addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <input> specify paths to sequence files, one per line.", ""));
2828 addOption("prefix", Option(Option::File, "o", "Output", "Output prefix (first input file used if unspecified). The suffix '.msh' will be appended.", ""));
2929 addOption("id", Option(Option::File, "I", "Sketch", "ID field for sketch of reads (instead of first sequence ID).", ""));
30 addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment.", ""));
30 addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment).", ""));
3131 useSketchOptions();
3232 }
3333
1414 public:
1515
1616 HashList() {use64 = true;}
17 HashList(int kmerSize) {use64 = kmerSize > 16;}
17 HashList(bool use64new) {use64 = use64new;}
1818
1919 hash_u at(int index) const;
2020 void clear();
535535
536536 for ( uint64_t i = 0; i < length - kmerSize + 1; i++ )
537537 {
538 bool useRevComp = false;
539 bool debug = false;
540
541538 // repeatedly skip kmers with bad characters
542
539 //
543540 bool bad = false;
544
541 //
545542 for ( uint64_t j = i; j < i + kmerSize && i + kmerSize <= length; j++ )
546543 {
547544 if ( ! parameters.alphabet[seq[j]] )
551548 break;
552549 }
553550 }
554
551 //
555552 if ( bad )
556553 {
557554 continue;
558555 }
559
556 //
560557 if ( i + kmerSize > length )
561558 {
562559 // skipped to end
563560 break;
564561 }
565562
566 if ( ! noncanonical )
567 {
568 useRevComp = true;
569 bool prefixEqual = true;
570
571 if ( debug ) {for ( uint64_t j = i; j < i + kmerSize; j++ ) { cout << *(seq + j); } cout << endl;}
572
573 for ( uint64_t j = 0; j < kmerSize; j++ )
574 {
575 char base = seq[i + j];
576 char baseMinus = seqRev[length - i - kmerSize + j];
577
578 if ( debug ) cout << baseMinus;
579
580 if ( prefixEqual && baseMinus > base )
581 {
582 useRevComp = false;
583 break;
584 }
585
586 if ( prefixEqual && baseMinus < base )
587 {
588 prefixEqual = false;
589 }
590 }
591
592 if ( debug ) cout << endl;
593 }
594
595563 const char *kmer_fwd = seq + i;
596564 const char *kmer_rev = seqRev + length - i - kmerSize;
597 const char * kmer = memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0 ? kmer_fwd : kmer_rev;
565 const char * kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev;
598566 bool filter = false;
599567
600568 hash_u hash = getHash(kmer, kmerSize, parameters.seed, parameters.use64);
601
602 if ( debug ) cout << endl;
603569
604570 minHashHeap.tryInsert(hash);
605571 }
6060
6161 if ( parameters.reads && command.getOption("threads").active )
6262 {
63 cerr << "ERROR: The option " << command.getOption("threads").identifier << " cannot be used with " << command.getOption("reads").identifier << "." << endl;
64 return 1;
63 cerr << "WARNING: The option " << command.getOption("threads").identifier << " will be ignored with " << command.getOption("reads").identifier << "." << endl;
6564 }
6665
6766 if ( parameters.reads && ! parameters.concatenated )
33 //
44 // See the LICENSE.txt file included with this software for license information.
55
6 static const char * version = "2.1";
6 static const char * version = "2.1.1";