Merge tag 'upstream/2.1.1+dfsg'
Upstream version 2.1.1+dfsg
Sascha Steinbiss
4 years ago
0 | CXXFLAGS += -O3 -std=c++11 -Isrc -I@capnp@/include -I@mathinc@ | |
0 | CXXFLAGS += -O3 -std=c++14 -Isrc -I@capnp@/include -I@mathinc@ | |
1 | 1 | CPPFLAGS += @amcppflags@ |
2 | 2 | |
3 | 3 | UNAME_S=$(shell uname -s) |
32 | 32 | AC_MSG_ERROR([Cap'n Proto compiler (capnp) not found.]) |
33 | 33 | fi |
34 | 34 | |
35 | CPPFLAGS="-I$with_capnp/include -std=c++11" | |
35 | CPPFLAGS="-I$with_capnp/include -std=c++14" | |
36 | 36 | |
37 | 37 | AC_CHECK_HEADER(capnp/common.h, [result=1], [result=0]) |
38 | 38 |
14 | 14 | `SRR2671867.BaAmes.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671867.BaAmes.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (157MB) |
15 | 15 | |
16 | 16 | `SRR2671868.Bc10987.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671868.Bc10987.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (250MB) |
17 | ||
18 | Mash Screen: High-throughput sequence containment estimation for genome discovery | |
19 | --------------------------------------------------------------------------------- | |
20 | ||
21 | Custom scripts and intermediate data: | |
22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
23 | ||
24 | `MashScreen_supp.tar.gz <https://obj.umiacs.umd.edu/mash/screen/MashScreen_supp.tar.gz>`_ | |
25 | ||
26 | Data files: | |
27 | ~~~~~~~~~~~ | |
28 | ||
29 | Mash Sketch databases for RefSeq release 88: | |
30 | * `RefSeq88n.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88n.msh.gz>`_: Genomes (k=21, s=1000), 1.2Gb uncompressed | |
31 | * `RefSeq88p.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88p.msh.gz>`_: Proteomes (k=9, s=1000), 1.1Gb uncompressed | |
32 | ||
33 | `art.fastq.gz <https://obj.umiacs.umd.edu/mash/screen/art.fastq.gz>`_: Simulated reads for Shakya experiment | |
34 | ||
35 | Figure 5: | |
36 | * `fig5.html <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.html>`_: Interactive version | |
37 | * `fig5.tsv <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.tsv>`_: Source data | |
38 | ||
39 | Public data sources | |
40 | ~~~~~~~~~~~~~~~~~~~ | |
41 | ||
42 | The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``. | |
43 | ||
44 | HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory | |
45 | and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in | |
46 | ``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``. | |
47 | ||
48 | SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_. | |
49 | ||
50 | RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``. | |
51 | ||
52 | Public data products | |
53 | ~~~~~~~~~~~~~~~~~~~~ | |
54 | ||
55 | Quebec Polyomavirus is submitted to GenBank as BK010702. | |
56 | ||
57 | Screen of SRA metagenomes vs. RefSeq | |
58 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
59 | ||
60 | * `sra_meta_nucl_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_95idy.tsv.gz>`_ (2.3Gb uncompressed) | |
61 | * `sra_meta_nucl_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_80idy_3x.tsv.gz>`_ (6.7Gb uncompressed) | |
62 | * `sra_meta_prot_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_95idy.tsv.gz>`_ (2.1Gb uncompressed) | |
63 | * `sra_meta_prot_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_80idy_3x.tsv.gz>`_ (8.3Gb uncompressed) | |
64 | ||
65 | These files have a line for each RefSeq genome listing all metagenomic SRA runs | |
66 | (as of August 2018) with Mash Containment Scores above the specified threshold. | |
67 | They are provided for two screen modes: | |
68 | ||
69 | * ``nucl``: Genomic RefSeq sequences | |
70 | * ``prot``: Proteomic RefSeq sequences (combined amino acid sequences per organism). **NOTE:** Protein tables above are not p-value filtered and thus large (> ~50Gb) runs may have spurious hits. They also do not contain plasmids. Updates coming soon! | |
71 | ||
72 | ...and at two thresholds: | |
73 | ||
74 | * ``95idy``: 95% Mash Containment Score, any coverage. Useful for finding runs containing a specific genome. | |
75 | * ``80idy_3x``: 80% Mash Containment Score, at least 3x median k-mer multiplicity. | |
76 | Useful for finding related, but novel, sequences. | |
77 | ||
78 | The files are tab separated, with each line beginning with a RefSeq assembly accession, followed by SRA accessions, for example: | |
79 | ||
80 | :: | |
81 | ||
82 | GCF_000001215.4 SRR3401361 SRR3540373 | |
83 | GCF_000001405.36 SRR5127794 ERR1539652 SRR413753 ERR206081 | |
84 | GCF_000001405.38 SRR5127794 ERR1539652 ERR1711677 SRR413753 ERR206081 |
290 | 290 | |
291 | 291 | for ( unordered_set<MinHashHeap *>::const_iterator i = minHashHeaps.begin(); i != minHashHeaps.end(); i++ ) |
292 | 292 | { |
293 | HashList hashList(parameters.kmerSize); | |
293 | HashList hashList(parameters.use64); | |
294 | 294 | |
295 | 295 | (*i)->toHashList(hashList); |
296 | 296 | |
549 | 549 | break; |
550 | 550 | } |
551 | 551 | |
552 | //kmersTotal++; TODO | |
553 | ||
554 | if ( ! noncanonical ) | |
555 | { | |
556 | bool debug = false; | |
557 | useRevComp = true; | |
558 | bool prefixEqual = true; | |
559 | ||
560 | if ( debug ) {for ( uint64_t k = j; k < j + kmerSize; k++ ) { cout << *(seq + k); } cout << endl;} | |
561 | ||
562 | for ( uint64_t k = 0; k < kmerSize; k++ ) | |
563 | { | |
564 | char base = seq[j + k]; | |
565 | char baseMinus = seqRev[l - j - kmerSize + k]; | |
566 | ||
567 | if ( debug ) cout << baseMinus; | |
568 | ||
569 | if ( prefixEqual && baseMinus > base ) | |
570 | { | |
571 | useRevComp = false; | |
572 | break; | |
573 | } | |
574 | ||
575 | if ( prefixEqual && baseMinus < base ) | |
576 | { | |
577 | prefixEqual = false; | |
578 | } | |
579 | } | |
580 | ||
581 | if ( debug ) cout << endl; | |
582 | } | |
583 | ||
584 | 552 | const char * kmer; |
585 | 553 | |
586 | 554 | if ( trans ) |
589 | 557 | } |
590 | 558 | else |
591 | 559 | { |
592 | kmer = useRevComp ? seqRev + l - j - kmerSize : seq + j; | |
560 | const char *kmer_fwd = seq + j; | |
561 | const char *kmer_rev = seqRev + length - j - kmerSize; | |
562 | kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev; | |
593 | 563 | } |
594 | 564 | |
595 | 565 | //cout << kmer << '\t' << kmerSize << endl; |
596 | 566 | hash_u hash = getHash(kmer, kmerSize, seed, use64); |
597 | 567 | //cout << kmer << '\t' << hash.hash64 << endl; |
598 | 568 | input->minHashHeap->tryInsert(hash); |
599 | ||
600 | 569 | uint64_t key = use64 ? hash.hash64 : hash.hash32; |
601 | 570 | |
602 | 571 | if ( input->hashCounts.count(key) == 1 ) |
27 | 27 | addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <input> specify paths to sequence files, one per line.", "")); |
28 | 28 | addOption("prefix", Option(Option::File, "o", "Output", "Output prefix (first input file used if unspecified). The suffix '.msh' will be appended.", "")); |
29 | 29 | addOption("id", Option(Option::File, "I", "Sketch", "ID field for sketch of reads (instead of first sequence ID).", "")); |
30 | addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment.", "")); | |
30 | addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment).", "")); | |
31 | 31 | useSketchOptions(); |
32 | 32 | } |
33 | 33 |
14 | 14 | public: |
15 | 15 | |
16 | 16 | HashList() {use64 = true;} |
17 | HashList(int kmerSize) {use64 = kmerSize > 16;} | |
17 | HashList(bool use64new) {use64 = use64new;} | |
18 | 18 | |
19 | 19 | hash_u at(int index) const; |
20 | 20 | void clear(); |
535 | 535 | |
536 | 536 | for ( uint64_t i = 0; i < length - kmerSize + 1; i++ ) |
537 | 537 | { |
538 | bool useRevComp = false; | |
539 | bool debug = false; | |
540 | ||
541 | 538 | // repeatedly skip kmers with bad characters |
542 | ||
539 | // | |
543 | 540 | bool bad = false; |
544 | ||
541 | // | |
545 | 542 | for ( uint64_t j = i; j < i + kmerSize && i + kmerSize <= length; j++ ) |
546 | 543 | { |
547 | 544 | if ( ! parameters.alphabet[seq[j]] ) |
551 | 548 | break; |
552 | 549 | } |
553 | 550 | } |
554 | ||
551 | // | |
555 | 552 | if ( bad ) |
556 | 553 | { |
557 | 554 | continue; |
558 | 555 | } |
559 | ||
556 | // | |
560 | 557 | if ( i + kmerSize > length ) |
561 | 558 | { |
562 | 559 | // skipped to end |
563 | 560 | break; |
564 | 561 | } |
565 | 562 | |
566 | if ( ! noncanonical ) | |
567 | { | |
568 | useRevComp = true; | |
569 | bool prefixEqual = true; | |
570 | ||
571 | if ( debug ) {for ( uint64_t j = i; j < i + kmerSize; j++ ) { cout << *(seq + j); } cout << endl;} | |
572 | ||
573 | for ( uint64_t j = 0; j < kmerSize; j++ ) | |
574 | { | |
575 | char base = seq[i + j]; | |
576 | char baseMinus = seqRev[length - i - kmerSize + j]; | |
577 | ||
578 | if ( debug ) cout << baseMinus; | |
579 | ||
580 | if ( prefixEqual && baseMinus > base ) | |
581 | { | |
582 | useRevComp = false; | |
583 | break; | |
584 | } | |
585 | ||
586 | if ( prefixEqual && baseMinus < base ) | |
587 | { | |
588 | prefixEqual = false; | |
589 | } | |
590 | } | |
591 | ||
592 | if ( debug ) cout << endl; | |
593 | } | |
594 | ||
595 | 563 | const char *kmer_fwd = seq + i; |
596 | 564 | const char *kmer_rev = seqRev + length - i - kmerSize; |
597 | const char * kmer = memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0 ? kmer_fwd : kmer_rev; | |
565 | const char * kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev; | |
598 | 566 | bool filter = false; |
599 | 567 | |
600 | 568 | hash_u hash = getHash(kmer, kmerSize, parameters.seed, parameters.use64); |
601 | ||
602 | if ( debug ) cout << endl; | |
603 | 569 | |
604 | 570 | minHashHeap.tryInsert(hash); |
605 | 571 | } |
60 | 60 | |
61 | 61 | if ( parameters.reads && command.getOption("threads").active ) |
62 | 62 | { |
63 | cerr << "ERROR: The option " << command.getOption("threads").identifier << " cannot be used with " << command.getOption("reads").identifier << "." << endl; | |
64 | return 1; | |
63 | cerr << "WARNING: The option " << command.getOption("threads").identifier << " will be ignored with " << command.getOption("reads").identifier << "." << endl; | |
65 | 64 | } |
66 | 65 | |
67 | 66 | if ( parameters.reads && ! parameters.concatenated ) |