diff --git a/README.rst b/README.rst index 40472b9..4bf6633 100644 --- a/README.rst +++ b/README.rst @@ -143,9 +143,9 @@ -w maximum tandem repeat period to consider -d probability decay per period (period-(i+1) / period-i) -i match score --j mismatch cost +-j mismatch cost (as a special case, 0 means no mismatches) -a gap existence cost --b gap extension cost +-b gap extension cost (as a special case, 0 means no gaps) -s minimum repeat probability for masking -n minimum copy number, affects -f4 only -f output type: 0=masked sequence, 1=repeat probabilities, @@ -195,6 +195,16 @@ lowercase letters are insertions relative to the previous repeat unit, and dashes are deletions relative to the previous repeat unit. +You can forbid insertions and deletions (which is faster) with +``-b0``:: + + tantan -f4 -b0 seqs.fa + +You can also forbid mismatches with ``-j0``, so this gets exact +repeats only:: + + tantan -f4 -b0 -j0 seqs.fa + Miscellaneous ------------- diff --git a/src/Makefile b/src/Makefile index e76369c..c0e7230 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,7 @@ rm -f ../bin/tantan VERSION1 = git describe --dirty -VERSION2 = echo ' (HEAD -> main, tag: 31) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//' +VERSION2 = echo ' (HEAD -> main, tag: 39) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//' VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\" diff --git a/src/mcf_fasta_sequence.cc b/src/mcf_fasta_sequence.cc index 8837224..b0bc59e 100644 --- a/src/mcf_fasta_sequence.cc +++ b/src/mcf_fasta_sequence.cc @@ -2,98 +2,84 @@ #include "mcf_fasta_sequence.hh" -#include // isspace +#include + //#include // for debugging #include -#include // istreambuf_iterator, ostreambuf_iterator #include +#include namespace mcf { static void readSequence(std::istream &s, std::vector &sequence, char delimiter) { if (!s) return; - std::istreambuf_iterator inpos(s); - std::istreambuf_iterator endpos; - while (inpos != endpos) { - char c = *inpos; - if (c == delimiter) break; - uchar u = static_cast(c); - if (!std::isspace(u)) sequence.push_back(u); - ++inpos; + std::streambuf *b = s.rdbuf(); + int c = b->sgetc(); + while (c != std::streambuf::traits_type::eof()) { + if (c > ' ') { + if (c == delimiter) break; + sequence.push_back(c); + } + c = b->snextc(); } } static void readQualityCodes(std::istream &s, std::vector &qualityCodes, std::vector::size_type howMany) { if (!s) return; - std::istreambuf_iterator inpos(s); - std::istreambuf_iterator endpos; - while (inpos != endpos) { - if (qualityCodes.size() == howMany) break; - char c = *inpos; - uchar u = static_cast(c); - if (!std::isspace(u)) qualityCodes.push_back(u); - ++inpos; + std::streambuf *b = s.rdbuf(); + while (howMany > 0) { + int c = b->sbumpc(); + if (c == std::streambuf::traits_type::eof()) break; // xxx ??? + if (c > ' ') { + qualityCodes.push_back(c); + --howMany; + } } } std::istream &operator>>(std::istream &s, FastaSequence &f) { - std::string title; - std::vector sequence; - std::string secondTitle; - std::vector qualityCodes; - char firstChar = '>'; s >> firstChar; if (firstChar != '>' && firstChar != '@') s.setstate(std::ios::failbit); - getline(s, title); + if (!s) return s; + + f.title.clear(); + f.sequence.clear(); + f.secondTitle.clear(); + f.qualityCodes.clear(); + + getline(s, f.title); if (firstChar == '>') { - readSequence(s, sequence, '>'); + readSequence(s, f.sequence, '>'); } else { - readSequence(s, sequence, '+'); - char secondChar; - s >> secondChar; - getline(s, secondTitle); - readQualityCodes(s, qualityCodes, sequence.size()); + readSequence(s, f.sequence, '+'); + s >> firstChar; + getline(s, f.secondTitle); + readQualityCodes(s, f.qualityCodes, f.sequence.size()); // perhaps check whether we read enough quality codes } - - if (!s) return s; - - f.title.swap(title); - f.sequence.swap(sequence); - f.secondTitle.swap(secondTitle); - f.qualityCodes.swap(qualityCodes); return s; } static void writeOneLine(std::ostream &s, const std::vector &v) { - std::ostreambuf_iterator o(s); - std::vector::const_iterator b = v.begin(); - std::vector::const_iterator e = v.end(); - while (b != e) { - o = static_cast(*b); - ++b; - } - o = '\n'; + std::streambuf *b = s.rdbuf(); + size_t size = v.size(); + if (size) b->sputn(reinterpret_cast(&v[0]), size); + b->sputc('\n'); } static void writeMultiLines(std::ostream &s, const std::vector &v) { - const int lettersPerLine = 50; // ? - std::ostreambuf_iterator o(s); - std::vector::const_iterator b = v.begin(); - std::vector::const_iterator e = v.end(); - std::vector::const_iterator i = b; - while (i != e) { - o = static_cast(*i); - ++i; - if (i - b == lettersPerLine || i == e) { - o = '\n'; - b = i; - } + size_t lettersPerLine = 50; // ? + std::streambuf *b = s.rdbuf(); + size_t size = v.size(); + for (size_t i = 0; i < size; i += lettersPerLine) { + if (size - i < lettersPerLine) lettersPerLine = size - i; + b->sputn(reinterpret_cast(&v[i]), lettersPerLine); + b->sputc('\n'); } } diff --git a/src/mcf_tantan_options.cc b/src/mcf_tantan_options.cc index b04400e..3fdb6ab 100644 --- a/src/mcf_tantan_options.cc +++ b/src/mcf_tantan_options.cc @@ -5,6 +5,8 @@ #include "mcf_util.hh" #include + +#include #include // EXIT_SUCCESS #include @@ -53,7 +55,7 @@ maxCycleLength(-1), // depends on isProtein repeatOffsetProbDecay(0.9), matchScore(0), - mismatchCost(0), + mismatchCost(-1), gapExistenceCost(0), gapExtensionCost(-1), // means: no gaps minMaskProb(0.5), @@ -79,10 +81,10 @@ -d probability decay per period (" + stringify(repeatOffsetProbDecay) + ")\n\ -i match score (BLOSUM62 if -p, else 2 if -f4, else 1)\n\ - -j mismatch cost (BLOSUM62 if -p, else 7 if -f4, else 1)\n\ + -j mismatch cost, 0 means infinite (BLOSUM62 if -p, else 7 if -f4, else 1)\n\ -a gap existence cost (" + stringify(gapExistenceCost) + ")\n\ - -b gap extension cost (7 if -f4, else infinity)\n\ + -b gap extension cost, 0 means no gaps (7 if -f4, else 0)\n\ -s minimum repeat probability for masking (" + stringify(minMaskProb) + ")\n\ -n minimum copy number, affects -f4 only (" @@ -144,7 +146,7 @@ break; case 'j': unstringify(mismatchCost, optarg); - if (mismatchCost <= 0) + if (mismatchCost < 0) badopt(c, optarg); break; case 'a': @@ -152,7 +154,7 @@ break; case 'b': unstringify(gapExtensionCost, optarg); - if (gapExtensionCost <= 0) + if (gapExtensionCost < 0) badopt(c, optarg); break; case 's': @@ -179,9 +181,11 @@ if (maxCycleLength < 0) maxCycleLength = (isProtein ? 50 : 100); - if (!isProtein || matchScore || mismatchCost) { - if (matchScore == 0) matchScore = (outputType == repOut ? 2 : 1); - if (mismatchCost == 0) mismatchCost = (outputType == repOut ? 7 : 1); + if (mismatchCost == 0) mismatchCost = INT_MAX; + + if (!isProtein || matchScore > 0 || mismatchCost > 0) { + if (matchScore < 1) matchScore = (outputType == repOut ? 2 : 1); + if (mismatchCost < 1) mismatchCost = (outputType == repOut ? 7 : 1); } indexOfFirstNonOptionArgument = optind; diff --git a/src/tantan_app.cc b/src/tantan_app.cc index 914e704..ddb4da2 100644 --- a/src/tantan_app.cc +++ b/src/tantan_app.cc @@ -78,7 +78,7 @@ if (options.scoreMatrixFileName) { unfilify(scoreMatrix, options.scoreMatrixFileName); } else if (options.isProtein) { - if (options.matchScore && options.mismatchCost) { + if (options.matchScore > 0 && options.mismatchCost > 0) { scoreMatrix.initMatchMismatch(options.matchScore, options.mismatchCost, Alphabet::protein); } else { @@ -342,15 +342,8 @@ void processOneFile(std::istream &input, std::ostream &output) { bool isFirstSequence = true; - - // This code strives to minimize memory usage. The sequence-reading - // operation does not overwrite the sequence until it finishes - // reading successfully. So, we don't want to overwrite a large, - // old sequence. Hence, we make a brand-new FastaSequence each time - // through the loop. - while (true) { - FastaSequence f; - if (!(input >> f)) break; + FastaSequence f; + while (input >> f) { if (isFirstSequence && !options.isProtein && isDubiousDna(BEG(f.sequence), END(f.sequence))) std::cerr << "tantan: that's some funny-lookin DNA\n"; diff --git a/src/tantan_repeat_finder.cc b/src/tantan_repeat_finder.cc index e229a94..c06c54c 100644 --- a/src/tantan_repeat_finder.cc +++ b/src/tantan_repeat_finder.cc @@ -120,25 +120,6 @@ scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground); } -void RepeatFinder::calcBackwardTransitionScores() { - if (endGapScore > -HUGE_VAL) return calcBackwardTransitionScoresWithGaps(); - - double toBackground = f2b + scoresPtr[0]; - double toForeground = -HUGE_VAL; - double *foregroundPtr = scoresPtr + 1; - double *foregroundEnd = foregroundPtr + maxRepeatOffset; - - while (foregroundPtr < foregroundEnd) { - toForeground += b2fGrowth; - double f = *foregroundPtr; - toForeground = std::max(toForeground, f); - *foregroundPtr = std::max(toBackground, f2f0 + f); - ++foregroundPtr; - } - - scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground); -} - void RepeatFinder::calcEmissionScores() { const double *matrixRow = substitutionMatrix[*seqPtr]; double *oldScores = scoresPtr - dpScoresPerLetter; @@ -156,6 +137,36 @@ } std::copy(oldScores + i, scoresPtr, scoresPtr + i); +} + +void RepeatFinder::calcScoresForOneSequencePosition() { + if (endGapScore > -HUGE_VAL) { + calcEmissionScores(); + calcBackwardTransitionScoresWithGaps(); + return; + } + + double *oldScores = scoresPtr - dpScoresPerLetter; + double toBackground = f2b + oldScores[0]; + const double *matrixRow = substitutionMatrix[*seqPtr]; + int maxOffset = maxOffsetInTheSequence(); + double toForeground = -HUGE_VAL; + int i = 1; + + for (; i <= maxOffset; ++i) { + double f = oldScores[i] + matrixRow[seqPtr[-i]]; + toForeground = std::max(toForeground + b2fGrowth, f); + scoresPtr[i] = std::max(toBackground, f2f0 + f); + } + + for (; i <= maxRepeatOffset; ++i) { + toForeground += b2fGrowth; + scoresPtr[i] = toBackground; + } + + std::copy(oldScores + i, scoresPtr, scoresPtr + i); + + scoresPtr[0] = std::max(b2b + oldScores[0], b2fLast + toForeground); } void RepeatFinder::makeCheckpoint() { diff --git a/src/tantan_repeat_finder.hh b/src/tantan_repeat_finder.hh index 8d39b7c..59eb359 100644 --- a/src/tantan_repeat_finder.hh +++ b/src/tantan_repeat_finder.hh @@ -69,8 +69,8 @@ void initializeBackwardScores(); void calcBackwardTransitionScoresWithGaps(); - void calcBackwardTransitionScores(); void calcEmissionScores(); + void calcScoresForOneSequencePosition(); void makeCheckpoint(); void redoCheckpoint(); int offsetWithMaxScore() const; @@ -87,11 +87,6 @@ double scoreWithEmission(const double *matrixRow, int offset) const { return scoresPtr[offset] + matrixRow[seqPtr[-offset]]; } - - void calcScoresForOneSequencePosition() { - calcEmissionScores(); - calcBackwardTransitionScores(); - } }; } diff --git a/test/tantan_test.out b/test/tantan_test.out index 119e32b..8ef9992 100644 --- a/test/tantan_test.out +++ b/test/tantan_test.out @@ -1513,3 +1513,23 @@ chrM 13647 13691 30 1.48276 CCCCACCCTTACTAACATTAACGAAAATAA CCCCACCCTTACTAACATTAACGAAAATAA,CCCCACCCT-ACTAA chrM 15829 15844 6 2.5 AATCCT AATCCT,AATCCT,AAT chrM 16183 16195 1 12 C C,C,C,C,C,C,C,C,C,C,C,C + +SRR019778.4 6 35 7 4.14286 TTGTGTG TGTGTAT,TGTGTGT,TGTGTGT,GGTGTGT,G +SRR019778.42 3 44 4 10.25 TGTG TGTG,TGTG,TGTG,TGTG,TGTG,TGTG,TGTT,TGTT,TTTT,TTTT,T +SRR019778.50 0 21 7 3 GTGTGTT GTGTGTT,GTGTGTT,GAGTGTT +SRR019778.64 30 44 3 4.66667 GAG GAG,GAG,GAG,GAG,GA +SRR019778.65 2 13 2 5.5 TG TG,TG,TG,TG,TG,T +SRR019778.65 4 37 16 2.0625 TGTGTGTGTTTTCTGT TGTGTGTGTTTTCTGT,TGTGTGTGTTTTTTGT,T +SRR019778.78 9 31 10 2.2 TGCCTTACTA TGCCTTACTA,TGCCTTACTA,TG +SRR019778.95 2 18 4 4 TGAT TGAT,TGAT,TGAT,TGAT +SRR019778.95 22 45 4 5.75 ATAG ATAG,ATAG,ATAG,ATAG,ATAG,ATA + +SRR019778.4 28 38 2 5 GT GT,GT,GT,GT,GT +SRR019778.42 3 30 2 13.5 TG TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,T +SRR019778.42 33 44 1 11 T T,T,T,T,T,T,T,T,T,T,T +SRR019778.45 18 37 9 2.11111 TGTGTGTGT TGTGTGTGT,TGTGTGTGT,T +SRR019778.64 30 44 3 4.66667 GAG GAG,GAG,GAG,GAG,GA +SRR019778.65 2 13 2 5.5 TG TG,TG,TG,TG,TG,T +SRR019778.78 9 31 10 2.2 TGCCTTACTA TGCCTTACTA,TGCCTTACTA,TG +SRR019778.95 2 18 4 4 TGAT TGAT,TGAT,TGAT,TGAT +SRR019778.95 22 45 4 5.75 ATAG ATAG,ATAG,ATAG,ATAG,ATAG,ATA diff --git a/test/tantan_test.sh b/test/tantan_test.sh index 71c23dc..54d1a6a 100755 --- a/test/tantan_test.sh +++ b/test/tantan_test.sh @@ -35,5 +35,8 @@ tantan -f4 -b12 hard.fa echo tantan -f4 -n1 hg19_chrM.fa -} 2>&1 | -diff -u tantan_test.out - + echo + tantan -f4 -b0 panda.fastq + echo + tantan -f4 -b0 -j0 panda.fastq +} 2>&1 | diff -u tantan_test.out -