Codebase list tantan / 504a5cf
Update upstream source from tag 'upstream/39' Update to upstream version '39' with Debian dir bb2c9687829ebdbc69ad414798029b1c9fb1eb7f Nilesh Patra 1 year, 8 months ago
9 changed file(s) with 127 addition(s) and 105 deletion(s). Raw diff Collapse all Expand all
142142 -w maximum tandem repeat period to consider
143143 -d probability decay per period (period-(i+1) / period-i)
144144 -i match score
145 -j mismatch cost
145 -j mismatch cost (as a special case, 0 means no mismatches)
146146 -a gap existence cost
147 -b gap extension cost
147 -b gap extension cost (as a special case, 0 means no gaps)
148148 -s minimum repeat probability for masking
149149 -n minimum copy number, affects -f4 only
150150 -f output type: 0=masked sequence, 1=repeat probabilities,
194194 lowercase letters are insertions relative to the previous repeat unit,
195195 and dashes are deletions relative to the previous repeat unit.
196196
197 You can forbid insertions and deletions (which is faster) with
198 ``-b0``::
199
200 tantan -f4 -b0 seqs.fa
201
202 You can also forbid mismatches with ``-j0``, so this gets exact
203 repeats only::
204
205 tantan -f4 -b0 -j0 seqs.fa
206
197207 Miscellaneous
198208 -------------
199209
99 rm -f ../bin/tantan
1010
1111 VERSION1 = git describe --dirty
12 VERSION2 = echo ' (HEAD -> main, tag: 31) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
12 VERSION2 = echo ' (HEAD -> main, tag: 39) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
1313
1414 VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
1515
11
22 #include "mcf_fasta_sequence.hh"
33
4 #include <cctype> // isspace
4 #include <stddef.h>
5
56 //#include <iostream> // for debugging
67 #include <istream>
7 #include <iterator> // istreambuf_iterator, ostreambuf_iterator
88 #include <ostream>
9 #include <streambuf>
910
1011 namespace mcf {
1112
1213 static void readSequence(std::istream &s, std::vector<uchar> &sequence,
1314 char delimiter) {
1415 if (!s) return;
15 std::istreambuf_iterator<char> inpos(s);
16 std::istreambuf_iterator<char> endpos;
17 while (inpos != endpos) {
18 char c = *inpos;
19 if (c == delimiter) break;
20 uchar u = static_cast<uchar>(c);
21 if (!std::isspace(u)) sequence.push_back(u);
22 ++inpos;
16 std::streambuf *b = s.rdbuf();
17 int c = b->sgetc();
18 while (c != std::streambuf::traits_type::eof()) {
19 if (c > ' ') {
20 if (c == delimiter) break;
21 sequence.push_back(c);
22 }
23 c = b->snextc();
2324 }
2425 }
2526
2627 static void readQualityCodes(std::istream &s, std::vector<uchar> &qualityCodes,
2728 std::vector<uchar>::size_type howMany) {
2829 if (!s) return;
29 std::istreambuf_iterator<char> inpos(s);
30 std::istreambuf_iterator<char> endpos;
31 while (inpos != endpos) {
32 if (qualityCodes.size() == howMany) break;
33 char c = *inpos;
34 uchar u = static_cast<uchar>(c);
35 if (!std::isspace(u)) qualityCodes.push_back(u);
36 ++inpos;
30 std::streambuf *b = s.rdbuf();
31 while (howMany > 0) {
32 int c = b->sbumpc();
33 if (c == std::streambuf::traits_type::eof()) break; // xxx ???
34 if (c > ' ') {
35 qualityCodes.push_back(c);
36 --howMany;
37 }
3738 }
3839 }
3940
4041 std::istream &operator>>(std::istream &s, FastaSequence &f) {
41 std::string title;
42 std::vector<uchar> sequence;
43 std::string secondTitle;
44 std::vector<uchar> qualityCodes;
45
4642 char firstChar = '>';
4743 s >> firstChar;
4844 if (firstChar != '>' && firstChar != '@') s.setstate(std::ios::failbit);
49 getline(s, title);
45 if (!s) return s;
46
47 f.title.clear();
48 f.sequence.clear();
49 f.secondTitle.clear();
50 f.qualityCodes.clear();
51
52 getline(s, f.title);
5053
5154 if (firstChar == '>') {
52 readSequence(s, sequence, '>');
55 readSequence(s, f.sequence, '>');
5356 } else {
54 readSequence(s, sequence, '+');
55 char secondChar;
56 s >> secondChar;
57 getline(s, secondTitle);
58 readQualityCodes(s, qualityCodes, sequence.size());
57 readSequence(s, f.sequence, '+');
58 s >> firstChar;
59 getline(s, f.secondTitle);
60 readQualityCodes(s, f.qualityCodes, f.sequence.size());
5961 // perhaps check whether we read enough quality codes
6062 }
61
62 if (!s) return s;
63
64 f.title.swap(title);
65 f.sequence.swap(sequence);
66 f.secondTitle.swap(secondTitle);
67 f.qualityCodes.swap(qualityCodes);
6863
6964 return s;
7065 }
7166
7267 static void writeOneLine(std::ostream &s, const std::vector<uchar> &v) {
73 std::ostreambuf_iterator<char> o(s);
74 std::vector<uchar>::const_iterator b = v.begin();
75 std::vector<uchar>::const_iterator e = v.end();
76 while (b != e) {
77 o = static_cast<char>(*b);
78 ++b;
79 }
80 o = '\n';
68 std::streambuf *b = s.rdbuf();
69 size_t size = v.size();
70 if (size) b->sputn(reinterpret_cast<const char *>(&v[0]), size);
71 b->sputc('\n');
8172 }
8273
8374 static void writeMultiLines(std::ostream &s, const std::vector<uchar> &v) {
84 const int lettersPerLine = 50; // ?
85 std::ostreambuf_iterator<char> o(s);
86 std::vector<uchar>::const_iterator b = v.begin();
87 std::vector<uchar>::const_iterator e = v.end();
88 std::vector<uchar>::const_iterator i = b;
89 while (i != e) {
90 o = static_cast<char>(*i);
91 ++i;
92 if (i - b == lettersPerLine || i == e) {
93 o = '\n';
94 b = i;
95 }
75 size_t lettersPerLine = 50; // ?
76 std::streambuf *b = s.rdbuf();
77 size_t size = v.size();
78 for (size_t i = 0; i < size; i += lettersPerLine) {
79 if (size - i < lettersPerLine) lettersPerLine = size - i;
80 b->sputn(reinterpret_cast<const char *>(&v[i]), lettersPerLine);
81 b->sputc('\n');
9682 }
9783 }
9884
44 #include "mcf_util.hh"
55
66 #include <unistd.h>
7
8 #include <limits.h>
79
810 #include <cstdlib> // EXIT_SUCCESS
911 #include <iostream>
5254 maxCycleLength(-1), // depends on isProtein
5355 repeatOffsetProbDecay(0.9),
5456 matchScore(0),
55 mismatchCost(0),
57 mismatchCost(-1),
5658 gapExistenceCost(0),
5759 gapExtensionCost(-1), // means: no gaps
5860 minMaskProb(0.5),
7880 -d probability decay per period ("
7981 + stringify(repeatOffsetProbDecay) + ")\n\
8082 -i match score (BLOSUM62 if -p, else 2 if -f4, else 1)\n\
81 -j mismatch cost (BLOSUM62 if -p, else 7 if -f4, else 1)\n\
83 -j mismatch cost, 0 means infinite (BLOSUM62 if -p, else 7 if -f4, else 1)\n\
8284 -a gap existence cost ("
8385 + stringify(gapExistenceCost) + ")\n\
84 -b gap extension cost (7 if -f4, else infinity)\n\
86 -b gap extension cost, 0 means no gaps (7 if -f4, else 0)\n\
8587 -s minimum repeat probability for masking ("
8688 + stringify(minMaskProb) + ")\n\
8789 -n minimum copy number, affects -f4 only ("
143145 break;
144146 case 'j':
145147 unstringify(mismatchCost, optarg);
146 if (mismatchCost <= 0)
148 if (mismatchCost < 0)
147149 badopt(c, optarg);
148150 break;
149151 case 'a':
151153 break;
152154 case 'b':
153155 unstringify(gapExtensionCost, optarg);
154 if (gapExtensionCost <= 0)
156 if (gapExtensionCost < 0)
155157 badopt(c, optarg);
156158 break;
157159 case 's':
178180
179181 if (maxCycleLength < 0) maxCycleLength = (isProtein ? 50 : 100);
180182
181 if (!isProtein || matchScore || mismatchCost) {
182 if (matchScore == 0) matchScore = (outputType == repOut ? 2 : 1);
183 if (mismatchCost == 0) mismatchCost = (outputType == repOut ? 7 : 1);
183 if (mismatchCost == 0) mismatchCost = INT_MAX;
184
185 if (!isProtein || matchScore > 0 || mismatchCost > 0) {
186 if (matchScore < 1) matchScore = (outputType == repOut ? 2 : 1);
187 if (mismatchCost < 1) mismatchCost = (outputType == repOut ? 7 : 1);
184188 }
185189
186190 indexOfFirstNonOptionArgument = optind;
7777 if (options.scoreMatrixFileName) {
7878 unfilify(scoreMatrix, options.scoreMatrixFileName);
7979 } else if (options.isProtein) {
80 if (options.matchScore && options.mismatchCost) {
80 if (options.matchScore > 0 && options.mismatchCost > 0) {
8181 scoreMatrix.initMatchMismatch(options.matchScore, options.mismatchCost,
8282 Alphabet::protein);
8383 } else {
341341
342342 void processOneFile(std::istream &input, std::ostream &output) {
343343 bool isFirstSequence = true;
344
345 // This code strives to minimize memory usage. The sequence-reading
346 // operation does not overwrite the sequence until it finishes
347 // reading successfully. So, we don't want to overwrite a large,
348 // old sequence. Hence, we make a brand-new FastaSequence each time
349 // through the loop.
350 while (true) {
351 FastaSequence f;
352 if (!(input >> f)) break;
344 FastaSequence f;
345 while (input >> f) {
353346 if (isFirstSequence && !options.isProtein &&
354347 isDubiousDna(BEG(f.sequence), END(f.sequence)))
355348 std::cerr << "tantan: that's some funny-lookin DNA\n";
119119 scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground);
120120 }
121121
122 void RepeatFinder::calcBackwardTransitionScores() {
123 if (endGapScore > -HUGE_VAL) return calcBackwardTransitionScoresWithGaps();
124
125 double toBackground = f2b + scoresPtr[0];
126 double toForeground = -HUGE_VAL;
127 double *foregroundPtr = scoresPtr + 1;
128 double *foregroundEnd = foregroundPtr + maxRepeatOffset;
129
130 while (foregroundPtr < foregroundEnd) {
131 toForeground += b2fGrowth;
132 double f = *foregroundPtr;
133 toForeground = std::max(toForeground, f);
134 *foregroundPtr = std::max(toBackground, f2f0 + f);
135 ++foregroundPtr;
136 }
137
138 scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground);
139 }
140
141122 void RepeatFinder::calcEmissionScores() {
142123 const double *matrixRow = substitutionMatrix[*seqPtr];
143124 double *oldScores = scoresPtr - dpScoresPerLetter;
155136 }
156137
157138 std::copy(oldScores + i, scoresPtr, scoresPtr + i);
139 }
140
141 void RepeatFinder::calcScoresForOneSequencePosition() {
142 if (endGapScore > -HUGE_VAL) {
143 calcEmissionScores();
144 calcBackwardTransitionScoresWithGaps();
145 return;
146 }
147
148 double *oldScores = scoresPtr - dpScoresPerLetter;
149 double toBackground = f2b + oldScores[0];
150 const double *matrixRow = substitutionMatrix[*seqPtr];
151 int maxOffset = maxOffsetInTheSequence();
152 double toForeground = -HUGE_VAL;
153 int i = 1;
154
155 for (; i <= maxOffset; ++i) {
156 double f = oldScores[i] + matrixRow[seqPtr[-i]];
157 toForeground = std::max(toForeground + b2fGrowth, f);
158 scoresPtr[i] = std::max(toBackground, f2f0 + f);
159 }
160
161 for (; i <= maxRepeatOffset; ++i) {
162 toForeground += b2fGrowth;
163 scoresPtr[i] = toBackground;
164 }
165
166 std::copy(oldScores + i, scoresPtr, scoresPtr + i);
167
168 scoresPtr[0] = std::max(b2b + oldScores[0], b2fLast + toForeground);
158169 }
159170
160171 void RepeatFinder::makeCheckpoint() {
6868
6969 void initializeBackwardScores();
7070 void calcBackwardTransitionScoresWithGaps();
71 void calcBackwardTransitionScores();
7271 void calcEmissionScores();
72 void calcScoresForOneSequencePosition();
7373 void makeCheckpoint();
7474 void redoCheckpoint();
7575 int offsetWithMaxScore() const;
8686 double scoreWithEmission(const double *matrixRow, int offset) const {
8787 return scoresPtr[offset] + matrixRow[seqPtr[-offset]];
8888 }
89
90 void calcScoresForOneSequencePosition() {
91 calcEmissionScores();
92 calcBackwardTransitionScores();
93 }
9489 };
9590
9691 }
15121512 chrM 13647 13691 30 1.48276 CCCCACCCTTACTAACATTAACGAAAATAA CCCCACCCTTACTAACATTAACGAAAATAA,CCCCACCCT-ACTAA
15131513 chrM 15829 15844 6 2.5 AATCCT AATCCT,AATCCT,AAT
15141514 chrM 16183 16195 1 12 C C,C,C,C,C,C,C,C,C,C,C,C
1515
1516 SRR019778.4 6 35 7 4.14286 TTGTGTG TGTGTAT,TGTGTGT,TGTGTGT,GGTGTGT,G
1517 SRR019778.42 3 44 4 10.25 TGTG TGTG,TGTG,TGTG,TGTG,TGTG,TGTG,TGTT,TGTT,TTTT,TTTT,T
1518 SRR019778.50 0 21 7 3 GTGTGTT GTGTGTT,GTGTGTT,GAGTGTT
1519 SRR019778.64 30 44 3 4.66667 GAG GAG,GAG,GAG,GAG,GA
1520 SRR019778.65 2 13 2 5.5 TG TG,TG,TG,TG,TG,T
1521 SRR019778.65 4 37 16 2.0625 TGTGTGTGTTTTCTGT TGTGTGTGTTTTCTGT,TGTGTGTGTTTTTTGT,T
1522 SRR019778.78 9 31 10 2.2 TGCCTTACTA TGCCTTACTA,TGCCTTACTA,TG
1523 SRR019778.95 2 18 4 4 TGAT TGAT,TGAT,TGAT,TGAT
1524 SRR019778.95 22 45 4 5.75 ATAG ATAG,ATAG,ATAG,ATAG,ATAG,ATA
1525
1526 SRR019778.4 28 38 2 5 GT GT,GT,GT,GT,GT
1527 SRR019778.42 3 30 2 13.5 TG TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,T
1528 SRR019778.42 33 44 1 11 T T,T,T,T,T,T,T,T,T,T,T
1529 SRR019778.45 18 37 9 2.11111 TGTGTGTGT TGTGTGTGT,TGTGTGTGT,T
1530 SRR019778.64 30 44 3 4.66667 GAG GAG,GAG,GAG,GAG,GA
1531 SRR019778.65 2 13 2 5.5 TG TG,TG,TG,TG,TG,T
1532 SRR019778.78 9 31 10 2.2 TGCCTTACTA TGCCTTACTA,TGCCTTACTA,TG
1533 SRR019778.95 2 18 4 4 TGAT TGAT,TGAT,TGAT,TGAT
1534 SRR019778.95 22 45 4 5.75 ATAG ATAG,ATAG,ATAG,ATAG,ATAG,ATA
3434 tantan -f4 -b12 hard.fa
3535 echo
3636 tantan -f4 -n1 hg19_chrM.fa
37 } 2>&1 |
38 diff -u tantan_test.out -
37 echo
38 tantan -f4 -b0 panda.fastq
39 echo
40 tantan -f4 -b0 -j0 panda.fastq
41 } 2>&1 | diff -u tantan_test.out -