Update upstream source from tag 'upstream/39'
Update to upstream version '39'
with Debian dir bb2c9687829ebdbc69ad414798029b1c9fb1eb7f
Nilesh Patra
1 year, 8 months ago
142 | 142 | -w maximum tandem repeat period to consider |
143 | 143 | -d probability decay per period (period-(i+1) / period-i) |
144 | 144 | -i match score |
145 | -j mismatch cost | |
145 | -j mismatch cost (as a special case, 0 means no mismatches) | |
146 | 146 | -a gap existence cost |
147 | -b gap extension cost | |
147 | -b gap extension cost (as a special case, 0 means no gaps) | |
148 | 148 | -s minimum repeat probability for masking |
149 | 149 | -n minimum copy number, affects -f4 only |
150 | 150 | -f output type: 0=masked sequence, 1=repeat probabilities, |
194 | 194 | lowercase letters are insertions relative to the previous repeat unit, |
195 | 195 | and dashes are deletions relative to the previous repeat unit. |
196 | 196 | |
197 | You can forbid insertions and deletions (which is faster) with | |
198 | ``-b0``:: | |
199 | ||
200 | tantan -f4 -b0 seqs.fa | |
201 | ||
202 | You can also forbid mismatches with ``-j0``, so this gets exact | |
203 | repeats only:: | |
204 | ||
205 | tantan -f4 -b0 -j0 seqs.fa | |
206 | ||
197 | 207 | Miscellaneous |
198 | 208 | ------------- |
199 | 209 |
9 | 9 | rm -f ../bin/tantan |
10 | 10 | |
11 | 11 | VERSION1 = git describe --dirty |
12 | VERSION2 = echo ' (HEAD -> main, tag: 31) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//' | |
12 | VERSION2 = echo ' (HEAD -> main, tag: 39) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//' | |
13 | 13 | |
14 | 14 | VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\" |
15 | 15 |
1 | 1 | |
2 | 2 | #include "mcf_fasta_sequence.hh" |
3 | 3 | |
4 | #include <cctype> // isspace | |
4 | #include <stddef.h> | |
5 | ||
5 | 6 | //#include <iostream> // for debugging |
6 | 7 | #include <istream> |
7 | #include <iterator> // istreambuf_iterator, ostreambuf_iterator | |
8 | 8 | #include <ostream> |
9 | #include <streambuf> | |
9 | 10 | |
10 | 11 | namespace mcf { |
11 | 12 | |
12 | 13 | static void readSequence(std::istream &s, std::vector<uchar> &sequence, |
13 | 14 | char delimiter) { |
14 | 15 | if (!s) return; |
15 | std::istreambuf_iterator<char> inpos(s); | |
16 | std::istreambuf_iterator<char> endpos; | |
17 | while (inpos != endpos) { | |
18 | char c = *inpos; | |
19 | if (c == delimiter) break; | |
20 | uchar u = static_cast<uchar>(c); | |
21 | if (!std::isspace(u)) sequence.push_back(u); | |
22 | ++inpos; | |
16 | std::streambuf *b = s.rdbuf(); | |
17 | int c = b->sgetc(); | |
18 | while (c != std::streambuf::traits_type::eof()) { | |
19 | if (c > ' ') { | |
20 | if (c == delimiter) break; | |
21 | sequence.push_back(c); | |
22 | } | |
23 | c = b->snextc(); | |
23 | 24 | } |
24 | 25 | } |
25 | 26 | |
26 | 27 | static void readQualityCodes(std::istream &s, std::vector<uchar> &qualityCodes, |
27 | 28 | std::vector<uchar>::size_type howMany) { |
28 | 29 | if (!s) return; |
29 | std::istreambuf_iterator<char> inpos(s); | |
30 | std::istreambuf_iterator<char> endpos; | |
31 | while (inpos != endpos) { | |
32 | if (qualityCodes.size() == howMany) break; | |
33 | char c = *inpos; | |
34 | uchar u = static_cast<uchar>(c); | |
35 | if (!std::isspace(u)) qualityCodes.push_back(u); | |
36 | ++inpos; | |
30 | std::streambuf *b = s.rdbuf(); | |
31 | while (howMany > 0) { | |
32 | int c = b->sbumpc(); | |
33 | if (c == std::streambuf::traits_type::eof()) break; // xxx ??? | |
34 | if (c > ' ') { | |
35 | qualityCodes.push_back(c); | |
36 | --howMany; | |
37 | } | |
37 | 38 | } |
38 | 39 | } |
39 | 40 | |
40 | 41 | std::istream &operator>>(std::istream &s, FastaSequence &f) { |
41 | std::string title; | |
42 | std::vector<uchar> sequence; | |
43 | std::string secondTitle; | |
44 | std::vector<uchar> qualityCodes; | |
45 | ||
46 | 42 | char firstChar = '>'; |
47 | 43 | s >> firstChar; |
48 | 44 | if (firstChar != '>' && firstChar != '@') s.setstate(std::ios::failbit); |
49 | getline(s, title); | |
45 | if (!s) return s; | |
46 | ||
47 | f.title.clear(); | |
48 | f.sequence.clear(); | |
49 | f.secondTitle.clear(); | |
50 | f.qualityCodes.clear(); | |
51 | ||
52 | getline(s, f.title); | |
50 | 53 | |
51 | 54 | if (firstChar == '>') { |
52 | readSequence(s, sequence, '>'); | |
55 | readSequence(s, f.sequence, '>'); | |
53 | 56 | } else { |
54 | readSequence(s, sequence, '+'); | |
55 | char secondChar; | |
56 | s >> secondChar; | |
57 | getline(s, secondTitle); | |
58 | readQualityCodes(s, qualityCodes, sequence.size()); | |
57 | readSequence(s, f.sequence, '+'); | |
58 | s >> firstChar; | |
59 | getline(s, f.secondTitle); | |
60 | readQualityCodes(s, f.qualityCodes, f.sequence.size()); | |
59 | 61 | // perhaps check whether we read enough quality codes |
60 | 62 | } |
61 | ||
62 | if (!s) return s; | |
63 | ||
64 | f.title.swap(title); | |
65 | f.sequence.swap(sequence); | |
66 | f.secondTitle.swap(secondTitle); | |
67 | f.qualityCodes.swap(qualityCodes); | |
68 | 63 | |
69 | 64 | return s; |
70 | 65 | } |
71 | 66 | |
72 | 67 | static void writeOneLine(std::ostream &s, const std::vector<uchar> &v) { |
73 | std::ostreambuf_iterator<char> o(s); | |
74 | std::vector<uchar>::const_iterator b = v.begin(); | |
75 | std::vector<uchar>::const_iterator e = v.end(); | |
76 | while (b != e) { | |
77 | o = static_cast<char>(*b); | |
78 | ++b; | |
79 | } | |
80 | o = '\n'; | |
68 | std::streambuf *b = s.rdbuf(); | |
69 | size_t size = v.size(); | |
70 | if (size) b->sputn(reinterpret_cast<const char *>(&v[0]), size); | |
71 | b->sputc('\n'); | |
81 | 72 | } |
82 | 73 | |
83 | 74 | static void writeMultiLines(std::ostream &s, const std::vector<uchar> &v) { |
84 | const int lettersPerLine = 50; // ? | |
85 | std::ostreambuf_iterator<char> o(s); | |
86 | std::vector<uchar>::const_iterator b = v.begin(); | |
87 | std::vector<uchar>::const_iterator e = v.end(); | |
88 | std::vector<uchar>::const_iterator i = b; | |
89 | while (i != e) { | |
90 | o = static_cast<char>(*i); | |
91 | ++i; | |
92 | if (i - b == lettersPerLine || i == e) { | |
93 | o = '\n'; | |
94 | b = i; | |
95 | } | |
75 | size_t lettersPerLine = 50; // ? | |
76 | std::streambuf *b = s.rdbuf(); | |
77 | size_t size = v.size(); | |
78 | for (size_t i = 0; i < size; i += lettersPerLine) { | |
79 | if (size - i < lettersPerLine) lettersPerLine = size - i; | |
80 | b->sputn(reinterpret_cast<const char *>(&v[i]), lettersPerLine); | |
81 | b->sputc('\n'); | |
96 | 82 | } |
97 | 83 | } |
98 | 84 |
4 | 4 | #include "mcf_util.hh" |
5 | 5 | |
6 | 6 | #include <unistd.h> |
7 | ||
8 | #include <limits.h> | |
7 | 9 | |
8 | 10 | #include <cstdlib> // EXIT_SUCCESS |
9 | 11 | #include <iostream> |
52 | 54 | maxCycleLength(-1), // depends on isProtein |
53 | 55 | repeatOffsetProbDecay(0.9), |
54 | 56 | matchScore(0), |
55 | mismatchCost(0), | |
57 | mismatchCost(-1), | |
56 | 58 | gapExistenceCost(0), |
57 | 59 | gapExtensionCost(-1), // means: no gaps |
58 | 60 | minMaskProb(0.5), |
78 | 80 | -d probability decay per period (" |
79 | 81 | + stringify(repeatOffsetProbDecay) + ")\n\ |
80 | 82 | -i match score (BLOSUM62 if -p, else 2 if -f4, else 1)\n\ |
81 | -j mismatch cost (BLOSUM62 if -p, else 7 if -f4, else 1)\n\ | |
83 | -j mismatch cost, 0 means infinite (BLOSUM62 if -p, else 7 if -f4, else 1)\n\ | |
82 | 84 | -a gap existence cost (" |
83 | 85 | + stringify(gapExistenceCost) + ")\n\ |
84 | -b gap extension cost (7 if -f4, else infinity)\n\ | |
86 | -b gap extension cost, 0 means no gaps (7 if -f4, else 0)\n\ | |
85 | 87 | -s minimum repeat probability for masking (" |
86 | 88 | + stringify(minMaskProb) + ")\n\ |
87 | 89 | -n minimum copy number, affects -f4 only (" |
143 | 145 | break; |
144 | 146 | case 'j': |
145 | 147 | unstringify(mismatchCost, optarg); |
146 | if (mismatchCost <= 0) | |
148 | if (mismatchCost < 0) | |
147 | 149 | badopt(c, optarg); |
148 | 150 | break; |
149 | 151 | case 'a': |
151 | 153 | break; |
152 | 154 | case 'b': |
153 | 155 | unstringify(gapExtensionCost, optarg); |
154 | if (gapExtensionCost <= 0) | |
156 | if (gapExtensionCost < 0) | |
155 | 157 | badopt(c, optarg); |
156 | 158 | break; |
157 | 159 | case 's': |
178 | 180 | |
179 | 181 | if (maxCycleLength < 0) maxCycleLength = (isProtein ? 50 : 100); |
180 | 182 | |
181 | if (!isProtein || matchScore || mismatchCost) { | |
182 | if (matchScore == 0) matchScore = (outputType == repOut ? 2 : 1); | |
183 | if (mismatchCost == 0) mismatchCost = (outputType == repOut ? 7 : 1); | |
183 | if (mismatchCost == 0) mismatchCost = INT_MAX; | |
184 | ||
185 | if (!isProtein || matchScore > 0 || mismatchCost > 0) { | |
186 | if (matchScore < 1) matchScore = (outputType == repOut ? 2 : 1); | |
187 | if (mismatchCost < 1) mismatchCost = (outputType == repOut ? 7 : 1); | |
184 | 188 | } |
185 | 189 | |
186 | 190 | indexOfFirstNonOptionArgument = optind; |
77 | 77 | if (options.scoreMatrixFileName) { |
78 | 78 | unfilify(scoreMatrix, options.scoreMatrixFileName); |
79 | 79 | } else if (options.isProtein) { |
80 | if (options.matchScore && options.mismatchCost) { | |
80 | if (options.matchScore > 0 && options.mismatchCost > 0) { | |
81 | 81 | scoreMatrix.initMatchMismatch(options.matchScore, options.mismatchCost, |
82 | 82 | Alphabet::protein); |
83 | 83 | } else { |
341 | 341 | |
342 | 342 | void processOneFile(std::istream &input, std::ostream &output) { |
343 | 343 | bool isFirstSequence = true; |
344 | ||
345 | // This code strives to minimize memory usage. The sequence-reading | |
346 | // operation does not overwrite the sequence until it finishes | |
347 | // reading successfully. So, we don't want to overwrite a large, | |
348 | // old sequence. Hence, we make a brand-new FastaSequence each time | |
349 | // through the loop. | |
350 | while (true) { | |
351 | FastaSequence f; | |
352 | if (!(input >> f)) break; | |
344 | FastaSequence f; | |
345 | while (input >> f) { | |
353 | 346 | if (isFirstSequence && !options.isProtein && |
354 | 347 | isDubiousDna(BEG(f.sequence), END(f.sequence))) |
355 | 348 | std::cerr << "tantan: that's some funny-lookin DNA\n"; |
119 | 119 | scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground); |
120 | 120 | } |
121 | 121 | |
122 | void RepeatFinder::calcBackwardTransitionScores() { | |
123 | if (endGapScore > -HUGE_VAL) return calcBackwardTransitionScoresWithGaps(); | |
124 | ||
125 | double toBackground = f2b + scoresPtr[0]; | |
126 | double toForeground = -HUGE_VAL; | |
127 | double *foregroundPtr = scoresPtr + 1; | |
128 | double *foregroundEnd = foregroundPtr + maxRepeatOffset; | |
129 | ||
130 | while (foregroundPtr < foregroundEnd) { | |
131 | toForeground += b2fGrowth; | |
132 | double f = *foregroundPtr; | |
133 | toForeground = std::max(toForeground, f); | |
134 | *foregroundPtr = std::max(toBackground, f2f0 + f); | |
135 | ++foregroundPtr; | |
136 | } | |
137 | ||
138 | scoresPtr[0] = std::max(b2b + scoresPtr[0], b2fLast + toForeground); | |
139 | } | |
140 | ||
141 | 122 | void RepeatFinder::calcEmissionScores() { |
142 | 123 | const double *matrixRow = substitutionMatrix[*seqPtr]; |
143 | 124 | double *oldScores = scoresPtr - dpScoresPerLetter; |
155 | 136 | } |
156 | 137 | |
157 | 138 | std::copy(oldScores + i, scoresPtr, scoresPtr + i); |
139 | } | |
140 | ||
141 | void RepeatFinder::calcScoresForOneSequencePosition() { | |
142 | if (endGapScore > -HUGE_VAL) { | |
143 | calcEmissionScores(); | |
144 | calcBackwardTransitionScoresWithGaps(); | |
145 | return; | |
146 | } | |
147 | ||
148 | double *oldScores = scoresPtr - dpScoresPerLetter; | |
149 | double toBackground = f2b + oldScores[0]; | |
150 | const double *matrixRow = substitutionMatrix[*seqPtr]; | |
151 | int maxOffset = maxOffsetInTheSequence(); | |
152 | double toForeground = -HUGE_VAL; | |
153 | int i = 1; | |
154 | ||
155 | for (; i <= maxOffset; ++i) { | |
156 | double f = oldScores[i] + matrixRow[seqPtr[-i]]; | |
157 | toForeground = std::max(toForeground + b2fGrowth, f); | |
158 | scoresPtr[i] = std::max(toBackground, f2f0 + f); | |
159 | } | |
160 | ||
161 | for (; i <= maxRepeatOffset; ++i) { | |
162 | toForeground += b2fGrowth; | |
163 | scoresPtr[i] = toBackground; | |
164 | } | |
165 | ||
166 | std::copy(oldScores + i, scoresPtr, scoresPtr + i); | |
167 | ||
168 | scoresPtr[0] = std::max(b2b + oldScores[0], b2fLast + toForeground); | |
158 | 169 | } |
159 | 170 | |
160 | 171 | void RepeatFinder::makeCheckpoint() { |
68 | 68 | |
69 | 69 | void initializeBackwardScores(); |
70 | 70 | void calcBackwardTransitionScoresWithGaps(); |
71 | void calcBackwardTransitionScores(); | |
72 | 71 | void calcEmissionScores(); |
72 | void calcScoresForOneSequencePosition(); | |
73 | 73 | void makeCheckpoint(); |
74 | 74 | void redoCheckpoint(); |
75 | 75 | int offsetWithMaxScore() const; |
86 | 86 | double scoreWithEmission(const double *matrixRow, int offset) const { |
87 | 87 | return scoresPtr[offset] + matrixRow[seqPtr[-offset]]; |
88 | 88 | } |
89 | ||
90 | void calcScoresForOneSequencePosition() { | |
91 | calcEmissionScores(); | |
92 | calcBackwardTransitionScores(); | |
93 | } | |
94 | 89 | }; |
95 | 90 | |
96 | 91 | } |
1512 | 1512 | chrM 13647 13691 30 1.48276 CCCCACCCTTACTAACATTAACGAAAATAA CCCCACCCTTACTAACATTAACGAAAATAA,CCCCACCCT-ACTAA |
1513 | 1513 | chrM 15829 15844 6 2.5 AATCCT AATCCT,AATCCT,AAT |
1514 | 1514 | chrM 16183 16195 1 12 C C,C,C,C,C,C,C,C,C,C,C,C |
1515 | ||
1516 | SRR019778.4 6 35 7 4.14286 TTGTGTG TGTGTAT,TGTGTGT,TGTGTGT,GGTGTGT,G | |
1517 | SRR019778.42 3 44 4 10.25 TGTG TGTG,TGTG,TGTG,TGTG,TGTG,TGTG,TGTT,TGTT,TTTT,TTTT,T | |
1518 | SRR019778.50 0 21 7 3 GTGTGTT GTGTGTT,GTGTGTT,GAGTGTT | |
1519 | SRR019778.64 30 44 3 4.66667 GAG GAG,GAG,GAG,GAG,GA | |
1520 | SRR019778.65 2 13 2 5.5 TG TG,TG,TG,TG,TG,T | |
1521 | SRR019778.65 4 37 16 2.0625 TGTGTGTGTTTTCTGT TGTGTGTGTTTTCTGT,TGTGTGTGTTTTTTGT,T | |
1522 | SRR019778.78 9 31 10 2.2 TGCCTTACTA TGCCTTACTA,TGCCTTACTA,TG | |
1523 | SRR019778.95 2 18 4 4 TGAT TGAT,TGAT,TGAT,TGAT | |
1524 | SRR019778.95 22 45 4 5.75 ATAG ATAG,ATAG,ATAG,ATAG,ATAG,ATA | |
1525 | ||
1526 | SRR019778.4 28 38 2 5 GT GT,GT,GT,GT,GT | |
1527 | SRR019778.42 3 30 2 13.5 TG TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,TG,T | |
1528 | SRR019778.42 33 44 1 11 T T,T,T,T,T,T,T,T,T,T,T | |
1529 | SRR019778.45 18 37 9 2.11111 TGTGTGTGT TGTGTGTGT,TGTGTGTGT,T | |
1530 | SRR019778.64 30 44 3 4.66667 GAG GAG,GAG,GAG,GAG,GA | |
1531 | SRR019778.65 2 13 2 5.5 TG TG,TG,TG,TG,TG,T | |
1532 | SRR019778.78 9 31 10 2.2 TGCCTTACTA TGCCTTACTA,TGCCTTACTA,TG | |
1533 | SRR019778.95 2 18 4 4 TGAT TGAT,TGAT,TGAT,TGAT | |
1534 | SRR019778.95 22 45 4 5.75 ATAG ATAG,ATAG,ATAG,ATAG,ATAG,ATA |