Codebase list kmc / 01c3c17
[ Michael R. Crusoe ] [ Debian Janitor ] New upstream snapshot. Debian Janitor 2 years ago
13 changed file(s) with 326 addition(s) and 158 deletion(s). Raw diff Collapse all Expand all
00 KMC
11 =
2 [![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/kmc/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/KMC/releases)
3 [![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/kmc.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/kmc)
4
25 KMC is a disk-based programm for counting k-mers from (possibly gzipped) FASTQ/FASTA files.
36 The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
47
0 kmc (3.1.1+dfsg-4) UNRELEASED; urgency=medium
0 kmc (3.1.2rc1+git20201001.1.a664708-1) UNRELEASED; urgency=medium
11
2 [ Michael R. Crusoe ]
23 * Team upload.
34 * debian/control: mark libkmc-dev as Multi-Arch: same
45
5 -- Michael R. Crusoe <crusoe@debian.org> Mon, 25 Jan 2021 08:04:44 +0100
6 [ Debian Janitor ]
7 * New upstream snapshot.
8
9 -- Michael R. Crusoe <crusoe@debian.org> Thu, 27 May 2021 05:50:55 -0000
610
711 kmc (3.1.1+dfsg-3) unstable; urgency=medium
812
3636 return false;
3737 if ((mmer & 0x3f) == 0x3b) // TGT suffix
3838 return false;
39 if ((mmer & 0x3c) == 0x3c) // TG* suffix
39 if ((mmer & 0x3c) == 0x3c) // TG* suffix !!!! consider issue #152
4040 return false;
4141
4242 for (uint32 j = 0; j < len - 3; ++j)
456456 suffix_file_name = desc.file_src + ".kmc_suf";
457457
458458 suffix_file = fopen(suffix_file_name.c_str(), "rb");
459 setvbuf(suffix_file, NULL, _IONBF, 0);
460
459
461460 if (!suffix_file)
462461 {
463462 std::cerr << "Error: cannot open file: " << suffix_file_name << "\n";
464463 exit(1);
465 }
464
465 }
466
466467 setvbuf(suffix_file, NULL, _IONBF, 0);
467468
468469 char marker[4];
497498 prefix_file_name = desc.file_src + ".kmc_pre";
498499
499500 prefix_file = fopen(prefix_file_name.c_str(), "rb");
500 setvbuf(prefix_file, NULL, _IONBF, 0);
501
501
502502 if (!prefix_file)
503503 {
504504 std::cerr << "Error: cannot open file: " << prefix_file_name << "\n";
505505 exit(1);
506506 }
507 setvbuf(prefix_file, NULL, _IONBF, 0);
508
507509 my_fseek(prefix_file, 4 + sizeof(uint64), SEEK_SET);//skip KMCP and first value as it must be 0
508510
509511 }
122122 std::string kmc_suf_file_name = output_desc.file_src + ".kmc_suf";
123123
124124 kmc_pre = fopen(kmc_pre_file_name.c_str(), "wb");
125
126 if (!kmc_pre)
127 {
128 std::cerr << "Error: cannot open file : " << kmc_pre_file_name << "\n";
129 exit(1);
130 }
131
125132 setvbuf(kmc_pre, NULL, _IONBF, 0);
126133
127 if (!kmc_pre)
128 {
129 std::cerr << "Error: cannot open file : " << kmc_pre_file_name << "\n";
130 exit(1);
131 }
132134 kmc_suf = fopen(kmc_suf_file_name.c_str(), "wb");
133 setvbuf(kmc_suf, NULL, _IONBF, 0);
134135
135136 if (!kmc_suf)
136137 {
138139 std::cerr << "Error: cannot open file : " << kmc_suf_file_name << "\n";
139140 exit(1);
140141 }
142 setvbuf(kmc_suf, NULL, _IONBF, 0);
141143
142144 setvbuf(kmc_pre, NULL, _IONBF, 0);
143145 setvbuf(kmc_suf, NULL, _IONBF, 0);
4141 kmer_len = _kmer_len;
4242
4343 // Size and pointer for the buffer
44 part_size = 1 << 23;
44 part_size = 1 << 23;
4545 part = nullptr;
4646
4747 containsNextChromosome = false;
6060 //----------------------------------------------------------------------------------
6161 // Set part size of the buffer
6262 bool CFastqReader::SetPartSize(uint64 _part_size)
63 {
63 {
6464 if (_part_size < (1 << 20) || _part_size >(1 << 30))
6565 return false;
6666
405405 int64 tmp = i;
406406 bool next_line = SkipNextEOL(part, i, total_filled);
407407 if (!next_line)
408 i = total_filled;
408 i = total_filled;
409409 copy(part + tmp, part + i, part + pos);
410410 last_header_pos = pos;
411411 pos += i - tmp;
467467 return true;
468468 }
469469
470 void CFastqReader::CleanUpAfterLongFastaRead()
471 {
472 pmm_fastq->reserve(part);
473
474 uchar symb;
475 int64 in_part = 0;
476 int64 skip_pos = 0;
477 while (GetNextSymbOfLongReadRecord(symb, skip_pos, in_part))
478 {
479 if (symb == '\n' || symb == '\r')
480 ;
481 else
482 {
483 if (symb != '>')
484 {
485 cerr << "Error: Wrong input file!\n";
486 exit(1);
487 }
488 std::copy(part + skip_pos - 1, part + in_part, part);
489 part_filled = in_part - (skip_pos - 1);
490 return;
491 }
492 }
493
494 //the file has ended
495 part_filled = 0;
496 }
470497 void CFastqReader::CleanUpAfterLongFastqRead(uint32 number_of_lines_to_skip)
471498 {
472499 pmm_fastq->reserve(part);
539566
540567 if (data_src.Finished() && !long_read_in_progress)
541568 {
569 read_type = ReadType::normal_read;
542570 _part = part;
543571 _size = total_filled;
544572
553581 } // Look for the end of the last complete record in a buffer
554582 else if (file_type == fasta) // FASTA files
555583 {
584 if (long_read_in_progress)
585 {
586 //check if there is EOL in the data
587 int64 pos = 0;
588 for (; pos < total_filled; ++pos)
589 {
590 if (part[pos] == '\n' || part[pos] == '\r')
591 {
592 long_read_in_progress = false;
593 break;
594 }
595 }
596
597 if (!long_read_in_progress)
598 {
599 _part = part;
600 _size = pos;
601
602 //all from this part was readed, maybe there is another EOL character in the file
603 if(pos == total_filled)
604 CleanUpAfterLongFastaRead();
605 else //there is still some important data in the part!!
606 {
607 //skip possible eol
608 for (; pos < total_filled; ++pos)
609 if (part[pos] != '\n' && part[pos] != '\r')
610 break;
611
612 pmm_fastq->reserve(part);
613 std::copy(_part + pos, _part + total_filled, part);
614 part_filled = total_filled - pos;
615 }
616 }
617 else
618 {
619 _part = part;
620 _size = total_filled;
621 pmm_fastq->reserve(part);
622 std::copy(_part + total_filled - kmer_len + 1, _part + total_filled, part);
623 part_filled = kmer_len - 1;
624 }
625 return true;
626 }
556627 // Looking for a FASTA record at the end of the area
557628 i = total_filled - 1;
558629 int64 start, end;
582653 // Looking for a FASTQ record at the end of the area
583654 if (!success)
584655 {
585 cerr << "Error: Wrong input file!\n";
586 exit(1);
656 if (readed_lines == 2) //because if successfully readed full 2 lines in the worst case there is only one read that begins at the buffer start, and there is only onle fasta record
657 {
658 std::cerr << "Error: some error while reading fasta file, please contact authors\n";
659 exit(1);
660 }
661 k = 4 - readed_lines;
662 if (line_start[k] != 0)
663 {
664 std::cerr << "Error: some error while reading fasta file, please contact authors\n";
665 exit(1);
666 }
667 if (part[0] != '>')
668 {
669 cerr << "Error: Wrong input file!\n";
670 exit(1);
671 }
672
673 if (readed_lines == 1)
674 {
675 long_read_in_progress = true;
676
677 _part = part;
678 _size = total_filled;
679 read_type = ReadType::long_read;
680
681 //copy last k-1 symbols
682 pmm_fastq->reserve(part);
683 copy(_part + total_filled - kmer_len + 1, _part + total_filled, part);
684 part_filled = kmer_len - 1;
685
686 return true;
687 }
688 else
689 {
690 std::cerr << "Error: some error while reading fasta file, please contact authors\n";
691 exit(1);
692 }
693
694 return true;
587695 }
588696
589697 _part = part;
755863
756864 //----------------------------------------------------------------------------------
757865 // Skip to next EOL from the current position in a buffer
758 bool CFastqReader::SkipNextEOL(uchar *part, int64 &pos, int64 max_pos)
866 bool CFastqReader::SkipNextEOL(uchar *part, int64 &pos, int64 size)
759867 {
760868 int64 i;
761 for (i = pos; i < max_pos - 2; ++i)
869 for (i = pos; i < size - 1; ++i)
762870 if ((part[i] == '\n' || part[i] == '\r') && !(part[i + 1] == '\n' || part[i + 1] == '\r'))
763871 break;
764872
765 if (i >= max_pos - 2)
873 if (i >= size - 1)
766874 return false;
767875
768876 pos = i + 1;
10711179 binary_pack_queue = _binary_pack_queue;
10721180 missingEOL_at_EOF_counter = Queues.missingEOL_at_EOF_counter;
10731181 bam_task_manager = Queues.bam_task_manager;
1074 part_size = Params.fastq_buffer_size;
1182 part_size = Params.fastq_buffer_size;
10751183 part_queue = Queues.part_queue;
10761184 file_type = Params.file_type;
10771185 kmer_len = Params.p_k;
103103
104104 bool containsNextChromosome; //for multiline_fasta processing
105105
106 bool SkipNextEOL(uchar *part, int64 &pos, int64 max_pos);
106 bool SkipNextEOL(uchar *part, int64 &pos, int64 size);
107107
108108 void GetFullLineFromEnd(int64& line_sart, int64& line_end, uchar* buff, int64& pos);
109109
114114
115115 void CleanUpAfterLongFastqRead(uint32 number_of_lines_to_skip);
116116
117 void CleanUpAfterLongFastaRead();
117118 void FixEOLIfNeeded(uchar* part, int64& size);
118119 public:
119120 CFastqReader(CMemoryMonitor *_mm, CMemoryPoolWithBamSupport *_pmm_fastq, input_type _file_type, int _kmer_len,
155155 Params.both_strands = Params.p_both_strands;
156156 Params.without_output = Params.p_without_output;
157157 Params.use_strict_mem = Params.p_strict_mem;
158 Params.homopolymer_compressed = Params.p_homopolymer_compressed;
158159 Params.mem_mode = Params.p_mem_mode;
159160
160161
151151 << " -k<len> - k-mer length (k from " << MIN_K << " to " << MAX_K << "; default: 25)\n"
152152 << " -m<size> - max amount of RAM in GB (from 1 to 1024); default: 12\n"
153153 << " -sm - use strict memory mode (memory limit from -m<n> switch will not be exceeded)\n"
154 << " -hc - count homopolymer compressed k-mers (approximate and experimental)\n"
154155 << " -p<par> - signature length (5, 6, 7, 8, 9, 10, 11); default: 9\n"
155156 << " -f<a/q/m/bam> - input in FASTA format (-fa), FASTQ format (-fq), multi FASTA (-fm) or BAM (-fbam); default: FASTQ\n"
156157 << " -ci<value> - exclude k-mers occurring less than <value> times (default: 2)\n"
237238 Params.p_cx = atoll(&argv[i][3]);
238239 // Maximal counter value
239240 else if (strncmp(argv[i], "-cs", 3) == 0)
240 Params.p_cs = atoll(&argv[i][3]);
241 Params.p_cs = atoll(&argv[i][3]);
241242 // Set p1
242243 else if (strncmp(argv[i], "-p", 2) == 0)
243244 {
268269 Params.p_verbose = true;
269270 else if (strncmp(argv[i], "-sm", 3) == 0 && strlen(argv[i]) == 3)
270271 Params.p_strict_mem = true;
272 else if (strncmp(argv[i], "-hc", 3) == 0 && strlen(argv[i]) == 3)
273 Params.p_homopolymer_compressed = true;
271274 else if (strncmp(argv[i], "-r", 2) == 0)
272275 Params.p_mem_mode = true;
273276 else if(strncmp(argv[i], "-b", 2) == 0)
493496 delete app;
494497
495498 cout << "1st stage: " << time1 << "s\n"
496 << "2nd stage: " << time2 << "s\n";
499 << "2nd stage: " << time2 << "s\n";
497500
498501 bool display_strict_mem_stats = Params.p_strict_mem && !was_small_k_opt;
499502 if (display_strict_mem_stats)
3636 return false;
3737 if ((mmer & 0x3f) == 0x3b) // TGT suffix
3838 return false;
39 if ((mmer & 0x3c) == 0x3c) // TG* suffix
39 if ((mmer & 0x3c) == 0x3c) // TG* suffix !!!! consider issue #152
4040 return false;
4141
4242 for (uint32 j = 0; j < len - 3; ++j)
3434 int64 p_cx; // do not count k-mers occurring more than
3535 int64 p_cs; // maximal counter value
3636 bool p_strict_mem; // use strict memory limit mode
37 bool p_homopolymer_compressed; // count homopolymer compressed k-mers
3738 bool p_mem_mode; // use RAM instead of disk
3839 input_type p_file_type; // input in FASTA format
3940 bool p_verbose; // verbose mode
9798 int64 cutoff_max; // exclude k-mers occurring more than times
9899 int64 counter_max; // maximal counter value
99100 bool use_strict_mem; // use strict memory limit mode
101 bool homopolymer_compressed; //count homopolymer compressed k-mers
100102 bool both_strands; // find canonical representation of each k-mer
101103 bool mem_mode; // use RAM instead of disk
102104
149151 p_cx = 1000000000;
150152 p_cs = 255;
151153 p_strict_mem = false;
154 p_homopolymer_compressed = false;
152155 p_mem_mode = false;
153156 p_file_type = fastq;
154157 p_verbose = false;
99
1010 #include "stdafx.h"
1111 #include "splitter.h"
12
1312
1413 //************************************************************************************************************
1514 // CSplitter class - splits kmers into bins according to their signatures
3433
3534 mem_part_pmm_bins = Params.mem_part_pmm_bins;
3635
37 mem_part_pmm_reads = Params.mem_part_pmm_reads;
36 mem_part_pmm_reads = Params.mem_part_pmm_reads;
3837
3938 s_mapper = Queues.s_mapper;
4039
5049
5150 n_reads = 0;
5251 bins = nullptr;
52
53 homopolymer_compressed = Params.homopolymer_compressed;
5354 }
5455
5556 void CSplitter::InitBins(CKMCParams &Params, CKMCQueues &Queues)
6667 }
6768
6869 //----------------------------------------------------------------------------------
70 // Parse long read, header_merker is '@' or '>'
71 bool CSplitter::GetSeqLongRead(char *seq, uint32 &seq_size, uchar header_marker)
72 {
73 uint32 pos = 0;
74 //long read may or may not contain header
75 if (part_pos == 0 && part[0] == header_marker)
76 {
77 ++n_reads;
78 for (; part[part_pos] != '\n' && part[part_pos] != '\r'; ++part_pos)
79 ;
80 }
81 while (pos < mem_part_pmm_reads && part_pos < part_size)
82 seq[pos++] = codes[part[part_pos++]];
83 seq_size = pos;
84 if (part_pos < part_size)
85 part_pos -= kmer_len - 1;
86 return true;
87 }
88
89
90
91 //----------------------------------------------------------------------------------
6992 // Return a single record from FASTA/FASTQ data
7093 bool CSplitter::GetSeq(char *seq, uint32 &seq_size, ReadType read_type)
7194 {
95 if (part_pos >= part_size)
96 return false;
97
7298 uchar c = 0;
7399 uint32 pos = 0;
74100
75101 if (file_type == fasta)
76 {
77 // Title
78 if (part_pos >= part_size)
79 return false;
102 {
103 if (read_type == ReadType::long_read)
104 return GetSeqLongRead(seq, seq_size, '>');
80105 if (curr_read_len == 0)
81106 {
107 // Title
82108 c = part[part_pos++];
83109 if (c != '>')
84110 return false;
108134 seq[pos++] = codes[c];
109135 }
110136
137 seq_size = pos;
138
111139 if (part_pos >= part_size)
112140 return true;
113141
114 seq_size = pos;
115142 curr_read_len = pos;
116143
117144 if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers
131158 seq[pos++] = codes[c];
132159 }
133160
161 seq_size = pos;
162
134163 if (part_pos >= part_size)
135164 return true;
136165
137 seq_size = pos;
138166 curr_read_len += pos - kmer_len + 1;
139167
140168 if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers
143171 return true;
144172 }
145173 }
174
175 curr_read_len = 0;
176
177 //end of last record
178 if (part_pos >= part_size)
179 return true;
180
146181 if (part[part_pos++] >= 32)
147182 part_pos--;
148183 else if (part_pos >= part_size)
150185 }
151186 else if (file_type == fastq)
152187 {
153 if (read_type == ReadType::long_read)
154 {
155 //long read may or may not contain header
188 if (read_type == ReadType::long_read)
189 return GetSeqLongRead(seq, seq_size, '@');
190
191 if (curr_read_len == 0)
192 {
193 // Title
194 c = part[part_pos++];
195 if (c != '@')
196 return false;
197 ++n_reads;
198
199 for (; part_pos < part_size;)
200 {
201 c = part[part_pos++];
202 if (c < 32) // newliners
203 break;
204 }
156205 if (part_pos >= part_size)
157206 return false;
158207
159 if (part_pos == 0 && part[0] == '@')
160 {
161 ++n_reads;
162 for (; part[part_pos] != '\n' && part[part_pos] != '\r'; ++part_pos)
163 ;
164 }
165 while(pos < mem_part_pmm_reads && part_pos < part_size)
166 seq[pos++] = codes[part[part_pos++]];
167 seq_size = pos;
168 if(part_pos < part_size)
169 part_pos -= kmer_len - 1;
170 return true;
171 }
172 else
173 {
174
175 // Title
176 if (part_pos >= part_size)
177 return false;
178
179 if (curr_read_len == 0)
180 {
181 c = part[part_pos++];
182 if (c != '@')
183 return false;
184 ++n_reads;
185
186 for (; part_pos < part_size;)
187 {
188 c = part[part_pos++];
189 if (c < 32) // newliners
190 break;
191 }
192 if (part_pos >= part_size)
193 return false;
194
195 c = part[part_pos++];
196 if (c >= 32 || c == part[part_pos - 2]) //read may be empty
197 part_pos--;
198 else if (part_pos >= part_size)
199 return false;
200
201 // Sequence
202 for (; part_pos < part_size && pos < mem_part_pmm_reads;)
203 {
204 c = part[part_pos++];
205 if (c < 32) // newliners
206 break;
207 seq[pos++] = codes[c];
208 }
209 if (part_pos >= part_size)
210 return false;
211
212 seq_size = pos;
213 curr_read_len = pos;
214
215 if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers
216 {
217 part_pos -= kmer_len - 1;
218 return true;
219 }
220 }
221 else // we are inside read
222 {
223 // Sequence
224 for (; part_pos < part_size && pos < mem_part_pmm_reads;)
225 {
226 c = part[part_pos++];
227 if (c < 32) // newliners
228 break;
229 seq[pos++] = codes[c];
230 }
231 if (part_pos >= part_size)
232 return false;
233
234 seq_size = pos;
235 curr_read_len += pos - kmer_len + 1;
236 if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers
237 {
238 part_pos -= kmer_len - 1;
239 return true;
240 }
241 }
242
243208 c = part[part_pos++];
244 if (c >= 32)
209 if (c >= 32 || c == part[part_pos - 2]) //read may be empty
245210 part_pos--;
246211 else if (part_pos >= part_size)
247212 return false;
248213
249 // Plus
250 c = part[part_pos++];
214 // Sequence
215 for (; part_pos < part_size && pos < mem_part_pmm_reads;)
216 {
217 c = part[part_pos++];
218 if (c < 32) // newliners
219 break;
220 seq[pos++] = codes[c];
221 }
251222 if (part_pos >= part_size)
252223 return false;
253 if (c != '+')
254 return false;
255 for (; part_pos < part_size;)
224
225 seq_size = pos;
226 curr_read_len = pos;
227
228 if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers
229 {
230 part_pos -= kmer_len - 1;
231 return true;
232 }
233 }
234 else // we are inside read
235 {
236 // Sequence
237 for (; part_pos < part_size && pos < mem_part_pmm_reads;)
256238 {
257239 c = part[part_pos++];
258240 if (c < 32) // newliners
259241 break;
242 seq[pos++] = codes[c];
260243 }
261244 if (part_pos >= part_size)
262245 return false;
263246
247 seq_size = pos;
248 curr_read_len += pos - kmer_len + 1;
249 if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers
250 {
251 part_pos -= kmer_len - 1;
252 return true;
253 }
254 }
255
256 c = part[part_pos++];
257 if (c >= 32)
258 part_pos--;
259 else if (part_pos >= part_size)
260 return false;
261
262 // Plus
263 c = part[part_pos++];
264 if (part_pos >= part_size)
265 return false;
266 if (c != '+')
267 return false;
268 for (; part_pos < part_size;)
269 {
264270 c = part[part_pos++];
265 if (c >= 32 || c == part[part_pos - 2]) //qual may be empty
266 part_pos--;
267 else if (part_pos >= part_size)
268 return false;
269
270 part_pos += curr_read_len;
271 curr_read_len = 0;
272
273 // Quality
274
275 if (part_pos >= part_size)
276 return false;
277 c = part[part_pos++];
278
279 if (part_pos >= part_size)
280 return true;
281
282 if (part[part_pos++] >= 32)
283 part_pos--;
284 else if (part_pos >= part_size)
285 return true;
286 }
271 if (c < 32) // newliners
272 break;
273 }
274 if (part_pos >= part_size)
275 return false;
276
277 c = part[part_pos++];
278 if (c >= 32 || c == part[part_pos - 2]) //qual may be empty
279 part_pos--;
280 else if (part_pos >= part_size)
281 return false;
282
283 // Quality
284 part_pos += curr_read_len; //skip quality
285
286 curr_read_len = 0;
287
288 if (part_pos >= part_size)
289 return false;
290 c = part[part_pos++];
291
292 //end of last record
293 if (part_pos >= part_size)
294 return true;
295
296 //may be additional EOL character
297 if (part[part_pos++] >= 32)
298 part_pos--;
299 else if (part_pos >= part_size)
300 return true;
301
287302 }
288303 else if (file_type == multiline_fasta)
289304 {
290 if (part_pos >= part_size)
291 return false;
292305 if (part[part_pos] == '>')//need to ommit header
293306 {
294307 ++n_reads;
407420 }
408421
409422 //----------------------------------------------------------------------------------
423 void CSplitter::HomopolymerCompressSeq(char* seq, uint32 &seq_size)
424 {
425 if (seq_size <= 1)
426 return;
427
428 uint32 read_pos = 1;
429 uint32 write_pos = 0;
430 for (; read_pos < seq_size; ++read_pos)
431 if (seq[read_pos] != seq[write_pos])
432 seq[++write_pos] = seq[read_pos];
433 seq_size = write_pos + 1;
434 }
435
436 //----------------------------------------------------------------------------------
410437 // Calculate statistics of m-mers
411438 void CSplitter::CalcStats(uchar* _part, uint64 _part_size, ReadType read_type, uint32* _stats)
412439 {
426453
427454 while (GetSeq(seq, seq_size, read_type))
428455 {
456 if (homopolymer_compressed)
457 HomopolymerCompressSeq(seq, seq_size);
429458 i = 0;
430459 len = 0;
431460 while (i + kmer_len - 1 < seq_size)
526555 uint32 len;//length of extended kmer
527556
528557 while (GetSeq(seq, seq_size, read_type))
529 {
558 {
559 if (homopolymer_compressed)
560 HomopolymerCompressSeq(seq, seq_size);
530561 //if (file_type != multiline_fasta && file_type != fastq) //read conting moved to GetSeq
531562 // n_reads++;
532563 i = 0;
652683 if (both_strands)
653684 while (GetSeq(seq, seq_size, read_type))
654685 {
655 if (file_type != multiline_fasta)
656 n_reads++;
686 if (homopolymer_compressed)
687 HomopolymerCompressSeq(seq, seq_size);
688 //if (file_type != multiline_fasta)
689 // n_reads++;
657690
658691 // Init k-mer
659692 kmer_str.clear();
705738 else
706739 while (GetSeq(seq, seq_size, read_type))
707740 {
708 if (file_type != multiline_fasta)
709 n_reads++;
741 if (homopolymer_compressed)
742 HomopolymerCompressSeq(seq, seq_size);
743 //if (file_type != multiline_fasta)
744 // n_reads++;
710745
711746 // Init k-mer
712747 kmer_str.clear();
804839 uint64 size;
805840 ReadType read_type;
806841 if (pq->pop(part, size, read_type))
807 {
842 {
808843 spl->ProcessReads(part, size, read_type);
809844 pmm_fastq->free(part);
810845 }
5353
5454 CSignatureMapper* s_mapper;
5555
56 bool homopolymer_compressed;
57
58 bool GetSeqLongRead(char *seq, uint32 &seq_size, uchar header_marker);
59
5660 bool GetSeq(char *seq, uint32 &seq_size, ReadType read_type);
61
62 void HomopolymerCompressSeq(char* seq, uint32 &seq_size);
5763
5864 public:
5965 static uint32 MAX_LINE_SIZE;