Import upstream version 3.1.2rc1+git20201001.1.a664708
Debian Janitor
2 years ago
0 | 0 | KMC |
1 | 1 | = |
2 | [![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/kmc/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/KMC/releases) | |
3 | [![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/kmc.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/kmc) | |
4 | ||
2 | 5 | KMC is a disk-based programm for counting k-mers from (possibly gzipped) FASTQ/FASTA files. |
3 | 6 | The homepage of the KMC project is http://sun.aei.polsl.pl/kmc |
4 | 7 |
36 | 36 | return false; |
37 | 37 | if ((mmer & 0x3f) == 0x3b) // TGT suffix |
38 | 38 | return false; |
39 | if ((mmer & 0x3c) == 0x3c) // TG* suffix | |
39 | if ((mmer & 0x3c) == 0x3c) // TG* suffix !!!! consider issue #152 | |
40 | 40 | return false; |
41 | 41 | |
42 | 42 | for (uint32 j = 0; j < len - 3; ++j) |
456 | 456 | suffix_file_name = desc.file_src + ".kmc_suf"; |
457 | 457 | |
458 | 458 | suffix_file = fopen(suffix_file_name.c_str(), "rb"); |
459 | setvbuf(suffix_file, NULL, _IONBF, 0); | |
460 | ||
459 | ||
461 | 460 | if (!suffix_file) |
462 | 461 | { |
463 | 462 | std::cerr << "Error: cannot open file: " << suffix_file_name << "\n"; |
464 | 463 | exit(1); |
465 | } | |
464 | ||
465 | } | |
466 | ||
466 | 467 | setvbuf(suffix_file, NULL, _IONBF, 0); |
467 | 468 | |
468 | 469 | char marker[4]; |
497 | 498 | prefix_file_name = desc.file_src + ".kmc_pre"; |
498 | 499 | |
499 | 500 | prefix_file = fopen(prefix_file_name.c_str(), "rb"); |
500 | setvbuf(prefix_file, NULL, _IONBF, 0); | |
501 | ||
501 | ||
502 | 502 | if (!prefix_file) |
503 | 503 | { |
504 | 504 | std::cerr << "Error: cannot open file: " << prefix_file_name << "\n"; |
505 | 505 | exit(1); |
506 | 506 | } |
507 | setvbuf(prefix_file, NULL, _IONBF, 0); | |
508 | ||
507 | 509 | my_fseek(prefix_file, 4 + sizeof(uint64), SEEK_SET);//skip KMCP and first value as it must be 0 |
508 | 510 | |
509 | 511 | } |
122 | 122 | std::string kmc_suf_file_name = output_desc.file_src + ".kmc_suf"; |
123 | 123 | |
124 | 124 | kmc_pre = fopen(kmc_pre_file_name.c_str(), "wb"); |
125 | ||
126 | if (!kmc_pre) | |
127 | { | |
128 | std::cerr << "Error: cannot open file : " << kmc_pre_file_name << "\n"; | |
129 | exit(1); | |
130 | } | |
131 | ||
125 | 132 | setvbuf(kmc_pre, NULL, _IONBF, 0); |
126 | 133 | |
127 | if (!kmc_pre) | |
128 | { | |
129 | std::cerr << "Error: cannot open file : " << kmc_pre_file_name << "\n"; | |
130 | exit(1); | |
131 | } | |
132 | 134 | kmc_suf = fopen(kmc_suf_file_name.c_str(), "wb"); |
133 | setvbuf(kmc_suf, NULL, _IONBF, 0); | |
134 | 135 | |
135 | 136 | if (!kmc_suf) |
136 | 137 | { |
138 | 139 | std::cerr << "Error: cannot open file : " << kmc_suf_file_name << "\n"; |
139 | 140 | exit(1); |
140 | 141 | } |
142 | setvbuf(kmc_suf, NULL, _IONBF, 0); | |
141 | 143 | |
142 | 144 | setvbuf(kmc_pre, NULL, _IONBF, 0); |
143 | 145 | setvbuf(kmc_suf, NULL, _IONBF, 0); |
41 | 41 | kmer_len = _kmer_len; |
42 | 42 | |
43 | 43 | // Size and pointer for the buffer |
44 | part_size = 1 << 23; | |
44 | part_size = 1 << 23; | |
45 | 45 | part = nullptr; |
46 | 46 | |
47 | 47 | containsNextChromosome = false; |
60 | 60 | //---------------------------------------------------------------------------------- |
61 | 61 | // Set part size of the buffer |
62 | 62 | bool CFastqReader::SetPartSize(uint64 _part_size) |
63 | { | |
63 | { | |
64 | 64 | if (_part_size < (1 << 20) || _part_size >(1 << 30)) |
65 | 65 | return false; |
66 | 66 | |
405 | 405 | int64 tmp = i; |
406 | 406 | bool next_line = SkipNextEOL(part, i, total_filled); |
407 | 407 | if (!next_line) |
408 | i = total_filled; | |
408 | i = total_filled; | |
409 | 409 | copy(part + tmp, part + i, part + pos); |
410 | 410 | last_header_pos = pos; |
411 | 411 | pos += i - tmp; |
467 | 467 | return true; |
468 | 468 | } |
469 | 469 | |
470 | void CFastqReader::CleanUpAfterLongFastaRead() | |
471 | { | |
472 | pmm_fastq->reserve(part); | |
473 | ||
474 | uchar symb; | |
475 | int64 in_part = 0; | |
476 | int64 skip_pos = 0; | |
477 | while (GetNextSymbOfLongReadRecord(symb, skip_pos, in_part)) | |
478 | { | |
479 | if (symb == '\n' || symb == '\r') | |
480 | ; | |
481 | else | |
482 | { | |
483 | if (symb != '>') | |
484 | { | |
485 | cerr << "Error: Wrong input file!\n"; | |
486 | exit(1); | |
487 | } | |
488 | std::copy(part + skip_pos - 1, part + in_part, part); | |
489 | part_filled = in_part - (skip_pos - 1); | |
490 | return; | |
491 | } | |
492 | } | |
493 | ||
494 | //the file has ended | |
495 | part_filled = 0; | |
496 | } | |
470 | 497 | void CFastqReader::CleanUpAfterLongFastqRead(uint32 number_of_lines_to_skip) |
471 | 498 | { |
472 | 499 | pmm_fastq->reserve(part); |
539 | 566 | |
540 | 567 | if (data_src.Finished() && !long_read_in_progress) |
541 | 568 | { |
569 | read_type = ReadType::normal_read; | |
542 | 570 | _part = part; |
543 | 571 | _size = total_filled; |
544 | 572 | |
553 | 581 | } // Look for the end of the last complete record in a buffer |
554 | 582 | else if (file_type == fasta) // FASTA files |
555 | 583 | { |
584 | if (long_read_in_progress) | |
585 | { | |
586 | //check if there is EOL in the data | |
587 | int64 pos = 0; | |
588 | for (; pos < total_filled; ++pos) | |
589 | { | |
590 | if (part[pos] == '\n' || part[pos] == '\r') | |
591 | { | |
592 | long_read_in_progress = false; | |
593 | break; | |
594 | } | |
595 | } | |
596 | ||
597 | if (!long_read_in_progress) | |
598 | { | |
599 | _part = part; | |
600 | _size = pos; | |
601 | ||
602 | //all from this part was readed, maybe there is another EOL character in the file | |
603 | if(pos == total_filled) | |
604 | CleanUpAfterLongFastaRead(); | |
605 | else //there is still some important data in the part!! | |
606 | { | |
607 | //skip possible eol | |
608 | for (; pos < total_filled; ++pos) | |
609 | if (part[pos] != '\n' && part[pos] != '\r') | |
610 | break; | |
611 | ||
612 | pmm_fastq->reserve(part); | |
613 | std::copy(_part + pos, _part + total_filled, part); | |
614 | part_filled = total_filled - pos; | |
615 | } | |
616 | } | |
617 | else | |
618 | { | |
619 | _part = part; | |
620 | _size = total_filled; | |
621 | pmm_fastq->reserve(part); | |
622 | std::copy(_part + total_filled - kmer_len + 1, _part + total_filled, part); | |
623 | part_filled = kmer_len - 1; | |
624 | } | |
625 | return true; | |
626 | } | |
556 | 627 | // Looking for a FASTA record at the end of the area |
557 | 628 | i = total_filled - 1; |
558 | 629 | int64 start, end; |
582 | 653 | // Looking for a FASTQ record at the end of the area |
583 | 654 | if (!success) |
584 | 655 | { |
585 | cerr << "Error: Wrong input file!\n"; | |
586 | exit(1); | |
656 | if (readed_lines == 2) //because if successfully readed full 2 lines in the worst case there is only one read that begins at the buffer start, and there is only onle fasta record | |
657 | { | |
658 | std::cerr << "Error: some error while reading fasta file, please contact authors\n"; | |
659 | exit(1); | |
660 | } | |
661 | k = 4 - readed_lines; | |
662 | if (line_start[k] != 0) | |
663 | { | |
664 | std::cerr << "Error: some error while reading fasta file, please contact authors\n"; | |
665 | exit(1); | |
666 | } | |
667 | if (part[0] != '>') | |
668 | { | |
669 | cerr << "Error: Wrong input file!\n"; | |
670 | exit(1); | |
671 | } | |
672 | ||
673 | if (readed_lines == 1) | |
674 | { | |
675 | long_read_in_progress = true; | |
676 | ||
677 | _part = part; | |
678 | _size = total_filled; | |
679 | read_type = ReadType::long_read; | |
680 | ||
681 | //copy last k-1 symbols | |
682 | pmm_fastq->reserve(part); | |
683 | copy(_part + total_filled - kmer_len + 1, _part + total_filled, part); | |
684 | part_filled = kmer_len - 1; | |
685 | ||
686 | return true; | |
687 | } | |
688 | else | |
689 | { | |
690 | std::cerr << "Error: some error while reading fasta file, please contact authors\n"; | |
691 | exit(1); | |
692 | } | |
693 | ||
694 | return true; | |
587 | 695 | } |
588 | 696 | |
589 | 697 | _part = part; |
755 | 863 | |
756 | 864 | //---------------------------------------------------------------------------------- |
757 | 865 | // Skip to next EOL from the current position in a buffer |
758 | bool CFastqReader::SkipNextEOL(uchar *part, int64 &pos, int64 max_pos) | |
866 | bool CFastqReader::SkipNextEOL(uchar *part, int64 &pos, int64 size) | |
759 | 867 | { |
760 | 868 | int64 i; |
761 | for (i = pos; i < max_pos - 2; ++i) | |
869 | for (i = pos; i < size - 1; ++i) | |
762 | 870 | if ((part[i] == '\n' || part[i] == '\r') && !(part[i + 1] == '\n' || part[i + 1] == '\r')) |
763 | 871 | break; |
764 | 872 | |
765 | if (i >= max_pos - 2) | |
873 | if (i >= size - 1) | |
766 | 874 | return false; |
767 | 875 | |
768 | 876 | pos = i + 1; |
1071 | 1179 | binary_pack_queue = _binary_pack_queue; |
1072 | 1180 | missingEOL_at_EOF_counter = Queues.missingEOL_at_EOF_counter; |
1073 | 1181 | bam_task_manager = Queues.bam_task_manager; |
1074 | part_size = Params.fastq_buffer_size; | |
1182 | part_size = Params.fastq_buffer_size; | |
1075 | 1183 | part_queue = Queues.part_queue; |
1076 | 1184 | file_type = Params.file_type; |
1077 | 1185 | kmer_len = Params.p_k; |
103 | 103 | |
104 | 104 | bool containsNextChromosome; //for multiline_fasta processing |
105 | 105 | |
106 | bool SkipNextEOL(uchar *part, int64 &pos, int64 max_pos); | |
106 | bool SkipNextEOL(uchar *part, int64 &pos, int64 size); | |
107 | 107 | |
108 | 108 | void GetFullLineFromEnd(int64& line_sart, int64& line_end, uchar* buff, int64& pos); |
109 | 109 | |
114 | 114 | |
115 | 115 | void CleanUpAfterLongFastqRead(uint32 number_of_lines_to_skip); |
116 | 116 | |
117 | void CleanUpAfterLongFastaRead(); | |
117 | 118 | void FixEOLIfNeeded(uchar* part, int64& size); |
118 | 119 | public: |
119 | 120 | CFastqReader(CMemoryMonitor *_mm, CMemoryPoolWithBamSupport *_pmm_fastq, input_type _file_type, int _kmer_len, |
155 | 155 | Params.both_strands = Params.p_both_strands; |
156 | 156 | Params.without_output = Params.p_without_output; |
157 | 157 | Params.use_strict_mem = Params.p_strict_mem; |
158 | Params.homopolymer_compressed = Params.p_homopolymer_compressed; | |
158 | 159 | Params.mem_mode = Params.p_mem_mode; |
159 | 160 | |
160 | 161 |
151 | 151 | << " -k<len> - k-mer length (k from " << MIN_K << " to " << MAX_K << "; default: 25)\n" |
152 | 152 | << " -m<size> - max amount of RAM in GB (from 1 to 1024); default: 12\n" |
153 | 153 | << " -sm - use strict memory mode (memory limit from -m<n> switch will not be exceeded)\n" |
154 | << " -hc - count homopolymer compressed k-mers (approximate and experimental)\n" | |
154 | 155 | << " -p<par> - signature length (5, 6, 7, 8, 9, 10, 11); default: 9\n" |
155 | 156 | << " -f<a/q/m/bam> - input in FASTA format (-fa), FASTQ format (-fq), multi FASTA (-fm) or BAM (-fbam); default: FASTQ\n" |
156 | 157 | << " -ci<value> - exclude k-mers occurring less than <value> times (default: 2)\n" |
237 | 238 | Params.p_cx = atoll(&argv[i][3]); |
238 | 239 | // Maximal counter value |
239 | 240 | else if (strncmp(argv[i], "-cs", 3) == 0) |
240 | Params.p_cs = atoll(&argv[i][3]); | |
241 | Params.p_cs = atoll(&argv[i][3]); | |
241 | 242 | // Set p1 |
242 | 243 | else if (strncmp(argv[i], "-p", 2) == 0) |
243 | 244 | { |
268 | 269 | Params.p_verbose = true; |
269 | 270 | else if (strncmp(argv[i], "-sm", 3) == 0 && strlen(argv[i]) == 3) |
270 | 271 | Params.p_strict_mem = true; |
272 | else if (strncmp(argv[i], "-hc", 3) == 0 && strlen(argv[i]) == 3) | |
273 | Params.p_homopolymer_compressed = true; | |
271 | 274 | else if (strncmp(argv[i], "-r", 2) == 0) |
272 | 275 | Params.p_mem_mode = true; |
273 | 276 | else if(strncmp(argv[i], "-b", 2) == 0) |
493 | 496 | delete app; |
494 | 497 | |
495 | 498 | cout << "1st stage: " << time1 << "s\n" |
496 | << "2nd stage: " << time2 << "s\n"; | |
499 | << "2nd stage: " << time2 << "s\n"; | |
497 | 500 | |
498 | 501 | bool display_strict_mem_stats = Params.p_strict_mem && !was_small_k_opt; |
499 | 502 | if (display_strict_mem_stats) |
36 | 36 | return false; |
37 | 37 | if ((mmer & 0x3f) == 0x3b) // TGT suffix |
38 | 38 | return false; |
39 | if ((mmer & 0x3c) == 0x3c) // TG* suffix | |
39 | if ((mmer & 0x3c) == 0x3c) // TG* suffix !!!! consider issue #152 | |
40 | 40 | return false; |
41 | 41 | |
42 | 42 | for (uint32 j = 0; j < len - 3; ++j) |
34 | 34 | int64 p_cx; // do not count k-mers occurring more than |
35 | 35 | int64 p_cs; // maximal counter value |
36 | 36 | bool p_strict_mem; // use strict memory limit mode |
37 | bool p_homopolymer_compressed; // count homopolymer compressed k-mers | |
37 | 38 | bool p_mem_mode; // use RAM instead of disk |
38 | 39 | input_type p_file_type; // input in FASTA format |
39 | 40 | bool p_verbose; // verbose mode |
97 | 98 | int64 cutoff_max; // exclude k-mers occurring more than times |
98 | 99 | int64 counter_max; // maximal counter value |
99 | 100 | bool use_strict_mem; // use strict memory limit mode |
101 | bool homopolymer_compressed; //count homopolymer compressed k-mers | |
100 | 102 | bool both_strands; // find canonical representation of each k-mer |
101 | 103 | bool mem_mode; // use RAM instead of disk |
102 | 104 | |
149 | 151 | p_cx = 1000000000; |
150 | 152 | p_cs = 255; |
151 | 153 | p_strict_mem = false; |
154 | p_homopolymer_compressed = false; | |
152 | 155 | p_mem_mode = false; |
153 | 156 | p_file_type = fastq; |
154 | 157 | p_verbose = false; |
9 | 9 | |
10 | 10 | #include "stdafx.h" |
11 | 11 | #include "splitter.h" |
12 | ||
13 | 12 | |
14 | 13 | //************************************************************************************************************ |
15 | 14 | // CSplitter class - splits kmers into bins according to their signatures |
34 | 33 | |
35 | 34 | mem_part_pmm_bins = Params.mem_part_pmm_bins; |
36 | 35 | |
37 | mem_part_pmm_reads = Params.mem_part_pmm_reads; | |
36 | mem_part_pmm_reads = Params.mem_part_pmm_reads; | |
38 | 37 | |
39 | 38 | s_mapper = Queues.s_mapper; |
40 | 39 | |
50 | 49 | |
51 | 50 | n_reads = 0; |
52 | 51 | bins = nullptr; |
52 | ||
53 | homopolymer_compressed = Params.homopolymer_compressed; | |
53 | 54 | } |
54 | 55 | |
55 | 56 | void CSplitter::InitBins(CKMCParams &Params, CKMCQueues &Queues) |
66 | 67 | } |
67 | 68 | |
68 | 69 | //---------------------------------------------------------------------------------- |
70 | // Parse long read, header_merker is '@' or '>' | |
71 | bool CSplitter::GetSeqLongRead(char *seq, uint32 &seq_size, uchar header_marker) | |
72 | { | |
73 | uint32 pos = 0; | |
74 | //long read may or may not contain header | |
75 | if (part_pos == 0 && part[0] == header_marker) | |
76 | { | |
77 | ++n_reads; | |
78 | for (; part[part_pos] != '\n' && part[part_pos] != '\r'; ++part_pos) | |
79 | ; | |
80 | } | |
81 | while (pos < mem_part_pmm_reads && part_pos < part_size) | |
82 | seq[pos++] = codes[part[part_pos++]]; | |
83 | seq_size = pos; | |
84 | if (part_pos < part_size) | |
85 | part_pos -= kmer_len - 1; | |
86 | return true; | |
87 | } | |
88 | ||
89 | ||
90 | ||
91 | //---------------------------------------------------------------------------------- | |
69 | 92 | // Return a single record from FASTA/FASTQ data |
70 | 93 | bool CSplitter::GetSeq(char *seq, uint32 &seq_size, ReadType read_type) |
71 | 94 | { |
95 | if (part_pos >= part_size) | |
96 | return false; | |
97 | ||
72 | 98 | uchar c = 0; |
73 | 99 | uint32 pos = 0; |
74 | 100 | |
75 | 101 | if (file_type == fasta) |
76 | { | |
77 | // Title | |
78 | if (part_pos >= part_size) | |
79 | return false; | |
102 | { | |
103 | if (read_type == ReadType::long_read) | |
104 | return GetSeqLongRead(seq, seq_size, '>'); | |
80 | 105 | if (curr_read_len == 0) |
81 | 106 | { |
107 | // Title | |
82 | 108 | c = part[part_pos++]; |
83 | 109 | if (c != '>') |
84 | 110 | return false; |
108 | 134 | seq[pos++] = codes[c]; |
109 | 135 | } |
110 | 136 | |
137 | seq_size = pos; | |
138 | ||
111 | 139 | if (part_pos >= part_size) |
112 | 140 | return true; |
113 | 141 | |
114 | seq_size = pos; | |
115 | 142 | curr_read_len = pos; |
116 | 143 | |
117 | 144 | if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers |
131 | 158 | seq[pos++] = codes[c]; |
132 | 159 | } |
133 | 160 | |
161 | seq_size = pos; | |
162 | ||
134 | 163 | if (part_pos >= part_size) |
135 | 164 | return true; |
136 | 165 | |
137 | seq_size = pos; | |
138 | 166 | curr_read_len += pos - kmer_len + 1; |
139 | 167 | |
140 | 168 | if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers |
143 | 171 | return true; |
144 | 172 | } |
145 | 173 | } |
174 | ||
175 | curr_read_len = 0; | |
176 | ||
177 | //end of last record | |
178 | if (part_pos >= part_size) | |
179 | return true; | |
180 | ||
146 | 181 | if (part[part_pos++] >= 32) |
147 | 182 | part_pos--; |
148 | 183 | else if (part_pos >= part_size) |
150 | 185 | } |
151 | 186 | else if (file_type == fastq) |
152 | 187 | { |
153 | if (read_type == ReadType::long_read) | |
154 | { | |
155 | //long read may or may not contain header | |
188 | if (read_type == ReadType::long_read) | |
189 | return GetSeqLongRead(seq, seq_size, '@'); | |
190 | ||
191 | if (curr_read_len == 0) | |
192 | { | |
193 | // Title | |
194 | c = part[part_pos++]; | |
195 | if (c != '@') | |
196 | return false; | |
197 | ++n_reads; | |
198 | ||
199 | for (; part_pos < part_size;) | |
200 | { | |
201 | c = part[part_pos++]; | |
202 | if (c < 32) // newliners | |
203 | break; | |
204 | } | |
156 | 205 | if (part_pos >= part_size) |
157 | 206 | return false; |
158 | 207 | |
159 | if (part_pos == 0 && part[0] == '@') | |
160 | { | |
161 | ++n_reads; | |
162 | for (; part[part_pos] != '\n' && part[part_pos] != '\r'; ++part_pos) | |
163 | ; | |
164 | } | |
165 | while(pos < mem_part_pmm_reads && part_pos < part_size) | |
166 | seq[pos++] = codes[part[part_pos++]]; | |
167 | seq_size = pos; | |
168 | if(part_pos < part_size) | |
169 | part_pos -= kmer_len - 1; | |
170 | return true; | |
171 | } | |
172 | else | |
173 | { | |
174 | ||
175 | // Title | |
176 | if (part_pos >= part_size) | |
177 | return false; | |
178 | ||
179 | if (curr_read_len == 0) | |
180 | { | |
181 | c = part[part_pos++]; | |
182 | if (c != '@') | |
183 | return false; | |
184 | ++n_reads; | |
185 | ||
186 | for (; part_pos < part_size;) | |
187 | { | |
188 | c = part[part_pos++]; | |
189 | if (c < 32) // newliners | |
190 | break; | |
191 | } | |
192 | if (part_pos >= part_size) | |
193 | return false; | |
194 | ||
195 | c = part[part_pos++]; | |
196 | if (c >= 32 || c == part[part_pos - 2]) //read may be empty | |
197 | part_pos--; | |
198 | else if (part_pos >= part_size) | |
199 | return false; | |
200 | ||
201 | // Sequence | |
202 | for (; part_pos < part_size && pos < mem_part_pmm_reads;) | |
203 | { | |
204 | c = part[part_pos++]; | |
205 | if (c < 32) // newliners | |
206 | break; | |
207 | seq[pos++] = codes[c]; | |
208 | } | |
209 | if (part_pos >= part_size) | |
210 | return false; | |
211 | ||
212 | seq_size = pos; | |
213 | curr_read_len = pos; | |
214 | ||
215 | if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers | |
216 | { | |
217 | part_pos -= kmer_len - 1; | |
218 | return true; | |
219 | } | |
220 | } | |
221 | else // we are inside read | |
222 | { | |
223 | // Sequence | |
224 | for (; part_pos < part_size && pos < mem_part_pmm_reads;) | |
225 | { | |
226 | c = part[part_pos++]; | |
227 | if (c < 32) // newliners | |
228 | break; | |
229 | seq[pos++] = codes[c]; | |
230 | } | |
231 | if (part_pos >= part_size) | |
232 | return false; | |
233 | ||
234 | seq_size = pos; | |
235 | curr_read_len += pos - kmer_len + 1; | |
236 | if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers | |
237 | { | |
238 | part_pos -= kmer_len - 1; | |
239 | return true; | |
240 | } | |
241 | } | |
242 | ||
243 | 208 | c = part[part_pos++]; |
244 | if (c >= 32) | |
209 | if (c >= 32 || c == part[part_pos - 2]) //read may be empty | |
245 | 210 | part_pos--; |
246 | 211 | else if (part_pos >= part_size) |
247 | 212 | return false; |
248 | 213 | |
249 | // Plus | |
250 | c = part[part_pos++]; | |
214 | // Sequence | |
215 | for (; part_pos < part_size && pos < mem_part_pmm_reads;) | |
216 | { | |
217 | c = part[part_pos++]; | |
218 | if (c < 32) // newliners | |
219 | break; | |
220 | seq[pos++] = codes[c]; | |
221 | } | |
251 | 222 | if (part_pos >= part_size) |
252 | 223 | return false; |
253 | if (c != '+') | |
254 | return false; | |
255 | for (; part_pos < part_size;) | |
224 | ||
225 | seq_size = pos; | |
226 | curr_read_len = pos; | |
227 | ||
228 | if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers | |
229 | { | |
230 | part_pos -= kmer_len - 1; | |
231 | return true; | |
232 | } | |
233 | } | |
234 | else // we are inside read | |
235 | { | |
236 | // Sequence | |
237 | for (; part_pos < part_size && pos < mem_part_pmm_reads;) | |
256 | 238 | { |
257 | 239 | c = part[part_pos++]; |
258 | 240 | if (c < 32) // newliners |
259 | 241 | break; |
242 | seq[pos++] = codes[c]; | |
260 | 243 | } |
261 | 244 | if (part_pos >= part_size) |
262 | 245 | return false; |
263 | 246 | |
247 | seq_size = pos; | |
248 | curr_read_len += pos - kmer_len + 1; | |
249 | if (pos >= mem_part_pmm_reads) // read is too long to fit into out buff, it will be splitted into multiple buffers | |
250 | { | |
251 | part_pos -= kmer_len - 1; | |
252 | return true; | |
253 | } | |
254 | } | |
255 | ||
256 | c = part[part_pos++]; | |
257 | if (c >= 32) | |
258 | part_pos--; | |
259 | else if (part_pos >= part_size) | |
260 | return false; | |
261 | ||
262 | // Plus | |
263 | c = part[part_pos++]; | |
264 | if (part_pos >= part_size) | |
265 | return false; | |
266 | if (c != '+') | |
267 | return false; | |
268 | for (; part_pos < part_size;) | |
269 | { | |
264 | 270 | c = part[part_pos++]; |
265 | if (c >= 32 || c == part[part_pos - 2]) //qual may be empty | |
266 | part_pos--; | |
267 | else if (part_pos >= part_size) | |
268 | return false; | |
269 | ||
270 | part_pos += curr_read_len; | |
271 | curr_read_len = 0; | |
272 | ||
273 | // Quality | |
274 | ||
275 | if (part_pos >= part_size) | |
276 | return false; | |
277 | c = part[part_pos++]; | |
278 | ||
279 | if (part_pos >= part_size) | |
280 | return true; | |
281 | ||
282 | if (part[part_pos++] >= 32) | |
283 | part_pos--; | |
284 | else if (part_pos >= part_size) | |
285 | return true; | |
286 | } | |
271 | if (c < 32) // newliners | |
272 | break; | |
273 | } | |
274 | if (part_pos >= part_size) | |
275 | return false; | |
276 | ||
277 | c = part[part_pos++]; | |
278 | if (c >= 32 || c == part[part_pos - 2]) //qual may be empty | |
279 | part_pos--; | |
280 | else if (part_pos >= part_size) | |
281 | return false; | |
282 | ||
283 | // Quality | |
284 | part_pos += curr_read_len; //skip quality | |
285 | ||
286 | curr_read_len = 0; | |
287 | ||
288 | if (part_pos >= part_size) | |
289 | return false; | |
290 | c = part[part_pos++]; | |
291 | ||
292 | //end of last record | |
293 | if (part_pos >= part_size) | |
294 | return true; | |
295 | ||
296 | //may be additional EOL character | |
297 | if (part[part_pos++] >= 32) | |
298 | part_pos--; | |
299 | else if (part_pos >= part_size) | |
300 | return true; | |
301 | ||
287 | 302 | } |
288 | 303 | else if (file_type == multiline_fasta) |
289 | 304 | { |
290 | if (part_pos >= part_size) | |
291 | return false; | |
292 | 305 | if (part[part_pos] == '>')//need to ommit header |
293 | 306 | { |
294 | 307 | ++n_reads; |
407 | 420 | } |
408 | 421 | |
409 | 422 | //---------------------------------------------------------------------------------- |
423 | void CSplitter::HomopolymerCompressSeq(char* seq, uint32 &seq_size) | |
424 | { | |
425 | if (seq_size <= 1) | |
426 | return; | |
427 | ||
428 | uint32 read_pos = 1; | |
429 | uint32 write_pos = 0; | |
430 | for (; read_pos < seq_size; ++read_pos) | |
431 | if (seq[read_pos] != seq[write_pos]) | |
432 | seq[++write_pos] = seq[read_pos]; | |
433 | seq_size = write_pos + 1; | |
434 | } | |
435 | ||
436 | //---------------------------------------------------------------------------------- | |
410 | 437 | // Calculate statistics of m-mers |
411 | 438 | void CSplitter::CalcStats(uchar* _part, uint64 _part_size, ReadType read_type, uint32* _stats) |
412 | 439 | { |
426 | 453 | |
427 | 454 | while (GetSeq(seq, seq_size, read_type)) |
428 | 455 | { |
456 | if (homopolymer_compressed) | |
457 | HomopolymerCompressSeq(seq, seq_size); | |
429 | 458 | i = 0; |
430 | 459 | len = 0; |
431 | 460 | while (i + kmer_len - 1 < seq_size) |
526 | 555 | uint32 len;//length of extended kmer |
527 | 556 | |
528 | 557 | while (GetSeq(seq, seq_size, read_type)) |
529 | { | |
558 | { | |
559 | if (homopolymer_compressed) | |
560 | HomopolymerCompressSeq(seq, seq_size); | |
530 | 561 | //if (file_type != multiline_fasta && file_type != fastq) //read conting moved to GetSeq |
531 | 562 | // n_reads++; |
532 | 563 | i = 0; |
652 | 683 | if (both_strands) |
653 | 684 | while (GetSeq(seq, seq_size, read_type)) |
654 | 685 | { |
655 | if (file_type != multiline_fasta) | |
656 | n_reads++; | |
686 | if (homopolymer_compressed) | |
687 | HomopolymerCompressSeq(seq, seq_size); | |
688 | //if (file_type != multiline_fasta) | |
689 | // n_reads++; | |
657 | 690 | |
658 | 691 | // Init k-mer |
659 | 692 | kmer_str.clear(); |
705 | 738 | else |
706 | 739 | while (GetSeq(seq, seq_size, read_type)) |
707 | 740 | { |
708 | if (file_type != multiline_fasta) | |
709 | n_reads++; | |
741 | if (homopolymer_compressed) | |
742 | HomopolymerCompressSeq(seq, seq_size); | |
743 | //if (file_type != multiline_fasta) | |
744 | // n_reads++; | |
710 | 745 | |
711 | 746 | // Init k-mer |
712 | 747 | kmer_str.clear(); |
804 | 839 | uint64 size; |
805 | 840 | ReadType read_type; |
806 | 841 | if (pq->pop(part, size, read_type)) |
807 | { | |
842 | { | |
808 | 843 | spl->ProcessReads(part, size, read_type); |
809 | 844 | pmm_fastq->free(part); |
810 | 845 | } |
53 | 53 | |
54 | 54 | CSignatureMapper* s_mapper; |
55 | 55 | |
56 | bool homopolymer_compressed; | |
57 | ||
58 | bool GetSeqLongRead(char *seq, uint32 &seq_size, uchar header_marker); | |
59 | ||
56 | 60 | bool GetSeq(char *seq, uint32 &seq_size, ReadType read_type); |
61 | ||
62 | void HomopolymerCompressSeq(char* seq, uint32 &seq_size); | |
57 | 63 | |
58 | 64 | public: |
59 | 65 | static uint32 MAX_LINE_SIZE; |