New upstream snapshot.
Debian Janitor
4 years ago
4 | 4 | addons: |
5 | 5 | apt: |
6 | 6 | packages: |
7 | - check | |
7 | - check | |
8 | 8 | before_install: |
9 | - sudo apt-get update -qq | |
10 | script: | |
11 | - autoreconf -i && ./configure --enable-maintainer-mode CFLAGS='-O0 --coverage' && make && make check | |
9 | - sudo apt-get update -qq | |
10 | script: | |
11 | - autoreconf -i && ./configure --enable-maintainer-mode CFLAGS='-O0 --coverage' && make && make check | |
12 | 12 | after_success: |
13 | 13 | - bash <(curl -s https://codecov.io/bash) |
0 | 0 | Andrew J. Page <ap13@sanger.ac.uk> |
1 | 1 | Ben Taylor <path-help@sanger.ac.uk> |
2 | 2 | Jorge Soares <path-help@sanger.ac.uk> |
3 | Peter van Heusden | |
3 | Peter van Heusden <pvh@sanbi.ac.za> |
0 | snp-sites (2.5.1+git20191111.a274ec4-1) UNRELEASED; urgency=medium | |
1 | ||
2 | * New upstream snapshot. | |
3 | ||
4 | -- Debian Janitor <janitor@jelmer.uk> Thu, 30 Jan 2020 10:21:48 +0000 | |
5 | ||
0 | 6 | snp-sites (2.5.1-1) unstable; urgency=medium |
1 | 7 | |
2 | 8 | [ Andreas Tille ] |
33 | 33 | int length_of_genome; |
34 | 34 | int number_of_samples; |
35 | 35 | int number_of_snps; |
36 | char ** sequence_names; | |
37 | int * snp_locations; | |
38 | char * pseudo_reference_sequence; | |
36 | char **sequence_names; | |
37 | int *snp_locations; | |
38 | char *pseudo_reference_sequence; | |
39 | 39 | |
40 | 40 | int get_length_of_genome() |
41 | 41 | { |
52 | 52 | return number_of_snps; |
53 | 53 | } |
54 | 54 | |
55 | char ** get_sequence_names() | |
55 | char **get_sequence_names() | |
56 | 56 | { |
57 | 57 | return sequence_names; |
58 | 58 | } |
59 | 59 | |
60 | int * get_snp_locations() | |
60 | int *get_snp_locations() | |
61 | 61 | { |
62 | 62 | return snp_locations; |
63 | 63 | } |
64 | 64 | |
65 | char * get_pseudo_reference_sequence() | |
65 | char *get_pseudo_reference_sequence() | |
66 | 66 | { |
67 | 67 | return pseudo_reference_sequence; |
68 | 68 | } |
69 | 69 | |
70 | void get_bases_for_each_snp(char filename[], char ** bases_for_snps) | |
71 | { | |
72 | int l; | |
73 | int i = 0; | |
74 | int sequence_number = 0; | |
75 | size_t length_of_genome_found =0; | |
76 | ||
77 | gzFile fp; | |
78 | kseq_t *seq; | |
79 | ||
80 | fp = gzopen(filename, "r"); | |
81 | seq = kseq_init(fp); | |
82 | ||
83 | while ((l = kseq_read(seq)) >= 0) | |
84 | { | |
85 | if(sequence_number == 0) | |
86 | { | |
87 | length_of_genome_found = seq->seq.l; | |
88 | } | |
89 | for(i = 0; i< number_of_snps; i++) | |
90 | { | |
91 | bases_for_snps[i][sequence_number] = toupper(((char *) seq->seq.s)[snp_locations[i]]); | |
92 | } | |
93 | ||
94 | if(seq->seq.l != length_of_genome_found) | |
95 | { | |
96 | fprintf(stderr, "Alignment %s contains sequences of unequal length. Expected length is %i but got %i in sequence %s\n\n",filename, (int) length_of_genome_found, (int) seq->seq.l,seq->name.s); | |
97 | fflush(stderr); | |
98 | exit(EXIT_FAILURE); | |
99 | } | |
100 | sequence_number++; | |
101 | } | |
102 | ||
103 | kseq_destroy(seq); | |
104 | gzclose(fp); | |
105 | } | |
106 | ||
107 | void detect_snps(char filename[], int pure_mode, int output_monomorphic) { | |
70 | void get_bases_for_each_snp(char filename[], char **bases_for_snps) | |
71 | { | |
72 | int l; | |
73 | int i = 0; | |
74 | int sequence_number = 0; | |
75 | size_t length_of_genome_found = 0; | |
76 | ||
77 | gzFile fp; | |
78 | kseq_t *seq; | |
79 | ||
80 | fp = gzopen(filename, "r"); | |
81 | seq = kseq_init(fp); | |
82 | ||
83 | while ((l = kseq_read(seq)) >= 0) { | |
84 | if (sequence_number == 0) { | |
85 | length_of_genome_found = seq->seq.l; | |
86 | } | |
87 | for (i = 0; i < number_of_snps; i++) { | |
88 | bases_for_snps[i][sequence_number] = toupper(((char *) seq->seq.s)[snp_locations[i]]); | |
89 | } | |
90 | ||
91 | if (seq->seq.l != length_of_genome_found) { | |
92 | fprintf(stderr, | |
93 | "Alignment %s contains sequences of unequal length. Expected length is %i but got %i in sequence %s\n\n", | |
94 | filename, (int) length_of_genome_found, (int) seq->seq.l, seq->name.s); | |
95 | fflush(stderr); | |
96 | exit(EXIT_FAILURE); | |
97 | } | |
98 | sequence_number++; | |
99 | } | |
100 | ||
101 | kseq_destroy(seq); | |
102 | gzclose(fp); | |
103 | } | |
104 | ||
105 | void detect_snps(char filename[], int pure_mode, int output_monomorphic) | |
106 | { | |
108 | 107 | detect_snps_count_constant_sites(filename, pure_mode, output_monomorphic, NULL); |
109 | 108 | } |
110 | 109 | |
111 | void detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int* constant_site_counts) | |
112 | { | |
113 | int i; | |
114 | int l; | |
115 | number_of_snps = 0; | |
116 | number_of_samples = 0; | |
117 | length_of_genome = 0; | |
118 | char * first_sequence; | |
119 | /* array below allows quick mapping of A, C, T and G characters to indices in base_counts array */ | |
120 | const int char_to_base_count_index[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3}; | |
110 | void | |
111 | detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts) | |
112 | { | |
113 | int i; | |
114 | int l; | |
115 | number_of_snps = 0; | |
116 | number_of_samples = 0; | |
117 | length_of_genome = 0; | |
118 | char *first_sequence; | |
119 | /* array below allows quick mapping of A, C, T and G characters to indices in base_counts array */ | |
120 | const int char_to_base_count_index[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
121 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
122 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
123 | -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, | |
124 | -1, -1, -1, -1, -1, -1, -1, 3}; | |
121 | 125 | |
122 | 126 | |
123 | 127 | gzFile fp; |
124 | kseq_t *seq; | |
125 | ||
126 | fp = gzopen(filename, "r"); | |
127 | seq = kseq_init(fp); | |
128 | ||
129 | sequence_names = calloc(DEFAULT_NUM_SAMPLES, sizeof(char*)); | |
130 | ||
131 | while ((l = kseq_read(seq)) >= 0) { | |
132 | if(number_of_samples == 0) | |
133 | { | |
134 | length_of_genome = seq->seq.l; | |
135 | first_sequence = calloc(length_of_genome + 1, sizeof(char)); | |
136 | pseudo_reference_sequence = calloc(length_of_genome + 1, sizeof(char)); | |
137 | ||
138 | memset(first_sequence, 'N', length_of_genome); | |
139 | memset(pseudo_reference_sequence, 'N', length_of_genome); | |
140 | } | |
141 | for(i = 0; i < length_of_genome; i++) | |
142 | { | |
143 | if(first_sequence[i] == '#') | |
144 | { | |
145 | continue; | |
146 | } | |
147 | ||
148 | if(first_sequence[i] == 'N' && !is_unknown(seq->seq.s[i])) | |
149 | { | |
150 | first_sequence[i] = toupper(seq->seq.s[i]); | |
151 | pseudo_reference_sequence[i] = toupper(seq->seq.s[i]); | |
152 | } | |
153 | ||
154 | // in pure mode we only want /ACGT/i, if any other base is found the whole column is excluded | |
155 | if(pure_mode && !is_pure(seq->seq.s[i])) | |
156 | { | |
157 | first_sequence[i] = '#'; | |
158 | continue; | |
159 | } | |
160 | ||
161 | if(first_sequence[i] != '>' && !is_unknown(seq->seq.s[i]) && first_sequence[i] != 'N' && first_sequence[i] != toupper(seq->seq.s[i])) | |
162 | { | |
163 | first_sequence[i] = '>'; | |
164 | } | |
165 | } | |
166 | ||
167 | if(number_of_samples >= DEFAULT_NUM_SAMPLES) | |
168 | { | |
169 | sequence_names = realloc(sequence_names, (number_of_samples + 1) * sizeof(char*)); | |
170 | } | |
171 | sequence_names[number_of_samples] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char)); | |
172 | strcpy(sequence_names[number_of_samples], seq->name.s); | |
173 | ||
174 | number_of_samples++; | |
175 | } | |
176 | ||
177 | for(i = 0; i < length_of_genome; i++) | |
178 | { | |
179 | if(first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#')) | |
180 | { | |
181 | number_of_snps++; | |
182 | } | |
183 | } | |
184 | ||
185 | if(number_of_snps == 0) | |
186 | { | |
187 | fprintf(stderr, "Warning: No SNPs were detected so there is nothing to output.\n"); | |
188 | fflush(stderr); | |
189 | exit(EXIT_FAILURE); | |
190 | } | |
191 | ||
192 | int current_snp_index = 0; | |
193 | snp_locations = calloc(number_of_snps +1, sizeof(int)); | |
194 | for(i = 0; i < length_of_genome; i++) | |
195 | { | |
196 | if(first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#')) | |
197 | { | |
198 | snp_locations[current_snp_index] = i; | |
199 | current_snp_index++; | |
200 | } else if (constant_site_counts != NULL && is_pure(first_sequence[i])) { | |
201 | constant_site_counts[char_to_base_count_index[(int) toupper(first_sequence[i])]]++; | |
202 | } | |
203 | ||
204 | } | |
205 | ||
206 | free(first_sequence); | |
207 | kseq_destroy(seq); | |
208 | gzclose(fp); | |
209 | return; | |
128 | kseq_t *seq; | |
129 | ||
130 | fp = gzopen(filename, "r"); | |
131 | seq = kseq_init(fp); | |
132 | ||
133 | sequence_names = calloc(DEFAULT_NUM_SAMPLES, sizeof(char *)); | |
134 | ||
135 | while ((l = kseq_read(seq)) >= 0) { | |
136 | if (number_of_samples == 0) { | |
137 | length_of_genome = seq->seq.l; | |
138 | first_sequence = calloc(length_of_genome + 1, sizeof(char)); | |
139 | pseudo_reference_sequence = calloc(length_of_genome + 1, sizeof(char)); | |
140 | ||
141 | memset(first_sequence, 'N', length_of_genome); | |
142 | memset(pseudo_reference_sequence, 'N', length_of_genome); | |
143 | } | |
144 | for (i = 0; i < length_of_genome; i++) { | |
145 | if (first_sequence[i] == '#') { | |
146 | continue; | |
147 | } | |
148 | ||
149 | if (first_sequence[i] == 'N' && !is_unknown(seq->seq.s[i])) { | |
150 | first_sequence[i] = toupper(seq->seq.s[i]); | |
151 | pseudo_reference_sequence[i] = toupper(seq->seq.s[i]); | |
152 | } | |
153 | ||
154 | // in pure mode we only want /ACGT/i, if any other base is found the whole column is excluded | |
155 | if (pure_mode && !is_pure(seq->seq.s[i])) { | |
156 | first_sequence[i] = '#'; | |
157 | continue; | |
158 | } | |
159 | ||
160 | if (first_sequence[i] != '>' && !is_unknown(seq->seq.s[i]) && first_sequence[i] != 'N' && | |
161 | first_sequence[i] != toupper(seq->seq.s[i])) { | |
162 | first_sequence[i] = '>'; | |
163 | } | |
164 | } | |
165 | ||
166 | if (number_of_samples >= DEFAULT_NUM_SAMPLES) { | |
167 | sequence_names = realloc(sequence_names, (number_of_samples + 1) * sizeof(char *)); | |
168 | } | |
169 | sequence_names[number_of_samples] = calloc(MAX_SAMPLE_NAME_SIZE, sizeof(char)); | |
170 | strcpy(sequence_names[number_of_samples], seq->name.s); | |
171 | ||
172 | number_of_samples++; | |
173 | } | |
174 | ||
175 | for (i = 0; i < length_of_genome; i++) { | |
176 | if (first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#')) { | |
177 | number_of_snps++; | |
178 | } | |
179 | } | |
180 | ||
181 | if (number_of_snps == 0) { | |
182 | fprintf(stderr, "Warning: No SNPs were detected so there is nothing to output.\n"); | |
183 | fflush(stderr); | |
184 | exit(EXIT_FAILURE); | |
185 | } | |
186 | ||
187 | int current_snp_index = 0; | |
188 | snp_locations = calloc(number_of_snps + 1, sizeof(int)); | |
189 | for (i = 0; i < length_of_genome; i++) { | |
190 | if (first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#')) { | |
191 | snp_locations[current_snp_index] = i; | |
192 | current_snp_index++; | |
193 | } else if (constant_site_counts != NULL && is_pure(first_sequence[i])) { | |
194 | constant_site_counts[char_to_base_count_index[(int) toupper(first_sequence[i])]]++; | |
195 | } | |
196 | ||
197 | } | |
198 | ||
199 | free(first_sequence); | |
200 | kseq_destroy(seq); | |
201 | gzclose(fp); | |
202 | return; | |
210 | 203 | } |
211 | 204 | |
212 | 205 | int is_unknown(char base) |
213 | 206 | { |
214 | switch (base) { | |
215 | case 'N': | |
216 | case 'n': | |
217 | case '-': | |
218 | case '?': | |
219 | return 1; | |
220 | default: | |
221 | return 0; | |
222 | } | |
207 | switch (base) { | |
208 | case 'N': | |
209 | case 'n': | |
210 | case '-': | |
211 | case '?': | |
212 | return 1; | |
213 | default: | |
214 | return 0; | |
215 | } | |
223 | 216 | } |
224 | 217 | |
225 | 218 | int is_pure(char base) |
226 | 219 | { |
227 | switch (base) { | |
228 | case 'A': | |
229 | case 'C': | |
230 | case 'G': | |
231 | case 'T': | |
232 | case 'a': | |
233 | case 'c': | |
234 | case 'g': | |
235 | case 't': | |
236 | return 1; | |
237 | default: | |
238 | return 0; | |
239 | } | |
240 | } | |
220 | switch (base) { | |
221 | case 'A': | |
222 | case 'C': | |
223 | case 'G': | |
224 | case 'T': | |
225 | case 'a': | |
226 | case 'c': | |
227 | case 'g': | |
228 | case 't': | |
229 | return 1; | |
230 | default: | |
231 | return 0; | |
232 | } | |
233 | } |
21 | 21 | |
22 | 22 | #include "kseq.h" |
23 | 23 | |
24 | void detect_snps( char filename[], int pure_mode, int output_monomorphic); | |
25 | void detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts); | |
26 | void get_bases_for_each_snp(char filename[], char ** bases_for_snps); | |
24 | void detect_snps(char filename[], int pure_mode, int output_monomorphic); | |
25 | ||
26 | void | |
27 | detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts); | |
28 | ||
29 | void get_bases_for_each_snp(char filename[], char **bases_for_snps); | |
30 | ||
27 | 31 | int is_unknown(char base); |
32 | ||
28 | 33 | int get_length_of_genome(); |
34 | ||
29 | 35 | int get_number_of_samples(); |
36 | ||
30 | 37 | int get_number_of_snps(); |
31 | char ** get_sequence_names(); | |
32 | int * get_snp_locations(); | |
33 | char * get_pseudo_reference_sequence(); | |
38 | ||
39 | char **get_sequence_names(); | |
40 | ||
41 | int *get_snp_locations(); | |
42 | ||
43 | char *get_pseudo_reference_sequence(); | |
44 | ||
34 | 45 | int is_pure(char base); |
35 | 46 | |
36 | 47 | #define MAX_SAMPLE_NAME_SIZE 2048 |
23 | 23 | #include <regex.h> |
24 | 24 | #include "fasta-of-snp-sites.h" |
25 | 25 | |
26 | void create_fasta_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int output_reference, char * pseudo_reference_sequence, int snp_locations[]) | |
26 | void create_fasta_of_snp_sites(char filename[], int number_of_snps, char **bases_for_snps, char **sequence_names, | |
27 | int number_of_samples, int output_reference, char *pseudo_reference_sequence, | |
28 | int snp_locations[]) | |
27 | 29 | { |
28 | FILE *fasta_file_pointer; | |
29 | int sample_counter; | |
30 | int snp_counter; | |
30 | FILE *fasta_file_pointer; | |
31 | int sample_counter; | |
32 | int snp_counter; | |
31 | 33 | |
32 | fasta_file_pointer = fopen(filename, "w"); | |
33 | ||
34 | if(output_reference == 1) | |
35 | { | |
36 | fprintf( fasta_file_pointer, ">pseudo_reference_sequence\n"); | |
37 | for(snp_counter=0; snp_counter< number_of_snps; snp_counter++) | |
38 | { | |
39 | fputc( pseudo_reference_sequence[snp_locations[snp_counter]], fasta_file_pointer ); | |
34 | fasta_file_pointer = fopen(filename, "w"); | |
35 | ||
36 | if (output_reference == 1) { | |
37 | fprintf(fasta_file_pointer, ">pseudo_reference_sequence\n"); | |
38 | for (snp_counter = 0; snp_counter < number_of_snps; snp_counter++) { | |
39 | fputc(pseudo_reference_sequence[snp_locations[snp_counter]], fasta_file_pointer); | |
40 | } | |
41 | fprintf(fasta_file_pointer, "\n"); | |
40 | 42 | } |
41 | fprintf( fasta_file_pointer, "\n"); | |
42 | } | |
43 | ||
44 | for(sample_counter=0; sample_counter< number_of_samples; sample_counter++) | |
45 | { | |
46 | fprintf( fasta_file_pointer, ">%s\n", sequence_names[sample_counter]); | |
47 | for(snp_counter=0; snp_counter< number_of_snps; snp_counter++) | |
48 | { | |
49 | fputc( bases_for_snps[snp_counter][sample_counter], fasta_file_pointer ); | |
50 | } | |
51 | fprintf( fasta_file_pointer, "\n"); | |
52 | } | |
53 | ||
54 | fclose(fasta_file_pointer); | |
43 | ||
44 | for (sample_counter = 0; sample_counter < number_of_samples; sample_counter++) { | |
45 | fprintf(fasta_file_pointer, ">%s\n", sequence_names[sample_counter]); | |
46 | for (snp_counter = 0; snp_counter < number_of_snps; snp_counter++) { | |
47 | fputc(bases_for_snps[snp_counter][sample_counter], fasta_file_pointer); | |
48 | } | |
49 | fprintf(fasta_file_pointer, "\n"); | |
50 | } | |
51 | ||
52 | fclose(fasta_file_pointer); | |
55 | 53 | } |
22 | 22 | |
23 | 23 | #include <stdio.h> |
24 | 24 | |
25 | void create_fasta_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int output_reference, char * pseudo_reference_sequence, int snp_locations[]); | |
25 | void create_fasta_of_snp_sites(char filename[], int number_of_snps, char **bases_for_snps, char **sequence_names, | |
26 | int number_of_samples, int output_reference, char *pseudo_reference_sequence, | |
27 | int snp_locations[]); | |
26 | 28 | |
27 | 29 | #endif |
36 | 36 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) |
37 | 37 | #define KS_SEP_MAX 2 |
38 | 38 | |
39 | #define __KS_TYPE(type_t) \ | |
40 | typedef struct __kstream_t { \ | |
41 | unsigned char *buf; \ | |
42 | int begin, end, is_eof; \ | |
43 | type_t f; \ | |
44 | } kstream_t; | |
39 | #define __KS_TYPE(type_t) \ | |
40 | typedef struct __kstream_t { \ | |
41 | unsigned char *buf; \ | |
42 | int begin, end, is_eof; \ | |
43 | type_t f; \ | |
44 | } kstream_t; | |
45 | 45 | |
46 | 46 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) |
47 | 47 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) |
48 | 48 | |
49 | #define __KS_BASIC(type_t, __bufsize) \ | |
50 | static inline kstream_t *ks_init(type_t f) \ | |
51 | { \ | |
52 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ | |
53 | ks->f = f; \ | |
54 | ks->buf = (unsigned char*)malloc(__bufsize); \ | |
55 | return ks; \ | |
56 | } \ | |
57 | static inline void ks_destroy(kstream_t *ks) \ | |
58 | { \ | |
59 | if (ks) { \ | |
60 | free(ks->buf); \ | |
61 | free(ks); \ | |
62 | } \ | |
63 | } | |
64 | ||
65 | #define __KS_GETC(__read, __bufsize) \ | |
66 | static inline int ks_getc(kstream_t *ks) \ | |
67 | { \ | |
68 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ | |
69 | if (ks->begin >= ks->end) { \ | |
70 | ks->begin = 0; \ | |
71 | ks->end = __read(ks->f, ks->buf, __bufsize); \ | |
72 | if (ks->end < __bufsize) ks->is_eof = 1; \ | |
73 | if (ks->end == 0) return -1; \ | |
74 | } \ | |
75 | return (int)ks->buf[ks->begin++]; \ | |
76 | } | |
49 | #define __KS_BASIC(type_t, __bufsize) \ | |
50 | static inline kstream_t *ks_init(type_t f) \ | |
51 | { \ | |
52 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ | |
53 | ks->f = f; \ | |
54 | ks->buf = (unsigned char*)malloc(__bufsize); \ | |
55 | return ks; \ | |
56 | } \ | |
57 | static inline void ks_destroy(kstream_t *ks) \ | |
58 | { \ | |
59 | if (ks) { \ | |
60 | free(ks->buf); \ | |
61 | free(ks); \ | |
62 | } \ | |
63 | } | |
64 | ||
65 | #define __KS_GETC(__read, __bufsize) \ | |
66 | static inline int ks_getc(kstream_t *ks) \ | |
67 | { \ | |
68 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ | |
69 | if (ks->begin >= ks->end) { \ | |
70 | ks->begin = 0; \ | |
71 | ks->end = __read(ks->f, ks->buf, __bufsize); \ | |
72 | if (ks->end < __bufsize) ks->is_eof = 1; \ | |
73 | if (ks->end == 0) return -1; \ | |
74 | } \ | |
75 | return (int)ks->buf[ks->begin++]; \ | |
76 | } | |
77 | 77 | |
78 | 78 | #ifndef KSTRING_T |
79 | 79 | #define KSTRING_T kstring_t |
80 | typedef struct __kstring_t { | |
81 | size_t l, m; | |
82 | char *s; | |
80 | typedef struct __kstring_t | |
81 | { | |
82 | size_t l, m; | |
83 | char *s; | |
83 | 84 | } kstring_t; |
84 | 85 | #endif |
85 | 86 | |
87 | 88 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) |
88 | 89 | #endif |
89 | 90 | |
90 | #define __KS_GETUNTIL(__read, __bufsize) \ | |
91 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ | |
92 | { \ | |
93 | if (dret) *dret = 0; \ | |
94 | str->l = append? str->l : 0; \ | |
95 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ | |
96 | for (;;) { \ | |
97 | int i; \ | |
98 | if (ks->begin >= ks->end) { \ | |
99 | if (!ks->is_eof) { \ | |
100 | ks->begin = 0; \ | |
101 | ks->end = __read(ks->f, ks->buf, __bufsize); \ | |
102 | if (ks->end < __bufsize) ks->is_eof = 1; \ | |
103 | if (ks->end == 0) break; \ | |
104 | } else break; \ | |
105 | } \ | |
106 | if (delimiter == KS_SEP_LINE) { \ | |
107 | for (i = ks->begin; i < ks->end; ++i) \ | |
108 | if (ks->buf[i] == '\n') break; \ | |
109 | } else if (delimiter > KS_SEP_MAX) { \ | |
110 | for (i = ks->begin; i < ks->end; ++i) \ | |
111 | if (ks->buf[i] == delimiter) break; \ | |
112 | } else if (delimiter == KS_SEP_SPACE) { \ | |
113 | for (i = ks->begin; i < ks->end; ++i) \ | |
114 | if (isspace(ks->buf[i])) break; \ | |
115 | } else if (delimiter == KS_SEP_TAB) { \ | |
116 | for (i = ks->begin; i < ks->end; ++i) \ | |
117 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ | |
118 | } else i = 0; /* never come to here! */ \ | |
119 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ | |
120 | str->m = str->l + (i - ks->begin) + 1; \ | |
121 | kroundup32(str->m); \ | |
122 | str->s = (char*)realloc(str->s, str->m); \ | |
123 | } \ | |
124 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ | |
125 | str->l = str->l + (i - ks->begin); \ | |
126 | ks->begin = i + 1; \ | |
127 | if (i < ks->end) { \ | |
128 | if (dret) *dret = ks->buf[i]; \ | |
129 | break; \ | |
130 | } \ | |
131 | } \ | |
132 | if (str->s == 0) { \ | |
133 | str->m = 1; \ | |
134 | str->s = (char*)calloc(1, 1); \ | |
135 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ | |
136 | str->s[str->l] = '\0'; \ | |
137 | return str->l; \ | |
138 | } \ | |
139 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ | |
140 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } | |
91 | #define __KS_GETUNTIL(__read, __bufsize) \ | |
92 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ | |
93 | { \ | |
94 | if (dret) *dret = 0; \ | |
95 | str->l = append? str->l : 0; \ | |
96 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ | |
97 | for (;;) { \ | |
98 | int i; \ | |
99 | if (ks->begin >= ks->end) { \ | |
100 | if (!ks->is_eof) { \ | |
101 | ks->begin = 0; \ | |
102 | ks->end = __read(ks->f, ks->buf, __bufsize); \ | |
103 | if (ks->end < __bufsize) ks->is_eof = 1; \ | |
104 | if (ks->end == 0) break; \ | |
105 | } else break; \ | |
106 | } \ | |
107 | if (delimiter == KS_SEP_LINE) { \ | |
108 | for (i = ks->begin; i < ks->end; ++i) \ | |
109 | if (ks->buf[i] == '\n') break; \ | |
110 | } else if (delimiter > KS_SEP_MAX) { \ | |
111 | for (i = ks->begin; i < ks->end; ++i) \ | |
112 | if (ks->buf[i] == delimiter) break; \ | |
113 | } else if (delimiter == KS_SEP_SPACE) { \ | |
114 | for (i = ks->begin; i < ks->end; ++i) \ | |
115 | if (isspace(ks->buf[i])) break; \ | |
116 | } else if (delimiter == KS_SEP_TAB) { \ | |
117 | for (i = ks->begin; i < ks->end; ++i) \ | |
118 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ | |
119 | } else i = 0; /* never come to here! */ \ | |
120 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ | |
121 | str->m = str->l + (i - ks->begin) + 1; \ | |
122 | kroundup32(str->m); \ | |
123 | str->s = (char*)realloc(str->s, str->m); \ | |
124 | } \ | |
125 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ | |
126 | str->l = str->l + (i - ks->begin); \ | |
127 | ks->begin = i + 1; \ | |
128 | if (i < ks->end) { \ | |
129 | if (dret) *dret = ks->buf[i]; \ | |
130 | break; \ | |
131 | } \ | |
132 | } \ | |
133 | if (str->s == 0) { \ | |
134 | str->m = 1; \ | |
135 | str->s = (char*)calloc(1, 1); \ | |
136 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ | |
137 | str->s[str->l] = '\0'; \ | |
138 | return str->l; \ | |
139 | } \ | |
140 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ | |
141 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } | |
141 | 142 | |
142 | 143 | #define KSTREAM_INIT(type_t, __read, __bufsize) \ |
143 | __KS_TYPE(type_t) \ | |
144 | __KS_BASIC(type_t, __bufsize) \ | |
145 | __KS_GETC(__read, __bufsize) \ | |
146 | __KS_GETUNTIL(__read, __bufsize) | |
144 | __KS_TYPE(type_t) \ | |
145 | __KS_BASIC(type_t, __bufsize) \ | |
146 | __KS_GETC(__read, __bufsize) \ | |
147 | __KS_GETUNTIL(__read, __bufsize) | |
147 | 148 | |
148 | 149 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) |
149 | 150 | |
150 | #define __KSEQ_BASIC(SCOPE, type_t) \ | |
151 | SCOPE kseq_t *kseq_init(type_t fd) \ | |
152 | { \ | |
153 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ | |
154 | s->f = ks_init(fd); \ | |
155 | return s; \ | |
156 | } \ | |
157 | SCOPE void kseq_destroy(kseq_t *ks) \ | |
158 | { \ | |
159 | if (!ks) return; \ | |
160 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ | |
161 | ks_destroy(ks->f); \ | |
162 | free(ks); \ | |
163 | } | |
151 | #define __KSEQ_BASIC(SCOPE, type_t) \ | |
152 | SCOPE kseq_t *kseq_init(type_t fd) \ | |
153 | { \ | |
154 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ | |
155 | s->f = ks_init(fd); \ | |
156 | return s; \ | |
157 | } \ | |
158 | SCOPE void kseq_destroy(kseq_t *ks) \ | |
159 | { \ | |
160 | if (!ks) return; \ | |
161 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ | |
162 | ks_destroy(ks->f); \ | |
163 | free(ks); \ | |
164 | } | |
164 | 165 | |
165 | 166 | /* Return value: |
166 | 167 | >=0 length of the sequence (normal) |
168 | 169 | -2 truncated quality string |
169 | 170 | */ |
170 | 171 | #define __KSEQ_READ(SCOPE) \ |
171 | SCOPE int kseq_read(kseq_t *seq) \ | |
172 | { \ | |
173 | int c; \ | |
174 | kstream_t *ks = seq->f; \ | |
175 | if (seq->last_char == 0) { /* then jump to the next header line */ \ | |
176 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ | |
177 | if (c == -1) return -1; /* end of file */ \ | |
178 | seq->last_char = c; \ | |
179 | } /* else: the first header char has been read in the previous call */ \ | |
180 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ | |
181 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ | |
182 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ | |
183 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ | |
184 | seq->seq.m = 256; \ | |
185 | seq->seq.s = (char*)malloc(seq->seq.m); \ | |
186 | } \ | |
187 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ | |
188 | if (c == '\n') continue; /* skip empty lines */ \ | |
189 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ | |
190 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ | |
191 | } \ | |
192 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ | |
193 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ | |
194 | seq->seq.m = seq->seq.l + 2; \ | |
195 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ | |
196 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ | |
197 | } \ | |
198 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ | |
199 | if (c != '+') return seq->seq.l; /* FASTA */ \ | |
200 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ | |
201 | seq->qual.m = seq->seq.m; \ | |
202 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ | |
203 | } \ | |
204 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ | |
205 | if (c == -1) return -2; /* error: no quality string */ \ | |
206 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ | |
207 | seq->last_char = 0; /* we have not come to the next header line */ \ | |
208 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ | |
209 | return seq->seq.l; \ | |
210 | } | |
211 | ||
212 | #define __KSEQ_TYPE(type_t) \ | |
213 | typedef struct { \ | |
214 | kstring_t name, comment, seq, qual; \ | |
215 | int last_char; \ | |
216 | kstream_t *f; \ | |
217 | } kseq_t; | |
218 | ||
219 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ | |
220 | KSTREAM_INIT(type_t, __read, 2048) \ | |
221 | __KSEQ_TYPE(type_t) \ | |
222 | __KSEQ_BASIC(SCOPE, type_t) \ | |
223 | __KSEQ_READ(SCOPE) | |
172 | SCOPE int kseq_read(kseq_t *seq) \ | |
173 | { \ | |
174 | int c; \ | |
175 | kstream_t *ks = seq->f; \ | |
176 | if (seq->last_char == 0) { /* then jump to the next header line */ \ | |
177 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ | |
178 | if (c == -1) return -1; /* end of file */ \ | |
179 | seq->last_char = c; \ | |
180 | } /* else: the first header char has been read in the previous call */ \ | |
181 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ | |
182 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ | |
183 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ | |
184 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ | |
185 | seq->seq.m = 256; \ | |
186 | seq->seq.s = (char*)malloc(seq->seq.m); \ | |
187 | } \ | |
188 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ | |
189 | if (c == '\n') continue; /* skip empty lines */ \ | |
190 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ | |
191 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ | |
192 | } \ | |
193 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ | |
194 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ | |
195 | seq->seq.m = seq->seq.l + 2; \ | |
196 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ | |
197 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ | |
198 | } \ | |
199 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ | |
200 | if (c != '+') return seq->seq.l; /* FASTA */ \ | |
201 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ | |
202 | seq->qual.m = seq->seq.m; \ | |
203 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ | |
204 | } \ | |
205 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ | |
206 | if (c == -1) return -2; /* error: no quality string */ \ | |
207 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ | |
208 | seq->last_char = 0; /* we have not come to the next header line */ \ | |
209 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ | |
210 | return seq->seq.l; \ | |
211 | } | |
212 | ||
213 | #define __KSEQ_TYPE(type_t) \ | |
214 | typedef struct { \ | |
215 | kstring_t name, comment, seq, qual; \ | |
216 | int last_char; \ | |
217 | kstream_t *f; \ | |
218 | } kseq_t; | |
219 | ||
220 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ | |
221 | KSTREAM_INIT(type_t, __read, 2048) \ | |
222 | __KSEQ_TYPE(type_t) \ | |
223 | __KSEQ_BASIC(SCOPE, type_t) \ | |
224 | __KSEQ_READ(SCOPE) | |
224 | 225 | |
225 | 226 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) |
226 | 227 | |
227 | 228 | #define KSEQ_DECLARE(type_t) \ |
228 | __KS_TYPE(type_t) \ | |
229 | __KSEQ_TYPE(type_t) \ | |
230 | extern kseq_t *kseq_init(type_t fd); \ | |
231 | void kseq_destroy(kseq_t *ks); \ | |
232 | int kseq_read(kseq_t *seq); | |
229 | __KS_TYPE(type_t) \ | |
230 | __KSEQ_TYPE(type_t) \ | |
231 | extern kseq_t *kseq_init(type_t fd); \ | |
232 | void kseq_destroy(kseq_t *ks); \ | |
233 | int kseq_read(kseq_t *seq); | |
233 | 234 | |
234 | 235 | #endif⏎ |
31 | 31 | |
32 | 32 | static void print_usage() |
33 | 33 | { |
34 | printf("Usage: snp-sites [-rmvpcbhV] [-o output_filename] <file>\n"); | |
35 | printf("This program finds snp sites from a multi FASTA alignment file.\n"); | |
36 | printf(" -r output internal pseudo reference sequence\n"); | |
37 | printf(" -m output a multi fasta alignment file (default)\n"); | |
38 | printf(" -v output a VCF file\n"); | |
39 | printf(" -p output a phylip file\n"); | |
40 | printf(" -o STR specify an output filename [STDOUT]\n"); | |
41 | printf(" -c only output columns containing exclusively ACGT\n"); | |
42 | printf(" -C only output count of constant sites (suitable for IQ-TREE -fconst) and nothing else\n"); | |
43 | printf(" -b output monomorphic sites, used for BEAST\n"); | |
44 | printf(" -h this help message\n"); | |
45 | printf(" -V print version and exit\n"); | |
46 | printf(" <file> input alignment file which can optionally be gzipped\n\n"); | |
47 | ||
48 | printf("Example: creating files for BEAST\n"); | |
49 | printf(" snp-sites -cb -o outputfile.aln inputfile.aln\n\n"); | |
34 | printf("Usage: snp-sites [-rmvpcbhV] [-o output_filename] <file>\n"); | |
35 | printf("This program finds snp sites from a multi FASTA alignment file.\n"); | |
36 | printf(" -r output internal pseudo reference sequence\n"); | |
37 | printf(" -m output a multi fasta alignment file (default)\n"); | |
38 | printf(" -v output a VCF file\n"); | |
39 | printf(" -p output a phylip file\n"); | |
40 | printf(" -o STR specify an output filename [STDOUT]\n"); | |
41 | printf(" -c only output columns containing exclusively ACGT\n"); | |
42 | printf(" -C only output count of constant sites (suitable for IQ-TREE -fconst) and nothing else\n"); | |
43 | printf(" -b output monomorphic sites, used for BEAST\n"); | |
44 | printf(" -h this help message\n"); | |
45 | printf(" -V print version and exit\n"); | |
46 | printf(" <file> input alignment file which can optionally be gzipped\n\n"); | |
50 | 47 | |
51 | printf("If you use this program, please cite:\n"); | |
52 | printf("\"SNP-sites: rapid efficient extraction of SNPs from multi-FASTA alignments\",\n"); | |
53 | printf("Andrew J. Page, Ben Taylor, Aidan J. Delaney, Jorge Soares, Torsten Seemann, Jacqueline A. Keane, Simon R. Harris,\n"); | |
54 | printf("Microbial Genomics 2(4), (2016). http://dx.doi.org/10.1099/mgen.0.000056\n"); | |
55 | ||
48 | printf("Example: creating files for BEAST\n"); | |
49 | printf(" snp-sites -cb -o outputfile.aln inputfile.aln\n\n"); | |
50 | ||
51 | printf("If you use this program, please cite:\n"); | |
52 | printf("\"SNP-sites: rapid efficient extraction of SNPs from multi-FASTA alignments\",\n"); | |
53 | printf("Andrew J. Page, Ben Taylor, Aidan J. Delaney, Jorge Soares, Torsten Seemann, Jacqueline A. Keane, Simon R. Harris,\n"); | |
54 | printf("Microbial Genomics 2(4), (2016). http://dx.doi.org/10.1099/mgen.0.000056\n"); | |
55 | ||
56 | 56 | } |
57 | 57 | |
58 | 58 | static void print_version() |
59 | 59 | { |
60 | printf("%s %s\n", PROGRAM_NAME, PROGRAM_VERSION); | |
60 | printf("%s %s\n", PROGRAM_NAME, PROGRAM_VERSION); | |
61 | 61 | } |
62 | 62 | |
63 | int main (int argc, char **argv) { | |
64 | char multi_fasta_filename[FILENAME_MAX] = {""}; | |
65 | char output_filename[FILENAME_MAX] = {"/dev/stdout"}; | |
63 | int main(int argc, char **argv) | |
64 | { | |
65 | char multi_fasta_filename[FILENAME_MAX] = {""}; | |
66 | char output_filename[FILENAME_MAX] = {"/dev/stdout"}; | |
66 | 67 | |
67 | int c; | |
68 | int index; | |
69 | int output_multi_fasta_file = 0; | |
70 | int output_vcf_file = 0; | |
71 | int output_phylip_file = 0; | |
72 | int output_reference = 0; | |
73 | int output_constant_site_counts = 0; | |
74 | int pure_mode = 0; | |
75 | int output_monomorphic =0; | |
76 | ||
77 | while ((c = getopt (argc, argv, "mvrbpcCo:V")) != -1) | |
78 | switch (c) | |
79 | { | |
80 | case 'm': | |
81 | output_multi_fasta_file = 1; | |
82 | break; | |
83 | case 'v': | |
84 | output_vcf_file = 1; | |
85 | break; | |
86 | case 'V': | |
87 | print_version(); | |
88 | return 0; | |
89 | case 'p': | |
90 | output_phylip_file = 1; | |
91 | break; | |
92 | case 'r': | |
93 | output_reference = 1; | |
94 | break; | |
95 | case 'c': | |
96 | pure_mode = 1; | |
97 | break; | |
98 | case 'C': | |
99 | output_constant_site_counts = 1; | |
100 | break; | |
101 | case 'b': | |
102 | output_monomorphic = 1; | |
103 | break; | |
104 | case 'o': | |
105 | strncpy(output_filename, optarg, FILENAME_MAX); | |
106 | break; | |
107 | case 'h': | |
108 | print_usage(); | |
109 | exit(EXIT_SUCCESS); | |
110 | default: | |
111 | output_multi_fasta_file = 1; | |
112 | } | |
113 | ||
114 | ||
115 | if(optind < argc) | |
116 | { | |
117 | // check to see if the input alignment file exists | |
118 | if( access( argv[optind], F_OK ) == -1 ) { | |
119 | fprintf(stderr,"ERROR: cannot access input alignment file '%s'\n", argv[optind]); | |
120 | fflush(stderr); | |
121 | exit(EXIT_FAILURE); | |
68 | int c; | |
69 | int index; | |
70 | int output_multi_fasta_file = 0; | |
71 | int output_vcf_file = 0; | |
72 | int output_phylip_file = 0; | |
73 | int output_reference = 0; | |
74 | int output_constant_site_counts = 0; | |
75 | int pure_mode = 0; | |
76 | int output_monomorphic = 0; | |
77 | ||
78 | while ((c = getopt(argc, argv, "mvrbpcCo:V")) != -1) | |
79 | switch (c) { | |
80 | case 'm': | |
81 | output_multi_fasta_file = 1; | |
82 | break; | |
83 | case 'v': | |
84 | output_vcf_file = 1; | |
85 | break; | |
86 | case 'V': | |
87 | print_version(); | |
88 | return 0; | |
89 | case 'p': | |
90 | output_phylip_file = 1; | |
91 | break; | |
92 | case 'r': | |
93 | output_reference = 1; | |
94 | break; | |
95 | case 'c': | |
96 | pure_mode = 1; | |
97 | break; | |
98 | case 'C': | |
99 | output_constant_site_counts = 1; | |
100 | break; | |
101 | case 'b': | |
102 | output_monomorphic = 1; | |
103 | break; | |
104 | case 'o': | |
105 | strncpy(output_filename, optarg, FILENAME_MAX); | |
106 | break; | |
107 | case 'h': | |
108 | print_usage(); | |
109 | exit(EXIT_SUCCESS); | |
110 | default: | |
111 | output_multi_fasta_file = 1; | |
112 | } | |
113 | ||
114 | ||
115 | if (optind < argc) { | |
116 | // check to see if the input alignment file exists | |
117 | if (access(argv[optind], F_OK) == -1) { | |
118 | fprintf(stderr, "ERROR: cannot access input alignment file '%s'\n", argv[optind]); | |
119 | fflush(stderr); | |
120 | exit(EXIT_FAILURE); | |
121 | } | |
122 | ||
123 | strncpy(multi_fasta_filename, argv[optind], FILENAME_MAX); | |
124 | ||
125 | if (output_constant_site_counts) { | |
126 | count_constant_sites(multi_fasta_filename, output_filename); | |
127 | } else if (pure_mode || output_monomorphic) { | |
128 | generate_snp_sites_with_ref_pure_mono(multi_fasta_filename, | |
129 | output_multi_fasta_file, | |
130 | output_vcf_file, output_phylip_file, | |
131 | output_filename, output_reference, pure_mode, output_monomorphic); | |
132 | } else if (output_reference) { | |
133 | generate_snp_sites_with_ref(multi_fasta_filename, | |
134 | output_multi_fasta_file, | |
135 | output_vcf_file, output_phylip_file, | |
136 | output_filename); | |
137 | } else { | |
138 | generate_snp_sites(multi_fasta_filename, output_multi_fasta_file, | |
139 | output_vcf_file, output_phylip_file, | |
140 | output_filename); | |
141 | } | |
142 | } else { | |
143 | print_usage(); | |
122 | 144 | } |
123 | ||
124 | strncpy(multi_fasta_filename, argv[optind], FILENAME_MAX); | |
125 | 145 | |
126 | if (output_constant_site_counts) { | |
127 | count_constant_sites(multi_fasta_filename, output_filename); | |
128 | } else if( pure_mode || output_monomorphic) | |
129 | { | |
130 | generate_snp_sites_with_ref_pure_mono(multi_fasta_filename, | |
131 | output_multi_fasta_file, | |
132 | output_vcf_file, output_phylip_file, | |
133 | output_filename,output_reference,pure_mode,output_monomorphic); | |
134 | } | |
135 | else if(output_reference ) { | |
136 | generate_snp_sites_with_ref(multi_fasta_filename, | |
137 | output_multi_fasta_file, | |
138 | output_vcf_file, output_phylip_file, | |
139 | output_filename); | |
140 | } else { | |
141 | generate_snp_sites(multi_fasta_filename, output_multi_fasta_file, | |
142 | output_vcf_file, output_phylip_file, | |
143 | output_filename); | |
144 | } | |
145 | } | |
146 | else | |
147 | { | |
148 | print_usage(); | |
149 | } | |
150 | ||
151 | exit(EXIT_SUCCESS); | |
146 | exit(EXIT_SUCCESS); | |
152 | 147 | } |
24 | 24 | #include "phylib-of-snp-sites.h" |
25 | 25 | |
26 | 26 | |
27 | void create_phylib_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int output_reference, char * pseudo_reference_sequence, int snp_locations[]) | |
27 | void create_phylib_of_snp_sites(char filename[], int number_of_snps, char **bases_for_snps, char **sequence_names, | |
28 | int number_of_samples, int output_reference, char *pseudo_reference_sequence, | |
29 | int snp_locations[]) | |
28 | 30 | { |
29 | FILE *phylip_file_pointer; | |
30 | int sample_counter; | |
31 | int snp_counter; | |
31 | FILE *phylip_file_pointer; | |
32 | int sample_counter; | |
33 | int snp_counter; | |
32 | 34 | |
33 | phylip_file_pointer = fopen(filename, "w"); | |
34 | ||
35 | if(output_reference == 1) | |
36 | { | |
37 | fprintf( phylip_file_pointer, "%d %d\n", number_of_samples+1, number_of_snps); | |
38 | fprintf( phylip_file_pointer, "pseudo_reference_sequence\t"); | |
39 | for(snp_counter=0; snp_counter< number_of_snps; snp_counter++) | |
40 | { | |
41 | fputc( pseudo_reference_sequence[snp_locations[snp_counter]], phylip_file_pointer ); | |
35 | phylip_file_pointer = fopen(filename, "w"); | |
36 | ||
37 | if (output_reference == 1) { | |
38 | fprintf(phylip_file_pointer, "%d %d\n", number_of_samples + 1, number_of_snps); | |
39 | fprintf(phylip_file_pointer, "pseudo_reference_sequence\t"); | |
40 | for (snp_counter = 0; snp_counter < number_of_snps; snp_counter++) { | |
41 | fputc(pseudo_reference_sequence[snp_locations[snp_counter]], phylip_file_pointer); | |
42 | } | |
43 | fprintf(phylip_file_pointer, "\n"); | |
44 | } else { | |
45 | fprintf(phylip_file_pointer, "%d %d\n", number_of_samples, number_of_snps); | |
42 | 46 | } |
43 | fprintf( phylip_file_pointer, "\n"); | |
44 | } | |
45 | else | |
46 | { | |
47 | fprintf( phylip_file_pointer, "%d %d\n", number_of_samples, number_of_snps); | |
48 | } | |
49 | ||
50 | for(sample_counter=0; sample_counter< number_of_samples; sample_counter++) | |
51 | { | |
52 | // sequence_name can be more than 10 (relaxed phylib format) and contain [\w\s] | |
53 | //TODO check for illegal characters [^\w\s] | |
54 | fprintf( phylip_file_pointer, "%s\t", sequence_names[sample_counter]); | |
55 | ||
56 | for(snp_counter=0; snp_counter< number_of_snps; snp_counter++) | |
57 | { | |
58 | fputc( bases_for_snps[snp_counter][sample_counter], phylip_file_pointer); | |
59 | } | |
60 | fprintf( phylip_file_pointer, "\n"); | |
61 | } | |
62 | 47 | |
63 | fclose(phylip_file_pointer); | |
48 | for (sample_counter = 0; sample_counter < number_of_samples; sample_counter++) { | |
49 | // sequence_name can be more than 10 (relaxed phylib format) and contain [\w\s] | |
50 | //TODO check for illegal characters [^\w\s] | |
51 | fprintf(phylip_file_pointer, "%s\t", sequence_names[sample_counter]); | |
52 | ||
53 | for (snp_counter = 0; snp_counter < number_of_snps; snp_counter++) { | |
54 | fputc(bases_for_snps[snp_counter][sample_counter], phylip_file_pointer); | |
55 | } | |
56 | fprintf(phylip_file_pointer, "\n"); | |
57 | } | |
58 | ||
59 | fclose(phylip_file_pointer); | |
64 | 60 | } |
23 | 23 | |
24 | 24 | #include <stdio.h> |
25 | 25 | |
26 | void create_phylib_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int output_reference, char * pseudo_reference_sequence, int snp_locations[]); | |
26 | void create_phylib_of_snp_sites(char filename[], int number_of_snps, char **bases_for_snps, char **sequence_names, | |
27 | int number_of_samples, int output_reference, char *pseudo_reference_sequence, | |
28 | int snp_locations[]); | |
27 | 29 | |
28 | 30 | #endif |
26 | 26 | #include "phylib-of-snp-sites.h" |
27 | 27 | #include "fasta-of-snp-sites.h" |
28 | 28 | |
29 | char ** bases_for_snps; | |
29 | char **bases_for_snps; | |
30 | 30 | |
31 | 31 | static int generate_snp_sites_generic(char filename[], |
32 | 32 | int output_multi_fasta_file, |
36 | 36 | int output_reference, int pure_mode, |
37 | 37 | int output_monomorphic) |
38 | 38 | { |
39 | int i; | |
40 | detect_snps(filename, pure_mode, output_monomorphic); | |
39 | int i; | |
40 | detect_snps(filename, pure_mode, output_monomorphic); | |
41 | 41 | |
42 | bases_for_snps = calloc(get_number_of_snps()+1, sizeof(char*)); | |
43 | ||
44 | for(i = 0; i < get_number_of_snps(); i++) | |
45 | { | |
46 | bases_for_snps[i] = calloc(get_number_of_samples()+1, sizeof(char)); | |
47 | } | |
48 | ||
49 | get_bases_for_each_snp(filename, bases_for_snps); | |
50 | ||
51 | char output_filename_base[FILENAME_MAX]; | |
52 | char filename_without_directory[FILENAME_MAX]; | |
53 | strip_directory_from_filename(filename, filename_without_directory); | |
54 | strncpy(output_filename_base, filename_without_directory, FILENAME_MAX); | |
55 | ||
56 | if(output_filename != NULL && *output_filename != '\0') | |
57 | { | |
58 | strncpy(output_filename_base, output_filename, FILENAME_MAX); | |
59 | } | |
42 | bases_for_snps = calloc(get_number_of_snps() + 1, sizeof(char *)); | |
60 | 43 | |
61 | if(output_vcf_file) | |
62 | { | |
63 | char vcf_output_filename[FILENAME_MAX]; | |
64 | strncpy(vcf_output_filename, output_filename_base, FILENAME_MAX); | |
65 | if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) | |
66 | { | |
67 | strcat(vcf_output_filename, ".vcf"); | |
68 | } | |
69 | ||
70 | create_vcf_file(vcf_output_filename, get_snp_locations(), get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), get_length_of_genome(), get_pseudo_reference_sequence()); | |
71 | } | |
44 | for (i = 0; i < get_number_of_snps(); i++) { | |
45 | bases_for_snps[i] = calloc(get_number_of_samples() + 1, sizeof(char)); | |
46 | } | |
72 | 47 | |
73 | ||
74 | if(output_phylip_file) | |
75 | { | |
76 | char phylip_output_filename[FILENAME_MAX]; | |
77 | strncpy(phylip_output_filename, output_filename_base, FILENAME_MAX); | |
78 | if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) | |
79 | { | |
80 | strcat(phylip_output_filename, ".phylip"); | |
81 | } | |
82 | create_phylib_of_snp_sites(phylip_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations()); | |
83 | } | |
48 | get_bases_for_each_snp(filename, bases_for_snps); | |
84 | 49 | |
85 | if((output_multi_fasta_file) || (output_vcf_file ==0 && output_phylip_file == 0 && output_multi_fasta_file == 0)) | |
86 | { | |
87 | char multi_fasta_output_filename[FILENAME_MAX]; | |
88 | strncpy(multi_fasta_output_filename, output_filename_base, FILENAME_MAX); | |
89 | if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) | |
90 | { | |
91 | strcat(multi_fasta_output_filename, ".snp_sites.aln"); | |
92 | } | |
93 | create_fasta_of_snp_sites(multi_fasta_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations()); | |
94 | } | |
50 | char output_filename_base[FILENAME_MAX]; | |
51 | char filename_without_directory[FILENAME_MAX]; | |
52 | strip_directory_from_filename(filename, filename_without_directory); | |
53 | strncpy(output_filename_base, filename_without_directory, FILENAME_MAX); | |
95 | 54 | |
96 | // free memory | |
97 | free(get_snp_locations()); | |
98 | for(i = 0; i < get_number_of_samples(); i++) | |
99 | { | |
100 | // free(get_sequence_names().[i]); | |
101 | } | |
102 | for(i = 0; i < get_number_of_snps(); i++) | |
103 | { | |
104 | free(bases_for_snps[i]); | |
105 | } | |
106 | free(get_pseudo_reference_sequence()); | |
55 | if (output_filename != NULL && *output_filename != '\0') { | |
56 | strncpy(output_filename_base, output_filename, FILENAME_MAX); | |
57 | } | |
107 | 58 | |
108 | return 1; | |
59 | if (output_vcf_file) { | |
60 | char vcf_output_filename[FILENAME_MAX]; | |
61 | strncpy(vcf_output_filename, output_filename_base, FILENAME_MAX); | |
62 | if ((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || | |
63 | (output_filename == NULL || *output_filename == '\0')) { | |
64 | strcat(vcf_output_filename, ".vcf"); | |
65 | } | |
66 | ||
67 | create_vcf_file(vcf_output_filename, get_snp_locations(), get_number_of_snps(), bases_for_snps, | |
68 | get_sequence_names(), get_number_of_samples(), get_length_of_genome(), | |
69 | get_pseudo_reference_sequence()); | |
70 | } | |
71 | ||
72 | ||
73 | if (output_phylip_file) { | |
74 | char phylip_output_filename[FILENAME_MAX]; | |
75 | strncpy(phylip_output_filename, output_filename_base, FILENAME_MAX); | |
76 | if ((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || | |
77 | (output_filename == NULL || *output_filename == '\0')) { | |
78 | strcat(phylip_output_filename, ".phylip"); | |
79 | } | |
80 | create_phylib_of_snp_sites(phylip_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), | |
81 | get_number_of_samples(), output_reference, get_pseudo_reference_sequence(), | |
82 | get_snp_locations()); | |
83 | } | |
84 | ||
85 | if ((output_multi_fasta_file) || | |
86 | (output_vcf_file == 0 && output_phylip_file == 0 && output_multi_fasta_file == 0)) { | |
87 | char multi_fasta_output_filename[FILENAME_MAX]; | |
88 | strncpy(multi_fasta_output_filename, output_filename_base, FILENAME_MAX); | |
89 | if ((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || | |
90 | (output_filename == NULL || *output_filename == '\0')) { | |
91 | strcat(multi_fasta_output_filename, ".snp_sites.aln"); | |
92 | } | |
93 | create_fasta_of_snp_sites(multi_fasta_output_filename, get_number_of_snps(), bases_for_snps, | |
94 | get_sequence_names(), get_number_of_samples(), output_reference, | |
95 | get_pseudo_reference_sequence(), get_snp_locations()); | |
96 | } | |
97 | ||
98 | // free memory | |
99 | free(get_snp_locations()); | |
100 | for (i = 0; i < get_number_of_samples(); i++) { | |
101 | // free(get_sequence_names().[i]); | |
102 | } | |
103 | for (i = 0; i < get_number_of_snps(); i++) { | |
104 | free(bases_for_snps[i]); | |
105 | } | |
106 | free(get_pseudo_reference_sequence()); | |
107 | ||
108 | return 1; | |
109 | 109 | } |
110 | 110 | |
111 | 111 | int generate_snp_sites(char filename[], int output_multi_fasta_file, |
112 | 112 | int output_vcf_file, int output_phylip_file, |
113 | 113 | char output_filename[]) |
114 | 114 | { |
115 | return generate_snp_sites_generic(filename, output_multi_fasta_file, | |
116 | output_vcf_file, output_phylip_file, | |
117 | output_filename, 0,0,0); | |
115 | return generate_snp_sites_generic(filename, output_multi_fasta_file, | |
116 | output_vcf_file, output_phylip_file, | |
117 | output_filename, 0, 0, 0); | |
118 | 118 | } |
119 | 119 | |
120 | 120 | int generate_snp_sites_with_ref(char filename[], int output_multi_fasta_file, |
123 | 123 | { |
124 | 124 | return generate_snp_sites_generic(filename, output_multi_fasta_file, |
125 | 125 | output_vcf_file, output_phylip_file, |
126 | output_filename, 1,0,0); | |
126 | output_filename, 1, 0, 0); | |
127 | 127 | } |
128 | 128 | |
129 | 129 | int generate_snp_sites_with_ref_pure_mono(char filename[], |
130 | int output_multi_fasta_file, | |
131 | int output_vcf_file, | |
132 | int output_phylip_file, | |
133 | char output_filename[], | |
134 | int output_reference, | |
135 | int pure_mode, | |
136 | int output_monomorphic) | |
130 | int output_multi_fasta_file, | |
131 | int output_vcf_file, | |
132 | int output_phylip_file, | |
133 | char output_filename[], | |
134 | int output_reference, | |
135 | int pure_mode, | |
136 | int output_monomorphic) | |
137 | 137 | { |
138 | return generate_snp_sites_generic(filename, output_multi_fasta_file, | |
139 | output_vcf_file, output_phylip_file, | |
140 | output_filename, output_reference, pure_mode, output_monomorphic); | |
138 | return generate_snp_sites_generic(filename, output_multi_fasta_file, | |
139 | output_vcf_file, output_phylip_file, | |
140 | output_filename, output_reference, pure_mode, output_monomorphic); | |
141 | 141 | } |
142 | 142 | |
143 | void count_constant_sites(char multi_fasta_filename[], char output_filename[]) { | |
143 | void count_constant_sites(char multi_fasta_filename[], char output_filename[]) | |
144 | { | |
144 | 145 | char cwd[100]; |
145 | 146 | FILE *input_file; |
146 | 147 | FILE *output_file; |
166 | 167 | fclose(output_file); |
167 | 168 | free(constant_site_counts); |
168 | 169 | } |
170 | ||
169 | 171 | // Inefficient |
170 | void strip_directory_from_filename(char * input_filename, char * output_filename) | |
172 | void strip_directory_from_filename(char *input_filename, char *output_filename) | |
171 | 173 | { |
172 | int i; | |
173 | int end_index = 0; | |
174 | int last_forward_slash_index = -1; | |
175 | for(i = 0; i< FILENAME_MAX; i++) | |
176 | { | |
177 | if(input_filename[i] == '/') | |
178 | { | |
179 | last_forward_slash_index = i; | |
174 | int i; | |
175 | int end_index = 0; | |
176 | int last_forward_slash_index = -1; | |
177 | for (i = 0; i < FILENAME_MAX; i++) { | |
178 | if (input_filename[i] == '/') { | |
179 | last_forward_slash_index = i; | |
180 | } | |
181 | ||
182 | if (input_filename[i] == '\0' || input_filename[i] == '\n') { | |
183 | end_index = i; | |
184 | break; | |
185 | } | |
180 | 186 | } |
181 | ||
182 | if(input_filename[i] == '\0' || input_filename[i] == '\n') | |
183 | { | |
184 | end_index = i; | |
185 | break; | |
187 | ||
188 | int current_index = 0; | |
189 | for (i = last_forward_slash_index + 1; i < end_index; i++) { | |
190 | output_filename[current_index] = input_filename[i]; | |
191 | current_index++; | |
186 | 192 | } |
187 | } | |
188 | ||
189 | int current_index = 0; | |
190 | for(i = last_forward_slash_index+1; i< end_index; i++) | |
191 | { | |
192 | output_filename[current_index] = input_filename[i]; | |
193 | current_index++; | |
194 | } | |
195 | output_filename[current_index] = '\0'; | |
193 | output_filename[current_index] = '\0'; | |
196 | 194 | } |
197 | 195 | |
198 | 196 |
30 | 30 | int output_vcf_file, |
31 | 31 | int output_phylip_file, |
32 | 32 | char output_filename[]); |
33 | ||
33 | 34 | int generate_snp_sites_with_ref(char filename[], |
34 | 35 | int output_multi_fasta_file, |
35 | 36 | int output_vcf_file, |
36 | 37 | int output_phylip_file, |
37 | 38 | char output_filename[]); |
38 | ||
39 | ||
39 | 40 | int generate_snp_sites_with_ref_pure_mono(char filename[], |
40 | int output_multi_fasta_file, | |
41 | int output_vcf_file, | |
42 | int output_phylip_file, | |
43 | char output_filename[], | |
44 | int output_reference, | |
45 | int pure_mode, | |
46 | int output_monomorphic); | |
41 | int output_multi_fasta_file, | |
42 | int output_vcf_file, | |
43 | int output_phylip_file, | |
44 | char output_filename[], | |
45 | int output_reference, | |
46 | int pure_mode, | |
47 | int output_monomorphic); | |
47 | 48 | |
48 | 49 | void count_constant_sites(char multi_fasta_filename[], char filename[]); |
49 | 50 |
27 | 27 | #include "snp-sites.h" |
28 | 28 | #include <assert.h> |
29 | 29 | |
30 | void create_vcf_file(char filename[], int snp_locations[],int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, size_t length_of_genome, char pseudo_reference_sequence[]) | |
31 | { | |
32 | FILE *vcf_file_pointer; | |
33 | char * base_filename; | |
34 | base_filename = (char *) malloc(FILENAME_MAX*sizeof(char)); | |
35 | strcpy(base_filename, filename); | |
36 | ||
37 | vcf_file_pointer=fopen(base_filename, "w"); | |
38 | output_vcf_header(vcf_file_pointer,sequence_names, number_of_samples, (int) length_of_genome); | |
39 | output_vcf_snps(vcf_file_pointer, bases_for_snps, snp_locations, number_of_snps, number_of_samples,pseudo_reference_sequence); | |
40 | fclose(vcf_file_pointer); | |
41 | free(base_filename); | |
42 | } | |
43 | ||
44 | void output_vcf_snps(FILE * vcf_file_pointer, char ** bases_for_snps, int * snp_locations, int number_of_snps, int number_of_samples, char pseudo_reference_sequence[]) | |
45 | { | |
46 | int i; | |
47 | for(i=0; i < number_of_snps; i++) | |
48 | { | |
49 | output_vcf_row(vcf_file_pointer, bases_for_snps[i], snp_locations[i], number_of_samples,pseudo_reference_sequence); | |
50 | } | |
51 | } | |
52 | ||
53 | void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int number_of_samples, size_t length_of_genome) | |
54 | { | |
55 | int i; | |
56 | fprintf( vcf_file_pointer, "##fileformat=VCFv4.1\n" ); | |
57 | fprintf( vcf_file_pointer, "##contig=<ID=1,length=%i>\n", (int) length_of_genome ); | |
58 | fprintf( vcf_file_pointer, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n" ); | |
59 | fprintf( vcf_file_pointer, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT" ); | |
60 | ||
61 | for(i=0; i<number_of_samples; i++) | |
62 | { | |
63 | fprintf( vcf_file_pointer, "\t%s", sequence_names[i]); | |
64 | } | |
65 | fprintf( vcf_file_pointer, "\n"); | |
66 | } | |
67 | ||
68 | void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_location, int number_of_samples, char pseudo_reference_sequence[]) | |
69 | { | |
70 | char reference_base = pseudo_reference_sequence[snp_location]; | |
71 | if(reference_base == '\0') | |
72 | { | |
73 | fprintf(stderr, "Couldnt get the reference base for coordinate %d\n",snp_location); | |
74 | fflush(stderr); | |
75 | exit(EXIT_FAILURE); | |
76 | } | |
77 | ||
78 | // Chromosome | |
79 | fprintf( vcf_file_pointer, "1\t"); | |
80 | ||
81 | // Position | |
82 | fprintf( vcf_file_pointer, "%d\t", (int) snp_location +1 ); | |
83 | ||
84 | //ID | |
85 | fprintf( vcf_file_pointer, ".\t"); | |
86 | ||
87 | // REF | |
88 | fprintf( vcf_file_pointer, "%c\t", (char) reference_base ); | |
89 | ||
90 | // ALT | |
91 | // Need to look through list and find unique characters | |
92 | ||
93 | char * alt_bases = alternative_bases(reference_base, bases_for_snp, number_of_samples); | |
94 | char * alternative_bases_string = format_alternative_bases(alt_bases); | |
95 | fprintf( vcf_file_pointer, "%s\t", alternative_bases_string ); | |
96 | free(alternative_bases_string); | |
97 | ||
98 | // QUAL | |
99 | fprintf( vcf_file_pointer, ".\t"); | |
100 | ||
101 | // FILTER | |
102 | fprintf( vcf_file_pointer, ".\t"); | |
103 | ||
104 | // INFO | |
105 | fprintf( vcf_file_pointer, ".\t"); | |
106 | ||
107 | // FORMAT | |
108 | fprintf( vcf_file_pointer, "GT\t"); | |
109 | ||
110 | // Bases for each sample | |
111 | output_vcf_row_samples_bases(vcf_file_pointer, reference_base, alt_bases, bases_for_snp, number_of_samples ); | |
112 | free(alt_bases); | |
113 | ||
114 | fprintf( vcf_file_pointer, "\n"); | |
115 | ||
116 | } | |
117 | ||
118 | ||
119 | char * alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples) | |
120 | { | |
121 | int i; | |
122 | int num_alt_bases = 0; | |
123 | char * alt_bases = calloc(MAXIMUM_NUMBER_OF_ALT_BASES+1, sizeof(char)); | |
124 | for(i=0; i< number_of_samples; i++ ) | |
125 | { | |
126 | char current_base = bases_for_snp[i]; | |
127 | if(current_base != reference_base) | |
128 | { | |
129 | if(is_unknown(current_base)) | |
130 | { | |
131 | // VCF spec only allows ACGTN* for alts | |
132 | current_base = '*'; | |
133 | } | |
134 | ||
135 | if(check_if_char_in_string(alt_bases, current_base, num_alt_bases) == 0) | |
136 | { | |
137 | if (num_alt_bases >= MAXIMUM_NUMBER_OF_ALT_BASES) | |
138 | { | |
139 | fprintf(stderr, "Unexpectedly large number of alternative bases found between sequences. Please check input file is not corrupted\n\n"); | |
140 | fflush(stderr); | |
141 | exit(EXIT_FAILURE); | |
142 | } | |
143 | alt_bases[num_alt_bases] = current_base; | |
144 | num_alt_bases++; | |
145 | } | |
146 | } | |
147 | } | |
148 | alt_bases[num_alt_bases] = '\0'; | |
149 | return alt_bases; | |
150 | } | |
151 | ||
152 | char * format_allele_index(char base, char reference_base, char * alt_bases) | |
153 | { | |
154 | int length_of_alt_bases = strlen(alt_bases); | |
155 | assert(length_of_alt_bases < 100); | |
156 | char * result = calloc(5, sizeof(char)); | |
157 | int index; | |
158 | ||
159 | if(is_unknown(base)) | |
160 | { | |
161 | base = '*'; | |
162 | } | |
163 | ||
164 | if (reference_base == base) | |
165 | { | |
166 | sprintf(result, "0"); | |
167 | } | |
168 | else | |
169 | { | |
170 | sprintf(result, "."); | |
171 | for (index = 1; index <= length_of_alt_bases; index++) | |
172 | { | |
173 | if (alt_bases[index-1] == base) | |
174 | { | |
175 | sprintf(result, "%i", index); | |
176 | break; | |
177 | } | |
178 | } | |
179 | } | |
180 | return result; | |
181 | } | |
182 | ||
183 | char * format_alternative_bases(char * alt_bases) | |
184 | { | |
185 | int number_of_alt_bases = strlen(alt_bases); | |
186 | assert( number_of_alt_bases < MAXIMUM_NUMBER_OF_ALT_BASES ); | |
187 | ||
188 | if(number_of_alt_bases ==0) | |
189 | { | |
190 | char * formatted_alt_bases = calloc(2 + 1, sizeof(char)); | |
191 | formatted_alt_bases[0] = '.'; | |
30 | void | |
31 | create_vcf_file(char filename[], int snp_locations[], int number_of_snps, char **bases_for_snps, char **sequence_names, | |
32 | int number_of_samples, size_t length_of_genome, char pseudo_reference_sequence[]) | |
33 | { | |
34 | FILE *vcf_file_pointer; | |
35 | char *base_filename; | |
36 | base_filename = (char *) malloc(FILENAME_MAX * sizeof(char)); | |
37 | strcpy(base_filename, filename); | |
38 | ||
39 | vcf_file_pointer = fopen(base_filename, "w"); | |
40 | output_vcf_header(vcf_file_pointer, sequence_names, number_of_samples, (int) length_of_genome); | |
41 | output_vcf_snps(vcf_file_pointer, bases_for_snps, snp_locations, number_of_snps, number_of_samples, | |
42 | pseudo_reference_sequence); | |
43 | fclose(vcf_file_pointer); | |
44 | free(base_filename); | |
45 | } | |
46 | ||
47 | void output_vcf_snps(FILE *vcf_file_pointer, char **bases_for_snps, int *snp_locations, int number_of_snps, | |
48 | int number_of_samples, char pseudo_reference_sequence[]) | |
49 | { | |
50 | int i; | |
51 | for (i = 0; i < number_of_snps; i++) { | |
52 | output_vcf_row(vcf_file_pointer, bases_for_snps[i], snp_locations[i], number_of_samples, | |
53 | pseudo_reference_sequence); | |
54 | } | |
55 | } | |
56 | ||
57 | void output_vcf_header(FILE *vcf_file_pointer, char **sequence_names, int number_of_samples, size_t length_of_genome) | |
58 | { | |
59 | int i; | |
60 | fprintf(vcf_file_pointer, "##fileformat=VCFv4.1\n"); | |
61 | fprintf(vcf_file_pointer, "##contig=<ID=1,length=%i>\n", (int) length_of_genome); | |
62 | fprintf(vcf_file_pointer, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"); | |
63 | fprintf(vcf_file_pointer, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); | |
64 | ||
65 | for (i = 0; i < number_of_samples; i++) { | |
66 | fprintf(vcf_file_pointer, "\t%s", sequence_names[i]); | |
67 | } | |
68 | fprintf(vcf_file_pointer, "\n"); | |
69 | } | |
70 | ||
71 | void output_vcf_row(FILE *vcf_file_pointer, char *bases_for_snp, int snp_location, int number_of_samples, | |
72 | char pseudo_reference_sequence[]) | |
73 | { | |
74 | char reference_base = pseudo_reference_sequence[snp_location]; | |
75 | if (reference_base == '\0') { | |
76 | fprintf(stderr, "Couldnt get the reference base for coordinate %d\n", snp_location); | |
77 | fflush(stderr); | |
78 | exit(EXIT_FAILURE); | |
79 | } | |
80 | ||
81 | // Chromosome | |
82 | fprintf(vcf_file_pointer, "1\t"); | |
83 | ||
84 | // Position | |
85 | fprintf(vcf_file_pointer, "%d\t", (int) snp_location + 1); | |
86 | ||
87 | //ID | |
88 | fprintf(vcf_file_pointer, ".\t"); | |
89 | ||
90 | // REF | |
91 | fprintf(vcf_file_pointer, "%c\t", (char) reference_base); | |
92 | ||
93 | // ALT | |
94 | // Need to look through list and find unique characters | |
95 | ||
96 | char *alt_bases = alternative_bases(reference_base, bases_for_snp, number_of_samples); | |
97 | char *alternative_bases_string = format_alternative_bases(alt_bases); | |
98 | fprintf(vcf_file_pointer, "%s\t", alternative_bases_string); | |
99 | free(alternative_bases_string); | |
100 | ||
101 | // QUAL | |
102 | fprintf(vcf_file_pointer, ".\t"); | |
103 | ||
104 | // FILTER | |
105 | fprintf(vcf_file_pointer, ".\t"); | |
106 | ||
107 | // INFO | |
108 | fprintf(vcf_file_pointer, ".\t"); | |
109 | ||
110 | // FORMAT | |
111 | fprintf(vcf_file_pointer, "GT\t"); | |
112 | ||
113 | // Bases for each sample | |
114 | output_vcf_row_samples_bases(vcf_file_pointer, reference_base, alt_bases, bases_for_snp, number_of_samples); | |
115 | free(alt_bases); | |
116 | ||
117 | fprintf(vcf_file_pointer, "\n"); | |
118 | ||
119 | } | |
120 | ||
121 | ||
122 | char *alternative_bases(char reference_base, char *bases_for_snp, int number_of_samples) | |
123 | { | |
124 | int i; | |
125 | int num_alt_bases = 0; | |
126 | char *alt_bases = calloc(MAXIMUM_NUMBER_OF_ALT_BASES + 1, sizeof(char)); | |
127 | for (i = 0; i < number_of_samples; i++) { | |
128 | char current_base = bases_for_snp[i]; | |
129 | if (current_base != reference_base) { | |
130 | if (is_unknown(current_base)) { | |
131 | // VCF spec only allows ACGTN* for alts | |
132 | current_base = '*'; | |
133 | } | |
134 | ||
135 | if (check_if_char_in_string(alt_bases, current_base, num_alt_bases) == 0) { | |
136 | if (num_alt_bases >= MAXIMUM_NUMBER_OF_ALT_BASES) { | |
137 | fprintf(stderr, | |
138 | "Unexpectedly large number of alternative bases found between sequences. Please check input file is not corrupted\n\n"); | |
139 | fflush(stderr); | |
140 | exit(EXIT_FAILURE); | |
141 | } | |
142 | alt_bases[num_alt_bases] = current_base; | |
143 | num_alt_bases++; | |
144 | } | |
145 | } | |
146 | } | |
147 | alt_bases[num_alt_bases] = '\0'; | |
148 | return alt_bases; | |
149 | } | |
150 | ||
151 | char *format_allele_index(char base, char reference_base, char *alt_bases) | |
152 | { | |
153 | int length_of_alt_bases = strlen(alt_bases); | |
154 | assert(length_of_alt_bases < 100); | |
155 | char *result = calloc(5, sizeof(char)); | |
156 | int index; | |
157 | ||
158 | if (is_unknown(base)) { | |
159 | base = '*'; | |
160 | } | |
161 | ||
162 | if (reference_base == base) { | |
163 | sprintf(result, "0"); | |
164 | } else { | |
165 | sprintf(result, "."); | |
166 | for (index = 1; index <= length_of_alt_bases; index++) { | |
167 | if (alt_bases[index - 1] == base) { | |
168 | sprintf(result, "%i", index); | |
169 | break; | |
170 | } | |
171 | } | |
172 | } | |
173 | return result; | |
174 | } | |
175 | ||
176 | char *format_alternative_bases(char *alt_bases) | |
177 | { | |
178 | int number_of_alt_bases = strlen(alt_bases); | |
179 | assert(number_of_alt_bases < MAXIMUM_NUMBER_OF_ALT_BASES); | |
180 | ||
181 | if (number_of_alt_bases == 0) { | |
182 | char *formatted_alt_bases = calloc(2 + 1, sizeof(char)); | |
183 | formatted_alt_bases[0] = '.'; | |
184 | return formatted_alt_bases; | |
185 | } | |
186 | ||
187 | char *formatted_alt_bases = calloc(number_of_alt_bases * 2 + 1, sizeof(char)); | |
188 | int i; | |
189 | formatted_alt_bases[0] = alt_bases[0]; | |
190 | for (i = 1; i < number_of_alt_bases; i++) { | |
191 | formatted_alt_bases[i * 2 - 1] = ','; | |
192 | formatted_alt_bases[i * 2] = alt_bases[i]; | |
193 | } | |
192 | 194 | return formatted_alt_bases; |
193 | } | |
194 | ||
195 | char * formatted_alt_bases = calloc(number_of_alt_bases*2 + 1, sizeof(char)); | |
196 | int i; | |
197 | formatted_alt_bases[0] = alt_bases[0]; | |
198 | for (i = 1; i < number_of_alt_bases; i++) | |
199 | { | |
200 | formatted_alt_bases[i*2 - 1] = ','; | |
201 | formatted_alt_bases[i*2] = alt_bases[i]; | |
202 | } | |
203 | return formatted_alt_bases; | |
204 | 195 | } |
205 | 196 | |
206 | 197 | int check_if_char_in_string(char search_string[], char target_char, int search_string_length) |
207 | 198 | { |
208 | int i; | |
209 | for(i=0; i < search_string_length ; i++ ) | |
210 | { | |
211 | if(search_string[i] == target_char) | |
212 | { | |
213 | return 1; | |
214 | } | |
215 | } | |
216 | return 0; | |
217 | } | |
218 | ||
219 | void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * alt_bases, char * bases_for_snp, int number_of_samples) | |
220 | { | |
221 | int i; | |
222 | char * format; | |
223 | ||
224 | for(i=0; i < number_of_samples ; i++ ) | |
225 | { | |
226 | format = format_allele_index(bases_for_snp[i], reference_base, alt_bases); | |
227 | fprintf( vcf_file_pointer, "%s", format); | |
228 | free(format); | |
229 | if(i+1 != number_of_samples) | |
230 | { | |
231 | fprintf( vcf_file_pointer, "\t"); | |
232 | } | |
233 | } | |
234 | } | |
235 | ||
236 | ||
237 | ||
238 | ||
199 | int i; | |
200 | for (i = 0; i < search_string_length; i++) { | |
201 | if (search_string[i] == target_char) { | |
202 | return 1; | |
203 | } | |
204 | } | |
205 | return 0; | |
206 | } | |
207 | ||
208 | void output_vcf_row_samples_bases(FILE *vcf_file_pointer, char reference_base, char *alt_bases, char *bases_for_snp, | |
209 | int number_of_samples) | |
210 | { | |
211 | int i; | |
212 | char *format; | |
213 | ||
214 | for (i = 0; i < number_of_samples; i++) { | |
215 | format = format_allele_index(bases_for_snp[i], reference_base, alt_bases); | |
216 | fprintf(vcf_file_pointer, "%s", format); | |
217 | free(format); | |
218 | if (i + 1 != number_of_samples) { | |
219 | fprintf(vcf_file_pointer, "\t"); | |
220 | } | |
221 | } | |
222 | } | |
223 | ||
224 | ||
225 | ||
226 |
22 | 22 | |
23 | 23 | #include <stdio.h> |
24 | 24 | |
25 | void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int number_of_samples, size_t length_of_genome); | |
26 | void create_vcf_file(char filename[], int snp_locations[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, size_t length_of_genome, char pseudo_reference_sequence[]); | |
27 | void output_vcf_snps(FILE * vcf_file_pointer, char ** bases_for_snps, int * snp_locations, int number_of_snps, int number_of_samples, char pseudo_reference_sequence[]); | |
28 | void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_location, int number_of_samples, char pseudo_reference_sequence[]); | |
29 | void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * alt_bases, char * bases_for_snp, int number_of_samples); | |
30 | char * alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples); | |
31 | char * format_alternative_bases(char *); | |
32 | char * format_allele_index(char, char, char *); | |
25 | void output_vcf_header(FILE *vcf_file_pointer, char **sequence_names, int number_of_samples, size_t length_of_genome); | |
26 | ||
27 | void | |
28 | create_vcf_file(char filename[], int snp_locations[], int number_of_snps, char **bases_for_snps, char **sequence_names, | |
29 | int number_of_samples, size_t length_of_genome, char pseudo_reference_sequence[]); | |
30 | ||
31 | void output_vcf_snps(FILE *vcf_file_pointer, char **bases_for_snps, int *snp_locations, int number_of_snps, | |
32 | int number_of_samples, char pseudo_reference_sequence[]); | |
33 | ||
34 | void output_vcf_row(FILE *vcf_file_pointer, char *bases_for_snp, int snp_location, int number_of_samples, | |
35 | char pseudo_reference_sequence[]); | |
36 | ||
37 | void output_vcf_row_samples_bases(FILE *vcf_file_pointer, char reference_base, char *alt_bases, char *bases_for_snp, | |
38 | int number_of_samples); | |
39 | ||
40 | char *alternative_bases(char reference_base, char *bases_for_snp, int number_of_samples); | |
41 | ||
42 | char *format_alternative_bases(char *); | |
43 | ||
44 | char *format_allele_index(char, char, char *); | |
45 | ||
33 | 46 | int check_if_char_in_string(char search_string[], char target_char, int search_string_length); |
34 | 47 | |
35 | 48 | #define MAXIMUM_NUMBER_OF_ALT_BASES 30 |
34 | 34 | #define actual_ref_seq_length2 10 //Former was 9 |
35 | 35 | |
36 | 36 | START_TEST (valid_alignment_with_one_line_per_sequence) |
37 | { | |
38 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",1,1,1,""); | |
39 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "alignment_file_one_line_per_sequence.aln.vcf" ) == 1, "Invalid VCF file for 1 line per seq" ); | |
40 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "alignment_file_one_line_per_sequence.aln.phylip" ) == 1, "Invalid Phylip file for 1 line per seq" ); | |
41 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1 , "Invalid ALN file for 1 line per seq"); | |
42 | remove("alignment_file_one_line_per_sequence.aln.vcf"); | |
43 | remove("alignment_file_one_line_per_sequence.aln.phylip"); | |
44 | remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln"); | |
45 | } | |
46 | END_TEST | |
47 | ||
48 | ||
49 | START_TEST (valid_alignment_with_n_as_gap) | |
50 | { | |
51 | generate_snp_sites("../tests/data/alignment_file_with_n.aln",1,1,1, ""); | |
52 | fail_unless( compare_files("../tests/data/alignment_file_with_n.aln.vcf", "alignment_file_with_n.aln.vcf" ) == 1, "Invalid VCF file for 1 line per seq" ); | |
53 | fail_unless( compare_files("../tests/data/alignment_file_with_n.aln.phylip", "alignment_file_with_n.aln.phylip" ) == 1, "Invalid Phylip file for 1 line per seq" ); | |
54 | fail_unless( compare_files("../tests/data/alignment_file_with_n.aln.snp_sites.aln","alignment_file_with_n.aln.snp_sites.aln" ) == 1 , "Invalid ALN file for 1 line per seq"); | |
55 | remove("alignment_file_with_n.aln.vcf"); | |
56 | remove("alignment_file_with_n.aln.phylip"); | |
57 | remove("alignment_file_with_n.aln.snp_sites.aln"); | |
58 | } | |
59 | END_TEST | |
60 | ||
61 | ||
62 | ||
63 | START_TEST (valid_alignment_with_one_line_per_sequence_gzipped) | |
64 | { | |
65 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln.gz",1,1,1, NULL); | |
66 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "alignment_file_one_line_per_sequence.aln.gz.vcf" ) == 1, "Invalid VCF file for 1 line per seq" ); | |
67 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "alignment_file_one_line_per_sequence.aln.gz.phylip" ) == 1, "Invalid Phylip file for 1 line per seq" ); | |
68 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_one_line_per_sequence.aln.gz.snp_sites.aln" ) == 1 , "Invalid ALN file for 1 line per seq"); | |
69 | remove("alignment_file_one_line_per_sequence.aln.gz.vcf"); | |
70 | remove("alignment_file_one_line_per_sequence.aln.gz.phylip"); | |
71 | remove("alignment_file_one_line_per_sequence.aln.gz.snp_sites.aln"); | |
72 | } | |
73 | END_TEST | |
74 | ||
75 | START_TEST (valid_alignment_with_multiple_lines_per_sequence) | |
76 | { | |
77 | generate_snp_sites("../tests/data/alignment_file_multiple_lines_per_sequence.aln",1,1,1,""); | |
78 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "alignment_file_multiple_lines_per_sequence.aln.vcf" ) == 1, "Invalid VCF file for multiple lines per seq" ); | |
79 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "alignment_file_multiple_lines_per_sequence.aln.phylip" ) == 1, "Invalid Phylip file for multiple lines per seq" ); | |
80 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_multiple_lines_per_sequence.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq"); | |
37 | { | |
38 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 1, 1, 1, ""); | |
39 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "alignment_file_one_line_per_sequence.aln.vcf" ) == 1, "Invalid VCF file for 1 line per seq" ); | |
40 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "alignment_file_one_line_per_sequence.aln.phylip" ) == 1, "Invalid Phylip file for 1 line per seq" ); | |
41 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", "alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1, "Invalid ALN file for 1 line per seq"); | |
42 | remove("alignment_file_one_line_per_sequence.aln.vcf"); | |
43 | remove("alignment_file_one_line_per_sequence.aln.phylip"); | |
44 | remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln"); | |
45 | } | |
46 | ||
47 | END_TEST | |
48 | ||
49 | ||
50 | START_TEST(valid_alignment_with_n_as_gap) | |
51 | { | |
52 | generate_snp_sites("../tests/data/alignment_file_with_n.aln", 1, 1, 1, ""); | |
53 | fail_unless(compare_files("../tests/data/alignment_file_with_n.aln.vcf", "alignment_file_with_n.aln.vcf") == 1, | |
54 | "Invalid VCF file for 1 line per seq"); | |
55 | fail_unless( | |
56 | compare_files("../tests/data/alignment_file_with_n.aln.phylip", "alignment_file_with_n.aln.phylip") == 1, | |
57 | "Invalid Phylip file for 1 line per seq"); | |
58 | fail_unless(compare_files("../tests/data/alignment_file_with_n.aln.snp_sites.aln", | |
59 | "alignment_file_with_n.aln.snp_sites.aln") == 1, "Invalid ALN file for 1 line per seq"); | |
60 | remove("alignment_file_with_n.aln.vcf"); | |
61 | remove("alignment_file_with_n.aln.phylip"); | |
62 | remove("alignment_file_with_n.aln.snp_sites.aln"); | |
63 | } | |
64 | ||
65 | END_TEST | |
66 | ||
67 | ||
68 | START_TEST(valid_alignment_with_one_line_per_sequence_gzipped) | |
69 | { | |
70 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln.gz", 1, 1, 1, NULL); | |
71 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", | |
72 | "alignment_file_one_line_per_sequence.aln.gz.vcf") == 1, | |
73 | "Invalid VCF file for 1 line per seq"); | |
74 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", | |
75 | "alignment_file_one_line_per_sequence.aln.gz.phylip") == 1, | |
76 | "Invalid Phylip file for 1 line per seq"); | |
77 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", | |
78 | "alignment_file_one_line_per_sequence.aln.gz.snp_sites.aln") == 1, | |
79 | "Invalid ALN file for 1 line per seq"); | |
80 | remove("alignment_file_one_line_per_sequence.aln.gz.vcf"); | |
81 | remove("alignment_file_one_line_per_sequence.aln.gz.phylip"); | |
82 | remove("alignment_file_one_line_per_sequence.aln.gz.snp_sites.aln"); | |
83 | } | |
84 | ||
85 | END_TEST | |
86 | ||
87 | START_TEST(valid_alignment_with_multiple_lines_per_sequence) | |
88 | { | |
89 | generate_snp_sites("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 1, 1, 1, ""); | |
90 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", | |
91 | "alignment_file_multiple_lines_per_sequence.aln.vcf") == 1, | |
92 | "Invalid VCF file for multiple lines per seq"); | |
93 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", | |
94 | "alignment_file_multiple_lines_per_sequence.aln.phylip") == 1, | |
95 | "Invalid Phylip file for multiple lines per seq"); | |
96 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", | |
97 | "alignment_file_multiple_lines_per_sequence.aln.snp_sites.aln") == 1, | |
98 | "Invalid ALN file for multiple lines per seq"); | |
81 | 99 | remove("alignment_file_multiple_lines_per_sequence.aln.vcf"); |
82 | 100 | remove("alignment_file_multiple_lines_per_sequence.aln.phylip"); |
83 | 101 | remove("alignment_file_multiple_lines_per_sequence.aln.snp_sites.aln"); |
84 | 102 | } |
85 | END_TEST | |
86 | ||
87 | START_TEST (valid_alignment_with_pure_mode) | |
88 | { | |
89 | generate_snp_sites_with_ref_pure_mono("../tests/data/pure_mode_alignment.aln",1,1,1,"",0,1,0); | |
90 | fail_unless( compare_files("../tests/data/pure_mode_alignment.aln.vcf", "pure_mode_alignment.aln.vcf" ) == 1, "Invalid VCF file for multiple lines per seq" ); | |
91 | fail_unless( compare_files("../tests/data/pure_mode_alignment.aln.phylip", "pure_mode_alignment.aln.phylip" ) == 1, "Invalid Phylip file for multiple lines per seq" ); | |
92 | fail_unless( compare_files("../tests/data/pure_mode_alignment.aln.snp_sites.aln","pure_mode_alignment.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq"); | |
103 | ||
104 | END_TEST | |
105 | ||
106 | START_TEST(valid_alignment_with_pure_mode) | |
107 | { | |
108 | generate_snp_sites_with_ref_pure_mono("../tests/data/pure_mode_alignment.aln", 1, 1, 1, "", 0, 1, 0); | |
109 | fail_unless(compare_files("../tests/data/pure_mode_alignment.aln.vcf", "pure_mode_alignment.aln.vcf") == 1, | |
110 | "Invalid VCF file for multiple lines per seq"); | |
111 | fail_unless(compare_files("../tests/data/pure_mode_alignment.aln.phylip", "pure_mode_alignment.aln.phylip") == 1, | |
112 | "Invalid Phylip file for multiple lines per seq"); | |
113 | fail_unless(compare_files("../tests/data/pure_mode_alignment.aln.snp_sites.aln", | |
114 | "pure_mode_alignment.aln.snp_sites.aln") == 1, | |
115 | "Invalid ALN file for multiple lines per seq"); | |
93 | 116 | remove("pure_mode_alignment.aln.vcf"); |
94 | 117 | remove("pure_mode_alignment.aln.phylip"); |
95 | 118 | remove("pure_mode_alignment.aln.snp_sites.aln"); |
96 | 119 | } |
97 | END_TEST | |
98 | ||
99 | START_TEST (valid_alignment_with_monomorphic_sites) | |
100 | { | |
101 | generate_snp_sites_with_ref_pure_mono("../tests/data/pure_mode_monomorphic_alignment.aln",1,1,1,"",0,1,1); | |
102 | fail_unless( compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.vcf", "pure_mode_monomorphic_alignment.aln.vcf" ) == 1, "Invalid VCF file for multiple lines per seq" ); | |
103 | fail_unless( compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.phylip", "pure_mode_monomorphic_alignment.aln.phylip" ) == 1, "Invalid Phylip file for multiple lines per seq" ); | |
104 | fail_unless( compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.snp_sites.aln","pure_mode_monomorphic_alignment.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq"); | |
120 | ||
121 | END_TEST | |
122 | ||
123 | START_TEST(valid_alignment_with_monomorphic_sites) | |
124 | { | |
125 | generate_snp_sites_with_ref_pure_mono("../tests/data/pure_mode_monomorphic_alignment.aln", 1, 1, 1, "", 0, 1, 1); | |
126 | fail_unless(compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.vcf", | |
127 | "pure_mode_monomorphic_alignment.aln.vcf") == 1, | |
128 | "Invalid VCF file for multiple lines per seq"); | |
129 | fail_unless(compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.phylip", | |
130 | "pure_mode_monomorphic_alignment.aln.phylip") == 1, | |
131 | "Invalid Phylip file for multiple lines per seq"); | |
132 | fail_unless(compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.snp_sites.aln", | |
133 | "pure_mode_monomorphic_alignment.aln.snp_sites.aln") == 1, | |
134 | "Invalid ALN file for multiple lines per seq"); | |
105 | 135 | remove("pure_mode_monomorphic_alignment.aln.vcf"); |
106 | 136 | remove("pure_mode_monomorphic_alignment.aln.phylip"); |
107 | 137 | remove("pure_mode_monomorphic_alignment.aln.snp_sites.aln"); |
108 | 138 | } |
109 | END_TEST | |
110 | ||
111 | ||
112 | START_TEST (valid_with_only_aln_file_output_default) | |
113 | { | |
114 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",0,0,0,""); | |
115 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq"); | |
116 | remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln"); | |
117 | } | |
118 | END_TEST | |
119 | ||
120 | START_TEST (valid_with_only_aln_file_output) | |
121 | { | |
122 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",1,0,0,""); | |
123 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq"); | |
124 | remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln"); | |
125 | } | |
126 | END_TEST | |
127 | ||
128 | START_TEST (valid_with_only_aln_file_output_with_custom_name) | |
129 | { | |
130 | char output_filename[100] = {"some_custom_name"}; | |
131 | generate_snp_sites("../tests/data/alignment_file_multiple_lines_per_sequence.aln",1,0,0,output_filename); | |
132 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","some_custom_name" ) == 1 ,"Only aln with custom name should be outputted"); | |
133 | remove("some_custom_name"); | |
134 | } | |
135 | END_TEST | |
136 | ||
137 | START_TEST (valid_with_phylip_outputted_with_custom_name) | |
138 | { | |
139 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",0,0,1,"some_custom_name"); | |
140 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "some_custom_name" ) == 1, "Custom name needs extra extension for phylip" ); | |
141 | remove("some_custom_name"); | |
142 | ||
143 | } | |
144 | END_TEST | |
145 | ||
146 | START_TEST (invalid_with_uneven_file_lengths) | |
147 | { | |
148 | generate_snp_sites("../tests/data/uneven_alignment.aln",1,0,0,""); | |
149 | } | |
150 | END_TEST | |
151 | ||
152 | ||
153 | START_TEST (valid_with_all_outputted_with_custom_name) | |
154 | { | |
155 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",1,1,1,"some_custom_name"); | |
156 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "some_custom_name.vcf" ) == 1, "Custom name needs extra extension for VCF" ); | |
157 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "some_custom_name.phylip" ) == 1, "Custom name needs extra extension for phylip" ); | |
158 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","some_custom_name.snp_sites.aln" ) == 1 , "Custom name needs extra extension for ALN"); | |
159 | remove("some_custom_name.vcf"); | |
160 | remove("some_custom_name.phylip"); | |
161 | remove("some_custom_name.snp_sites.aln"); | |
162 | } | |
163 | END_TEST | |
164 | ||
165 | ||
166 | START_TEST (valid_aln_plus_reference) | |
167 | { | |
168 | generate_snp_sites_with_ref("../tests/data/alignment_file_one_line_per_sequence.aln",1,0,0,""); | |
169 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence_reference.snp_sites.aln","alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq"); | |
170 | remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln"); | |
171 | } | |
172 | END_TEST | |
173 | ||
174 | START_TEST (valid_phylip_plus_reference) | |
175 | { | |
176 | generate_snp_sites_with_ref("../tests/data/alignment_file_one_line_per_sequence.aln",0,0,1,""); | |
177 | fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence_reference.aln.phylip", "alignment_file_one_line_per_sequence.aln.phylip" ) == 1, "invalid phylib with reference" ); | |
178 | remove("alignment_file_one_line_per_sequence.aln.phylip"); | |
179 | ||
180 | } | |
181 | END_TEST | |
182 | ||
183 | START_TEST (valid_genome_length) | |
184 | { | |
185 | detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln",0,0); | |
186 | fail_unless( get_length_of_genome() == 2000 ); | |
187 | } | |
188 | END_TEST | |
189 | ||
190 | START_TEST (valid_genome_length_with_multiple_lines_per_sequence) | |
191 | { | |
192 | detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0); | |
193 | fail_unless( get_length_of_genome() == 2000 ); | |
194 | } | |
195 | END_TEST | |
196 | ||
197 | START_TEST (valid_number_of_sequences_in_file) | |
198 | { | |
199 | detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln",0,0); | |
200 | fail_unless( get_number_of_samples() == 109 ); | |
201 | } | |
202 | END_TEST | |
203 | ||
204 | START_TEST (valid_number_of_sequences_in_file_with_multiple_lines_per_sequence) | |
205 | { | |
206 | detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0); | |
207 | fail_unless( get_number_of_samples() == 109 ); | |
208 | } | |
209 | END_TEST | |
210 | ||
211 | START_TEST (number_of_snps_detected) | |
212 | { | |
213 | detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0); | |
214 | fail_unless( get_number_of_snps() == 5); | |
215 | } | |
216 | END_TEST | |
217 | ||
218 | START_TEST (number_of_snps_detected_small) | |
219 | { | |
220 | detect_snps("../tests/data/small_alignment.aln",0,0); | |
221 | fail_unless( get_number_of_snps() == 1); | |
222 | } | |
223 | END_TEST | |
224 | ||
225 | START_TEST (detect_snps_pure_mode) | |
226 | { | |
227 | detect_snps("../tests/data/pure_mode_alignment.aln",1,0); | |
228 | fail_unless( get_number_of_snps() == 2); | |
229 | } | |
230 | END_TEST | |
231 | ||
232 | ||
233 | START_TEST (detect_snps_pure_mode_monomorphic) | |
234 | { | |
235 | detect_snps("../tests/data/pure_mode_monomorphic_alignment.aln",1,1); | |
236 | fail_unless( get_number_of_snps() == 3); | |
237 | } | |
238 | END_TEST | |
239 | ||
240 | START_TEST (sample_names_from_alignment_file) | |
241 | { | |
242 | detect_snps("../tests/data/small_alignment.aln",0,0); | |
243 | char ** current_sequence_names = get_sequence_names(); | |
244 | ||
245 | fail_unless(strcmp(current_sequence_names[0],"reference_sequence") == 0); | |
246 | fail_unless(strcmp(current_sequence_names[1],"comparison_sequence") == 0); | |
247 | fail_unless(strcmp(current_sequence_names[2],"another_comparison_sequence") == 0); | |
248 | } | |
249 | END_TEST | |
250 | ||
251 | START_TEST (check_strip_directory_from_filename_without_directory) | |
252 | { | |
253 | char *input_filename_without_directory = "my_file_name.aln"; | |
254 | char output_filename[30]; | |
255 | memset(output_filename,'\0',30); | |
256 | strip_directory_from_filename(input_filename_without_directory, output_filename); | |
257 | fail_unless( strcmp(input_filename_without_directory, output_filename) ==0 ); | |
258 | } | |
259 | END_TEST | |
260 | ||
261 | START_TEST (check_strip_directory_from_filename_with_directory) | |
262 | { | |
263 | char *input_filename_without_directory = "/some/directory/name/my_file_name.aln"; | |
264 | char output_filename[30]; | |
265 | memset(output_filename,'\0',30); | |
266 | strip_directory_from_filename(input_filename_without_directory, output_filename); | |
267 | fail_unless( strcmp("my_file_name.aln", output_filename) ==0 ); | |
268 | } | |
269 | END_TEST | |
270 | ||
271 | START_TEST (check_count_constant_sites) | |
139 | ||
140 | END_TEST | |
141 | ||
142 | ||
143 | START_TEST(valid_with_only_aln_file_output_default) | |
144 | { | |
145 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0, 0, ""); | |
146 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", | |
147 | "alignment_file_one_line_per_sequence.aln.snp_sites.aln") == 1, | |
148 | "Invalid ALN file for multiple lines per seq"); | |
149 | remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln"); | |
150 | } | |
151 | ||
152 | END_TEST | |
153 | ||
154 | START_TEST(valid_with_only_aln_file_output) | |
155 | { | |
156 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 1, 0, 0, ""); | |
157 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", | |
158 | "alignment_file_one_line_per_sequence.aln.snp_sites.aln") == 1, | |
159 | "Invalid ALN file for multiple lines per seq"); | |
160 | remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln"); | |
161 | } | |
162 | ||
163 | END_TEST | |
164 | ||
165 | START_TEST(valid_with_only_aln_file_output_with_custom_name) | |
166 | { | |
167 | char output_filename[100] = {"some_custom_name"}; | |
168 | generate_snp_sites("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 1, 0, 0, output_filename); | |
169 | fail_unless( | |
170 | compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", "some_custom_name") == | |
171 | 1, "Only aln with custom name should be outputted"); | |
172 | remove("some_custom_name"); | |
173 | } | |
174 | ||
175 | END_TEST | |
176 | ||
177 | START_TEST(valid_with_phylip_outputted_with_custom_name) | |
178 | { | |
179 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0, 1, "some_custom_name"); | |
180 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "some_custom_name") == 1, | |
181 | "Custom name needs extra extension for phylip"); | |
182 | remove("some_custom_name"); | |
183 | ||
184 | } | |
185 | ||
186 | END_TEST | |
187 | ||
188 | START_TEST(invalid_with_uneven_file_lengths) | |
189 | { | |
190 | generate_snp_sites("../tests/data/uneven_alignment.aln", 1, 0, 0, ""); | |
191 | } | |
192 | ||
193 | END_TEST | |
194 | ||
195 | ||
196 | START_TEST(valid_with_all_outputted_with_custom_name) | |
197 | { | |
198 | generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 1, 1, 1, "some_custom_name"); | |
199 | fail_unless( | |
200 | compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "some_custom_name.vcf") == 1, | |
201 | "Custom name needs extra extension for VCF"); | |
202 | fail_unless( | |
203 | compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "some_custom_name.phylip") == | |
204 | 1, "Custom name needs extra extension for phylip"); | |
205 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", | |
206 | "some_custom_name.snp_sites.aln") == 1, "Custom name needs extra extension for ALN"); | |
207 | remove("some_custom_name.vcf"); | |
208 | remove("some_custom_name.phylip"); | |
209 | remove("some_custom_name.snp_sites.aln"); | |
210 | } | |
211 | ||
212 | END_TEST | |
213 | ||
214 | ||
215 | START_TEST(valid_aln_plus_reference) | |
216 | { | |
217 | generate_snp_sites_with_ref("../tests/data/alignment_file_one_line_per_sequence.aln", 1, 0, 0, ""); | |
218 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence_reference.snp_sites.aln", | |
219 | "alignment_file_one_line_per_sequence.aln.snp_sites.aln") == 1, | |
220 | "Invalid ALN file for multiple lines per seq"); | |
221 | remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln"); | |
222 | } | |
223 | ||
224 | END_TEST | |
225 | ||
226 | START_TEST(valid_phylip_plus_reference) | |
227 | { | |
228 | generate_snp_sites_with_ref("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0, 1, ""); | |
229 | fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence_reference.aln.phylip", | |
230 | "alignment_file_one_line_per_sequence.aln.phylip") == 1, "invalid phylib with reference"); | |
231 | remove("alignment_file_one_line_per_sequence.aln.phylip"); | |
232 | ||
233 | } | |
234 | ||
235 | END_TEST | |
236 | ||
237 | START_TEST(valid_genome_length) | |
238 | { | |
239 | detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0); | |
240 | fail_unless(get_length_of_genome() == 2000); | |
241 | } | |
242 | ||
243 | END_TEST | |
244 | ||
245 | START_TEST(valid_genome_length_with_multiple_lines_per_sequence) | |
246 | { | |
247 | detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 0, 0); | |
248 | fail_unless(get_length_of_genome() == 2000); | |
249 | } | |
250 | ||
251 | END_TEST | |
252 | ||
253 | START_TEST(valid_number_of_sequences_in_file) | |
254 | { | |
255 | detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0); | |
256 | fail_unless(get_number_of_samples() == 109); | |
257 | } | |
258 | ||
259 | END_TEST | |
260 | ||
261 | START_TEST(valid_number_of_sequences_in_file_with_multiple_lines_per_sequence) | |
262 | { | |
263 | detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 0, 0); | |
264 | fail_unless(get_number_of_samples() == 109); | |
265 | } | |
266 | ||
267 | END_TEST | |
268 | ||
269 | START_TEST(number_of_snps_detected) | |
270 | { | |
271 | detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 0, 0); | |
272 | fail_unless(get_number_of_snps() == 5); | |
273 | } | |
274 | ||
275 | END_TEST | |
276 | ||
277 | START_TEST(number_of_snps_detected_small) | |
278 | { | |
279 | detect_snps("../tests/data/small_alignment.aln", 0, 0); | |
280 | fail_unless(get_number_of_snps() == 1); | |
281 | } | |
282 | ||
283 | END_TEST | |
284 | ||
285 | START_TEST(detect_snps_pure_mode) | |
286 | { | |
287 | detect_snps("../tests/data/pure_mode_alignment.aln", 1, 0); | |
288 | fail_unless(get_number_of_snps() == 2); | |
289 | } | |
290 | ||
291 | END_TEST | |
292 | ||
293 | ||
294 | START_TEST(detect_snps_pure_mode_monomorphic) | |
295 | { | |
296 | detect_snps("../tests/data/pure_mode_monomorphic_alignment.aln", 1, 1); | |
297 | fail_unless(get_number_of_snps() == 3); | |
298 | } | |
299 | ||
300 | END_TEST | |
301 | ||
302 | START_TEST(sample_names_from_alignment_file) | |
303 | { | |
304 | detect_snps("../tests/data/small_alignment.aln", 0, 0); | |
305 | char **current_sequence_names = get_sequence_names(); | |
306 | ||
307 | fail_unless(strcmp(current_sequence_names[0], "reference_sequence") == 0); | |
308 | fail_unless(strcmp(current_sequence_names[1], "comparison_sequence") == 0); | |
309 | fail_unless(strcmp(current_sequence_names[2], "another_comparison_sequence") == 0); | |
310 | } | |
311 | ||
312 | END_TEST | |
313 | ||
314 | START_TEST(check_strip_directory_from_filename_without_directory) | |
315 | { | |
316 | char *input_filename_without_directory = "my_file_name.aln"; | |
317 | char output_filename[30]; | |
318 | memset(output_filename, '\0', 30); | |
319 | strip_directory_from_filename(input_filename_without_directory, output_filename); | |
320 | fail_unless(strcmp(input_filename_without_directory, output_filename) == 0); | |
321 | } | |
322 | ||
323 | END_TEST | |
324 | ||
325 | START_TEST(check_strip_directory_from_filename_with_directory) | |
326 | { | |
327 | char *input_filename_without_directory = "/some/directory/name/my_file_name.aln"; | |
328 | char output_filename[30]; | |
329 | memset(output_filename, '\0', 30); | |
330 | strip_directory_from_filename(input_filename_without_directory, output_filename); | |
331 | fail_unless(strcmp("my_file_name.aln", output_filename) == 0); | |
332 | } | |
333 | ||
334 | END_TEST | |
335 | ||
336 | START_TEST(check_count_constant_sites) | |
272 | 337 | { |
273 | 338 | count_constant_sites("../tests/data/small_alignment.aln", "small_alignment.constant_site_counts.txt"); |
274 | fail_unless(compare_files("../tests/data/small_alignment.constant_site_counts.txt", "small_alignment.constant_site_counts.txt")); | |
339 | fail_unless(compare_files("../tests/data/small_alignment.constant_site_counts.txt", | |
340 | "small_alignment.constant_site_counts.txt")); | |
275 | 341 | remove("small_alignment.constant_site_counts.txt"); |
276 | 342 | } |
277 | END_TEST | |
278 | ||
279 | Suite * snp_sites_suite (void) | |
280 | { | |
281 | Suite *s = suite_create ("Creating_SNP_Sites"); | |
282 | ||
283 | TCase *tc_alignment_file = tcase_create ("alignment_file"); | |
284 | tcase_add_test (tc_alignment_file, valid_genome_length); | |
285 | tcase_add_test (tc_alignment_file, valid_genome_length_with_multiple_lines_per_sequence); | |
286 | tcase_add_test (tc_alignment_file, valid_number_of_sequences_in_file); | |
287 | tcase_add_test (tc_alignment_file, valid_number_of_sequences_in_file_with_multiple_lines_per_sequence); | |
288 | tcase_add_test (tc_alignment_file, number_of_snps_detected_small); | |
289 | tcase_add_test (tc_alignment_file, number_of_snps_detected); | |
290 | tcase_add_test (tc_alignment_file, sample_names_from_alignment_file); | |
291 | tcase_add_test (tc_alignment_file, detect_snps_pure_mode); | |
292 | tcase_add_test (tc_alignment_file, check_strip_directory_from_filename_without_directory); | |
293 | tcase_add_test (tc_alignment_file, check_strip_directory_from_filename_with_directory); | |
294 | tcase_add_test (tc_alignment_file, detect_snps_pure_mode_monomorphic); | |
295 | ||
296 | suite_add_tcase (s, tc_alignment_file); | |
297 | ||
298 | TCase *tc_snp_sites = tcase_create ("snp_sites"); | |
299 | tcase_add_test (tc_snp_sites, valid_alignment_with_one_line_per_sequence); | |
300 | tcase_add_test (tc_snp_sites, valid_alignment_with_multiple_lines_per_sequence); | |
301 | tcase_add_test (tc_snp_sites, valid_alignment_with_one_line_per_sequence_gzipped); | |
302 | tcase_add_test (tc_snp_sites, valid_alignment_with_n_as_gap); | |
303 | tcase_add_test (tc_snp_sites, valid_with_only_aln_file_output); | |
304 | tcase_add_test (tc_snp_sites, valid_with_only_aln_file_output_with_custom_name); | |
305 | tcase_add_test (tc_snp_sites, valid_with_all_outputted_with_custom_name); | |
306 | tcase_add_test (tc_snp_sites, valid_with_only_aln_file_output_default); | |
307 | tcase_add_test (tc_snp_sites, valid_with_phylip_outputted_with_custom_name); | |
308 | tcase_add_test (tc_snp_sites, valid_aln_plus_reference); | |
309 | tcase_add_test (tc_snp_sites, valid_phylip_plus_reference); | |
310 | tcase_add_test (tc_snp_sites, valid_alignment_with_pure_mode); | |
311 | tcase_add_test (tc_snp_sites, valid_alignment_with_monomorphic_sites); | |
312 | tcase_add_test (tc_snp_sites, check_count_constant_sites); | |
313 | ||
314 | tcase_add_exit_test(tc_snp_sites, invalid_with_uneven_file_lengths,EXIT_FAILURE); | |
315 | remove("uneven_alignment.aln.snp_sites.aln"); | |
316 | suite_add_tcase (s, tc_snp_sites); | |
317 | ||
318 | return s; | |
319 | } | |
320 | ||
343 | ||
344 | END_TEST | |
345 | ||
346 | Suite | |
347 | * | |
348 | ||
349 | snp_sites_suite(void) | |
350 | { | |
351 | Suite * s = suite_create("Creating_SNP_Sites"); | |
352 | ||
353 | TCase *tc_alignment_file = tcase_create("alignment_file"); | |
354 | tcase_add_test(tc_alignment_file, valid_genome_length); | |
355 | tcase_add_test(tc_alignment_file, valid_genome_length_with_multiple_lines_per_sequence); | |
356 | tcase_add_test(tc_alignment_file, valid_number_of_sequences_in_file); | |
357 | tcase_add_test(tc_alignment_file, valid_number_of_sequences_in_file_with_multiple_lines_per_sequence); | |
358 | tcase_add_test(tc_alignment_file, number_of_snps_detected_small); | |
359 | tcase_add_test(tc_alignment_file, number_of_snps_detected); | |
360 | tcase_add_test(tc_alignment_file, sample_names_from_alignment_file); | |
361 | tcase_add_test(tc_alignment_file, detect_snps_pure_mode); | |
362 | tcase_add_test(tc_alignment_file, check_strip_directory_from_filename_without_directory); | |
363 | tcase_add_test(tc_alignment_file, check_strip_directory_from_filename_with_directory); | |
364 | tcase_add_test(tc_alignment_file, detect_snps_pure_mode_monomorphic); | |
365 | ||
366 | suite_add_tcase(s, tc_alignment_file); | |
367 | ||
368 | TCase *tc_snp_sites = tcase_create("snp_sites"); | |
369 | tcase_add_test(tc_snp_sites, valid_alignment_with_one_line_per_sequence); | |
370 | tcase_add_test(tc_snp_sites, valid_alignment_with_multiple_lines_per_sequence); | |
371 | tcase_add_test(tc_snp_sites, valid_alignment_with_one_line_per_sequence_gzipped); | |
372 | tcase_add_test(tc_snp_sites, valid_alignment_with_n_as_gap); | |
373 | tcase_add_test(tc_snp_sites, valid_with_only_aln_file_output); | |
374 | tcase_add_test(tc_snp_sites, valid_with_only_aln_file_output_with_custom_name); | |
375 | tcase_add_test(tc_snp_sites, valid_with_all_outputted_with_custom_name); | |
376 | tcase_add_test(tc_snp_sites, valid_with_only_aln_file_output_default); | |
377 | tcase_add_test(tc_snp_sites, valid_with_phylip_outputted_with_custom_name); | |
378 | tcase_add_test(tc_snp_sites, valid_aln_plus_reference); | |
379 | tcase_add_test(tc_snp_sites, valid_phylip_plus_reference); | |
380 | tcase_add_test(tc_snp_sites, valid_alignment_with_pure_mode); | |
381 | tcase_add_test(tc_snp_sites, valid_alignment_with_monomorphic_sites); | |
382 | tcase_add_test(tc_snp_sites, check_count_constant_sites); | |
383 | ||
384 | tcase_add_exit_test(tc_snp_sites, invalid_with_uneven_file_lengths, EXIT_FAILURE); | |
385 | remove("uneven_alignment.aln.snp_sites.aln"); | |
386 | suite_add_tcase(s, tc_snp_sites); | |
387 | ||
388 | return s; | |
389 | } | |
390 |
19 | 19 | #ifndef _CHECK_SNP_SITES_H_ |
20 | 20 | #define _CHECK_SNP_SITES_H_ |
21 | 21 | |
22 | Suite * snp_sites_suite (void); | |
22 | Suite *snp_sites_suite(void); | |
23 | ||
23 | 24 | #endif |
24 | 25 | |
25 | 26 |
27 | 27 | #include "check-vcf.h" |
28 | 28 | #include "vcf.h" |
29 | 29 | |
30 | void check_alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples, char * expected_result) | |
30 | void check_alternative_bases(char reference_base, char *bases_for_snp, int number_of_samples, char *expected_result) | |
31 | 31 | { |
32 | char * result; | |
33 | result = alternative_bases(reference_base, bases_for_snp, number_of_samples); | |
34 | ck_assert_str_eq(result, expected_result); | |
35 | free(result); | |
32 | char *result; | |
33 | result = alternative_bases(reference_base, bases_for_snp, number_of_samples); | |
34 | ck_assert_str_eq(result, expected_result); | |
35 | free(result); | |
36 | 36 | } |
37 | 37 | |
38 | 38 | START_TEST (alternative_bases_test) |
39 | { | |
40 | check_alternative_bases('A', "AGCT-nN", 6, "GCT*"); | |
41 | } | |
39 | { | |
40 | check_alternative_bases('A', "AGCT-nN", 6, "GCT*"); | |
41 | } | |
42 | ||
42 | 43 | END_TEST |
43 | 44 | |
44 | void check_format_alternative_bases(char * test_case, char * expected_result) | |
45 | void check_format_alternative_bases(char *test_case, char *expected_result) | |
45 | 46 | { |
46 | char * result; | |
47 | result = format_alternative_bases(test_case); | |
48 | ck_assert_str_eq(result, expected_result); | |
49 | free(result); | |
47 | char *result; | |
48 | result = format_alternative_bases(test_case); | |
49 | ck_assert_str_eq(result, expected_result); | |
50 | free(result); | |
50 | 51 | } |
51 | 52 | |
52 | 53 | START_TEST (format_alternative_bases_test) |
53 | { | |
54 | check_format_alternative_bases("", "."); | |
55 | check_format_alternative_bases("A", "A"); | |
56 | check_format_alternative_bases("AC", "A,C"); | |
57 | check_format_alternative_bases("ACT", "A,C,T"); | |
58 | } | |
54 | { | |
55 | check_format_alternative_bases("", "."); | |
56 | check_format_alternative_bases("A", "A"); | |
57 | check_format_alternative_bases("AC", "A,C"); | |
58 | check_format_alternative_bases("ACT", "A,C,T"); | |
59 | } | |
60 | ||
59 | 61 | END_TEST |
60 | 62 | |
61 | void check_format_allele_index(char test_base, char reference_base, char * alt_bases, char * expected_result) | |
63 | void check_format_allele_index(char test_base, char reference_base, char *alt_bases, char *expected_result) | |
62 | 64 | { |
63 | char * result; | |
64 | result = format_allele_index(test_base, reference_base, alt_bases); | |
65 | ck_assert_str_eq(result, expected_result); | |
66 | free(result); | |
65 | char *result; | |
66 | result = format_allele_index(test_base, reference_base, alt_bases); | |
67 | ck_assert_str_eq(result, expected_result); | |
68 | free(result); | |
67 | 69 | } |
68 | 70 | |
69 | 71 | START_TEST (format_allele_index_test) |
70 | { | |
71 | check_format_allele_index('A', 'A', "", "0"); | |
72 | check_format_allele_index('A', 'A', "C", "0"); | |
73 | check_format_allele_index('A', 'A', "CA", "0"); | |
72 | { | |
73 | check_format_allele_index('A', 'A', "", "0"); | |
74 | check_format_allele_index('A', 'A', "C", "0"); | |
75 | check_format_allele_index('A', 'A', "CA", "0"); | |
74 | 76 | |
75 | check_format_allele_index('A', 'C', "A", "1"); | |
76 | check_format_allele_index('A', 'C', "GA", "2"); | |
77 | check_format_allele_index('A', 'C', "A", "1"); | |
78 | check_format_allele_index('A', 'C', "GA", "2"); | |
77 | 79 | |
78 | check_format_allele_index('A', 'C', "", "."); | |
79 | check_format_allele_index('A', 'C', "G", "."); | |
80 | check_format_allele_index('A', 'C', "", "."); | |
81 | check_format_allele_index('A', 'C', "G", "."); | |
80 | 82 | |
81 | check_format_allele_index('A', 'B', "CDEFGHIJKLMNOPAQRST", "15"); | |
83 | check_format_allele_index('A', 'B', "CDEFGHIJKLMNOPAQRST", "15"); | |
82 | 84 | |
83 | check_format_allele_index('-', 'A', "C*", "2"); | |
84 | check_format_allele_index('N', 'A', "C*", "2"); | |
85 | check_format_allele_index('n', 'A', "C*", "2"); | |
86 | } | |
85 | check_format_allele_index('-', 'A', "C*", "2"); | |
86 | check_format_allele_index('N', 'A', "C*", "2"); | |
87 | check_format_allele_index('n', 'A', "C*", "2"); | |
88 | } | |
87 | 89 | END_TEST |
88 | 90 | |
89 | Suite * vcf_suite (void) | |
91 | Suite | |
92 | * | |
93 | ||
94 | vcf_suite(void) | |
90 | 95 | { |
91 | Suite *s = suite_create ("Creating_VCF_file"); | |
96 | Suite * s = suite_create("Creating_VCF_file"); | |
92 | 97 | |
93 | TCase *tc_vcf_file = tcase_create ("vcf_file"); | |
94 | tcase_add_test (tc_vcf_file, alternative_bases_test); | |
95 | tcase_add_test (tc_vcf_file, format_alternative_bases_test); | |
96 | tcase_add_test (tc_vcf_file, format_allele_index_test); | |
97 | suite_add_tcase (s, tc_vcf_file); | |
98 | TCase *tc_vcf_file = tcase_create("vcf_file"); | |
99 | tcase_add_test(tc_vcf_file, alternative_bases_test); | |
100 | tcase_add_test(tc_vcf_file, format_alternative_bases_test); | |
101 | tcase_add_test(tc_vcf_file, format_allele_index_test); | |
102 | suite_add_tcase(s, tc_vcf_file); | |
98 | 103 | |
99 | return s; | |
104 | return s; | |
100 | 105 | } |
101 | 106 | |
102 | 107 |
20 | 20 | #define _CHECK_VCF_H_ |
21 | 21 | |
22 | 22 | void check_alternative_bases(char, char *, int, char *); |
23 | ||
23 | 24 | void check_format_alternative_bases(char *, char *); |
25 | ||
24 | 26 | void check_format_allele_index(char, char, char *, char *); |
25 | Suite * vcf_suite (void); | |
27 | ||
28 | Suite *vcf_suite(void); | |
29 | ||
26 | 30 | #endif |
27 | 31 | |
28 | 32 |
26 | 26 | #include "helper-methods.h" |
27 | 27 | |
28 | 28 | |
29 | int compare_files(char expected_output_filename[],char actual_output_filename[] ) | |
29 | int compare_files(char expected_output_filename[], char actual_output_filename[]) | |
30 | 30 | { |
31 | FILE *expected_output_fh; | |
32 | FILE *actual_output_fh; | |
33 | ||
34 | char *expected_buffer; | |
35 | char *actual_buffer; | |
36 | long numbytes; | |
31 | FILE *expected_output_fh; | |
32 | FILE *actual_output_fh; | |
37 | 33 | |
38 | size_t fsize_expected; | |
39 | size_t fsize_actual; | |
40 | ||
41 | expected_output_fh = fopen(expected_output_filename, "r"); | |
42 | actual_output_fh = fopen(actual_output_filename, "r"); | |
43 | ||
44 | fseek(expected_output_fh, 0L, SEEK_END); | |
45 | numbytes = ftell(expected_output_fh); | |
46 | fseek(expected_output_fh, 0L, SEEK_SET); | |
47 | expected_buffer = (char*)calloc(numbytes +1, sizeof(char)); | |
48 | fsize_expected = fread(expected_buffer, sizeof(char), numbytes, expected_output_fh); | |
49 | fclose(expected_output_fh); | |
50 | ||
51 | fseek(actual_output_fh, 0L, SEEK_END); | |
52 | numbytes = ftell(actual_output_fh); | |
53 | fseek(actual_output_fh, 0L, SEEK_SET); | |
54 | actual_buffer = (char*)calloc(numbytes +1, sizeof(char)); | |
55 | fsize_actual = fread(actual_buffer, sizeof(char), numbytes, actual_output_fh); | |
56 | fclose(actual_output_fh); | |
57 | ||
58 | if(strcmp(expected_buffer,actual_buffer) == 0) | |
59 | { | |
34 | char *expected_buffer; | |
35 | char *actual_buffer; | |
36 | long numbytes; | |
37 | ||
38 | size_t fsize_expected; | |
39 | size_t fsize_actual; | |
40 | ||
41 | expected_output_fh = fopen(expected_output_filename, "r"); | |
42 | actual_output_fh = fopen(actual_output_filename, "r"); | |
43 | ||
44 | fseek(expected_output_fh, 0L, SEEK_END); | |
45 | numbytes = ftell(expected_output_fh); | |
46 | fseek(expected_output_fh, 0L, SEEK_SET); | |
47 | expected_buffer = (char *) calloc(numbytes + 1, sizeof(char)); | |
48 | fsize_expected = fread(expected_buffer, sizeof(char), numbytes, expected_output_fh); | |
49 | fclose(expected_output_fh); | |
50 | ||
51 | fseek(actual_output_fh, 0L, SEEK_END); | |
52 | numbytes = ftell(actual_output_fh); | |
53 | fseek(actual_output_fh, 0L, SEEK_SET); | |
54 | actual_buffer = (char *) calloc(numbytes + 1, sizeof(char)); | |
55 | fsize_actual = fread(actual_buffer, sizeof(char), numbytes, actual_output_fh); | |
56 | fclose(actual_output_fh); | |
57 | ||
58 | if (strcmp(expected_buffer, actual_buffer) == 0) { | |
59 | free(expected_buffer); | |
60 | free(actual_buffer); | |
61 | return 1; | |
62 | } | |
63 | ||
60 | 64 | free(expected_buffer); |
61 | 65 | free(actual_buffer); |
62 | return 1; | |
63 | } | |
64 | 66 | |
65 | free(expected_buffer); | |
66 | free(actual_buffer); | |
67 | ||
68 | return 0; | |
67 | return 0; | |
69 | 68 | } |
19 | 19 | #ifndef _HELPER_METHODS_H_ |
20 | 20 | #define _HELPER_METHODS_H_ |
21 | 21 | |
22 | int compare_files(char expected_output_filename[],char actual_output_filename[] ); | |
22 | int compare_files(char expected_output_filename[], char actual_output_filename[]); | |
23 | ||
23 | 24 | #endif |
24 | 25 |
24 | 24 | #include "check-vcf.h" |
25 | 25 | |
26 | 26 | |
27 | int main(void) | |
28 | { | |
29 | int number_failed; | |
30 | Suite *s; | |
31 | SRunner *sr; | |
27 | 32 | |
28 | int main (void) | |
29 | { | |
30 | int number_failed; | |
31 | Suite *s; | |
32 | SRunner *sr; | |
33 | s = snp_sites_suite(); | |
34 | sr = srunner_create(s); | |
35 | srunner_run_all(sr, CK_NORMAL); | |
36 | number_failed = srunner_ntests_failed(sr); | |
37 | srunner_free(sr); | |
33 | 38 | |
34 | s = snp_sites_suite (); | |
35 | sr = srunner_create (s); | |
36 | srunner_run_all (sr, CK_NORMAL); | |
37 | number_failed = srunner_ntests_failed (sr); | |
38 | srunner_free (sr); | |
39 | s = vcf_suite(); | |
40 | sr = srunner_create(s); | |
41 | srunner_run_all(sr, CK_NORMAL); | |
42 | number_failed += srunner_ntests_failed(sr); | |
43 | srunner_free(sr); | |
39 | 44 | |
40 | s = vcf_suite (); | |
41 | sr = srunner_create (s); | |
42 | srunner_run_all (sr, CK_NORMAL); | |
43 | number_failed += srunner_ntests_failed (sr); | |
44 | srunner_free (sr); | |
45 | ||
46 | return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; | |
45 | return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; | |
47 | 46 | } |