Codebase list snp-sites / fresh-snapshots/main
New upstream snapshot. Debian Janitor 4 years ago
22 changed file(s) with 1281 addition(s) and 1198 deletion(s). Raw diff Collapse all Expand all
44 addons:
55 apt:
66 packages:
7 - check
7 - check
88 before_install:
9 - sudo apt-get update -qq
10 script:
11 - autoreconf -i && ./configure --enable-maintainer-mode CFLAGS='-O0 --coverage' && make && make check
9 - sudo apt-get update -qq
10 script:
11 - autoreconf -i && ./configure --enable-maintainer-mode CFLAGS='-O0 --coverage' && make && make check
1212 after_success:
1313 - bash <(curl -s https://codecov.io/bash)
00 Andrew J. Page <ap13@sanger.ac.uk>
11 Ben Taylor <path-help@sanger.ac.uk>
22 Jorge Soares <path-help@sanger.ac.uk>
3 Peter van Heusden
3 Peter van Heusden <pvh@sanbi.ac.za>
0 snp-sites (2.5.1+git20191111.a274ec4-1) UNRELEASED; urgency=medium
1
2 * New upstream snapshot.
3
4 -- Debian Janitor <janitor@jelmer.uk> Thu, 30 Jan 2020 10:21:48 +0000
5
06 snp-sites (2.5.1-1) unstable; urgency=medium
17
28 [ Andreas Tille ]
3333 int length_of_genome;
3434 int number_of_samples;
3535 int number_of_snps;
36 char ** sequence_names;
37 int * snp_locations;
38 char * pseudo_reference_sequence;
36 char **sequence_names;
37 int *snp_locations;
38 char *pseudo_reference_sequence;
3939
4040 int get_length_of_genome()
4141 {
5252 return number_of_snps;
5353 }
5454
55 char ** get_sequence_names()
55 char **get_sequence_names()
5656 {
5757 return sequence_names;
5858 }
5959
60 int * get_snp_locations()
60 int *get_snp_locations()
6161 {
6262 return snp_locations;
6363 }
6464
65 char * get_pseudo_reference_sequence()
65 char *get_pseudo_reference_sequence()
6666 {
6767 return pseudo_reference_sequence;
6868 }
6969
70 void get_bases_for_each_snp(char filename[], char ** bases_for_snps)
71 {
72 int l;
73 int i = 0;
74 int sequence_number = 0;
75 size_t length_of_genome_found =0;
76
77 gzFile fp;
78 kseq_t *seq;
79
80 fp = gzopen(filename, "r");
81 seq = kseq_init(fp);
82
83 while ((l = kseq_read(seq)) >= 0)
84 {
85 if(sequence_number == 0)
86 {
87 length_of_genome_found = seq->seq.l;
88 }
89 for(i = 0; i< number_of_snps; i++)
90 {
91 bases_for_snps[i][sequence_number] = toupper(((char *) seq->seq.s)[snp_locations[i]]);
92 }
93
94 if(seq->seq.l != length_of_genome_found)
95 {
96 fprintf(stderr, "Alignment %s contains sequences of unequal length. Expected length is %i but got %i in sequence %s\n\n",filename, (int) length_of_genome_found, (int) seq->seq.l,seq->name.s);
97 fflush(stderr);
98 exit(EXIT_FAILURE);
99 }
100 sequence_number++;
101 }
102
103 kseq_destroy(seq);
104 gzclose(fp);
105 }
106
107 void detect_snps(char filename[], int pure_mode, int output_monomorphic) {
70 void get_bases_for_each_snp(char filename[], char **bases_for_snps)
71 {
72 int l;
73 int i = 0;
74 int sequence_number = 0;
75 size_t length_of_genome_found = 0;
76
77 gzFile fp;
78 kseq_t *seq;
79
80 fp = gzopen(filename, "r");
81 seq = kseq_init(fp);
82
83 while ((l = kseq_read(seq)) >= 0) {
84 if (sequence_number == 0) {
85 length_of_genome_found = seq->seq.l;
86 }
87 for (i = 0; i < number_of_snps; i++) {
88 bases_for_snps[i][sequence_number] = toupper(((char *) seq->seq.s)[snp_locations[i]]);
89 }
90
91 if (seq->seq.l != length_of_genome_found) {
92 fprintf(stderr,
93 "Alignment %s contains sequences of unequal length. Expected length is %i but got %i in sequence %s\n\n",
94 filename, (int) length_of_genome_found, (int) seq->seq.l, seq->name.s);
95 fflush(stderr);
96 exit(EXIT_FAILURE);
97 }
98 sequence_number++;
99 }
100
101 kseq_destroy(seq);
102 gzclose(fp);
103 }
104
105 void detect_snps(char filename[], int pure_mode, int output_monomorphic)
106 {
108107 detect_snps_count_constant_sites(filename, pure_mode, output_monomorphic, NULL);
109108 }
110109
111 void detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int* constant_site_counts)
112 {
113 int i;
114 int l;
115 number_of_snps = 0;
116 number_of_samples = 0;
117 length_of_genome = 0;
118 char * first_sequence;
119 /* array below allows quick mapping of A, C, T and G characters to indices in base_counts array */
120 const int char_to_base_count_index[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3};
110 void
111 detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts)
112 {
113 int i;
114 int l;
115 number_of_snps = 0;
116 number_of_samples = 0;
117 length_of_genome = 0;
118 char *first_sequence;
119 /* array below allows quick mapping of A, C, T and G characters to indices in base_counts array */
120 const int char_to_base_count_index[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
121 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
122 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
123 -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1,
124 -1, -1, -1, -1, -1, -1, -1, 3};
121125
122126
123127 gzFile fp;
124 kseq_t *seq;
125
126 fp = gzopen(filename, "r");
127 seq = kseq_init(fp);
128
129 sequence_names = calloc(DEFAULT_NUM_SAMPLES, sizeof(char*));
130
131 while ((l = kseq_read(seq)) >= 0) {
132 if(number_of_samples == 0)
133 {
134 length_of_genome = seq->seq.l;
135 first_sequence = calloc(length_of_genome + 1, sizeof(char));
136 pseudo_reference_sequence = calloc(length_of_genome + 1, sizeof(char));
137
138 memset(first_sequence, 'N', length_of_genome);
139 memset(pseudo_reference_sequence, 'N', length_of_genome);
140 }
141 for(i = 0; i < length_of_genome; i++)
142 {
143 if(first_sequence[i] == '#')
144 {
145 continue;
146 }
147
148 if(first_sequence[i] == 'N' && !is_unknown(seq->seq.s[i]))
149 {
150 first_sequence[i] = toupper(seq->seq.s[i]);
151 pseudo_reference_sequence[i] = toupper(seq->seq.s[i]);
152 }
153
154 // in pure mode we only want /ACGT/i, if any other base is found the whole column is excluded
155 if(pure_mode && !is_pure(seq->seq.s[i]))
156 {
157 first_sequence[i] = '#';
158 continue;
159 }
160
161 if(first_sequence[i] != '>' && !is_unknown(seq->seq.s[i]) && first_sequence[i] != 'N' && first_sequence[i] != toupper(seq->seq.s[i]))
162 {
163 first_sequence[i] = '>';
164 }
165 }
166
167 if(number_of_samples >= DEFAULT_NUM_SAMPLES)
168 {
169 sequence_names = realloc(sequence_names, (number_of_samples + 1) * sizeof(char*));
170 }
171 sequence_names[number_of_samples] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char));
172 strcpy(sequence_names[number_of_samples], seq->name.s);
173
174 number_of_samples++;
175 }
176
177 for(i = 0; i < length_of_genome; i++)
178 {
179 if(first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#'))
180 {
181 number_of_snps++;
182 }
183 }
184
185 if(number_of_snps == 0)
186 {
187 fprintf(stderr, "Warning: No SNPs were detected so there is nothing to output.\n");
188 fflush(stderr);
189 exit(EXIT_FAILURE);
190 }
191
192 int current_snp_index = 0;
193 snp_locations = calloc(number_of_snps +1, sizeof(int));
194 for(i = 0; i < length_of_genome; i++)
195 {
196 if(first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#'))
197 {
198 snp_locations[current_snp_index] = i;
199 current_snp_index++;
200 } else if (constant_site_counts != NULL && is_pure(first_sequence[i])) {
201 constant_site_counts[char_to_base_count_index[(int) toupper(first_sequence[i])]]++;
202 }
203
204 }
205
206 free(first_sequence);
207 kseq_destroy(seq);
208 gzclose(fp);
209 return;
128 kseq_t *seq;
129
130 fp = gzopen(filename, "r");
131 seq = kseq_init(fp);
132
133 sequence_names = calloc(DEFAULT_NUM_SAMPLES, sizeof(char *));
134
135 while ((l = kseq_read(seq)) >= 0) {
136 if (number_of_samples == 0) {
137 length_of_genome = seq->seq.l;
138 first_sequence = calloc(length_of_genome + 1, sizeof(char));
139 pseudo_reference_sequence = calloc(length_of_genome + 1, sizeof(char));
140
141 memset(first_sequence, 'N', length_of_genome);
142 memset(pseudo_reference_sequence, 'N', length_of_genome);
143 }
144 for (i = 0; i < length_of_genome; i++) {
145 if (first_sequence[i] == '#') {
146 continue;
147 }
148
149 if (first_sequence[i] == 'N' && !is_unknown(seq->seq.s[i])) {
150 first_sequence[i] = toupper(seq->seq.s[i]);
151 pseudo_reference_sequence[i] = toupper(seq->seq.s[i]);
152 }
153
154 // in pure mode we only want /ACGT/i, if any other base is found the whole column is excluded
155 if (pure_mode && !is_pure(seq->seq.s[i])) {
156 first_sequence[i] = '#';
157 continue;
158 }
159
160 if (first_sequence[i] != '>' && !is_unknown(seq->seq.s[i]) && first_sequence[i] != 'N' &&
161 first_sequence[i] != toupper(seq->seq.s[i])) {
162 first_sequence[i] = '>';
163 }
164 }
165
166 if (number_of_samples >= DEFAULT_NUM_SAMPLES) {
167 sequence_names = realloc(sequence_names, (number_of_samples + 1) * sizeof(char *));
168 }
169 sequence_names[number_of_samples] = calloc(MAX_SAMPLE_NAME_SIZE, sizeof(char));
170 strcpy(sequence_names[number_of_samples], seq->name.s);
171
172 number_of_samples++;
173 }
174
175 for (i = 0; i < length_of_genome; i++) {
176 if (first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#')) {
177 number_of_snps++;
178 }
179 }
180
181 if (number_of_snps == 0) {
182 fprintf(stderr, "Warning: No SNPs were detected so there is nothing to output.\n");
183 fflush(stderr);
184 exit(EXIT_FAILURE);
185 }
186
187 int current_snp_index = 0;
188 snp_locations = calloc(number_of_snps + 1, sizeof(int));
189 for (i = 0; i < length_of_genome; i++) {
190 if (first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#')) {
191 snp_locations[current_snp_index] = i;
192 current_snp_index++;
193 } else if (constant_site_counts != NULL && is_pure(first_sequence[i])) {
194 constant_site_counts[char_to_base_count_index[(int) toupper(first_sequence[i])]]++;
195 }
196
197 }
198
199 free(first_sequence);
200 kseq_destroy(seq);
201 gzclose(fp);
202 return;
210203 }
211204
212205 int is_unknown(char base)
213206 {
214 switch (base) {
215 case 'N':
216 case 'n':
217 case '-':
218 case '?':
219 return 1;
220 default:
221 return 0;
222 }
207 switch (base) {
208 case 'N':
209 case 'n':
210 case '-':
211 case '?':
212 return 1;
213 default:
214 return 0;
215 }
223216 }
224217
225218 int is_pure(char base)
226219 {
227 switch (base) {
228 case 'A':
229 case 'C':
230 case 'G':
231 case 'T':
232 case 'a':
233 case 'c':
234 case 'g':
235 case 't':
236 return 1;
237 default:
238 return 0;
239 }
240 }
220 switch (base) {
221 case 'A':
222 case 'C':
223 case 'G':
224 case 'T':
225 case 'a':
226 case 'c':
227 case 'g':
228 case 't':
229 return 1;
230 default:
231 return 0;
232 }
233 }
2121
2222 #include "kseq.h"
2323
24 void detect_snps( char filename[], int pure_mode, int output_monomorphic);
25 void detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts);
26 void get_bases_for_each_snp(char filename[], char ** bases_for_snps);
24 void detect_snps(char filename[], int pure_mode, int output_monomorphic);
25
26 void
27 detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts);
28
29 void get_bases_for_each_snp(char filename[], char **bases_for_snps);
30
2731 int is_unknown(char base);
32
2833 int get_length_of_genome();
34
2935 int get_number_of_samples();
36
3037 int get_number_of_snps();
31 char ** get_sequence_names();
32 int * get_snp_locations();
33 char * get_pseudo_reference_sequence();
38
39 char **get_sequence_names();
40
41 int *get_snp_locations();
42
43 char *get_pseudo_reference_sequence();
44
3445 int is_pure(char base);
3546
3647 #define MAX_SAMPLE_NAME_SIZE 2048
2323 #include <regex.h>
2424 #include "fasta-of-snp-sites.h"
2525
26 void create_fasta_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int output_reference, char * pseudo_reference_sequence, int snp_locations[])
26 void create_fasta_of_snp_sites(char filename[], int number_of_snps, char **bases_for_snps, char **sequence_names,
27 int number_of_samples, int output_reference, char *pseudo_reference_sequence,
28 int snp_locations[])
2729 {
28 FILE *fasta_file_pointer;
29 int sample_counter;
30 int snp_counter;
30 FILE *fasta_file_pointer;
31 int sample_counter;
32 int snp_counter;
3133
32 fasta_file_pointer = fopen(filename, "w");
33
34 if(output_reference == 1)
35 {
36 fprintf( fasta_file_pointer, ">pseudo_reference_sequence\n");
37 for(snp_counter=0; snp_counter< number_of_snps; snp_counter++)
38 {
39 fputc( pseudo_reference_sequence[snp_locations[snp_counter]], fasta_file_pointer );
34 fasta_file_pointer = fopen(filename, "w");
35
36 if (output_reference == 1) {
37 fprintf(fasta_file_pointer, ">pseudo_reference_sequence\n");
38 for (snp_counter = 0; snp_counter < number_of_snps; snp_counter++) {
39 fputc(pseudo_reference_sequence[snp_locations[snp_counter]], fasta_file_pointer);
40 }
41 fprintf(fasta_file_pointer, "\n");
4042 }
41 fprintf( fasta_file_pointer, "\n");
42 }
43
44 for(sample_counter=0; sample_counter< number_of_samples; sample_counter++)
45 {
46 fprintf( fasta_file_pointer, ">%s\n", sequence_names[sample_counter]);
47 for(snp_counter=0; snp_counter< number_of_snps; snp_counter++)
48 {
49 fputc( bases_for_snps[snp_counter][sample_counter], fasta_file_pointer );
50 }
51 fprintf( fasta_file_pointer, "\n");
52 }
53
54 fclose(fasta_file_pointer);
43
44 for (sample_counter = 0; sample_counter < number_of_samples; sample_counter++) {
45 fprintf(fasta_file_pointer, ">%s\n", sequence_names[sample_counter]);
46 for (snp_counter = 0; snp_counter < number_of_snps; snp_counter++) {
47 fputc(bases_for_snps[snp_counter][sample_counter], fasta_file_pointer);
48 }
49 fprintf(fasta_file_pointer, "\n");
50 }
51
52 fclose(fasta_file_pointer);
5553 }
2222
2323 #include <stdio.h>
2424
25 void create_fasta_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int output_reference, char * pseudo_reference_sequence, int snp_locations[]);
25 void create_fasta_of_snp_sites(char filename[], int number_of_snps, char **bases_for_snps, char **sequence_names,
26 int number_of_samples, int output_reference, char *pseudo_reference_sequence,
27 int snp_locations[]);
2628
2729 #endif
3636 #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
3737 #define KS_SEP_MAX 2
3838
39 #define __KS_TYPE(type_t) \
40 typedef struct __kstream_t { \
41 unsigned char *buf; \
42 int begin, end, is_eof; \
43 type_t f; \
44 } kstream_t;
39 #define __KS_TYPE(type_t) \
40 typedef struct __kstream_t { \
41 unsigned char *buf; \
42 int begin, end, is_eof; \
43 type_t f; \
44 } kstream_t;
4545
4646 #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
4747 #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
4848
49 #define __KS_BASIC(type_t, __bufsize) \
50 static inline kstream_t *ks_init(type_t f) \
51 { \
52 kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
53 ks->f = f; \
54 ks->buf = (unsigned char*)malloc(__bufsize); \
55 return ks; \
56 } \
57 static inline void ks_destroy(kstream_t *ks) \
58 { \
59 if (ks) { \
60 free(ks->buf); \
61 free(ks); \
62 } \
63 }
64
65 #define __KS_GETC(__read, __bufsize) \
66 static inline int ks_getc(kstream_t *ks) \
67 { \
68 if (ks->is_eof && ks->begin >= ks->end) return -1; \
69 if (ks->begin >= ks->end) { \
70 ks->begin = 0; \
71 ks->end = __read(ks->f, ks->buf, __bufsize); \
72 if (ks->end < __bufsize) ks->is_eof = 1; \
73 if (ks->end == 0) return -1; \
74 } \
75 return (int)ks->buf[ks->begin++]; \
76 }
49 #define __KS_BASIC(type_t, __bufsize) \
50 static inline kstream_t *ks_init(type_t f) \
51 { \
52 kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
53 ks->f = f; \
54 ks->buf = (unsigned char*)malloc(__bufsize); \
55 return ks; \
56 } \
57 static inline void ks_destroy(kstream_t *ks) \
58 { \
59 if (ks) { \
60 free(ks->buf); \
61 free(ks); \
62 } \
63 }
64
65 #define __KS_GETC(__read, __bufsize) \
66 static inline int ks_getc(kstream_t *ks) \
67 { \
68 if (ks->is_eof && ks->begin >= ks->end) return -1; \
69 if (ks->begin >= ks->end) { \
70 ks->begin = 0; \
71 ks->end = __read(ks->f, ks->buf, __bufsize); \
72 if (ks->end < __bufsize) ks->is_eof = 1; \
73 if (ks->end == 0) return -1; \
74 } \
75 return (int)ks->buf[ks->begin++]; \
76 }
7777
7878 #ifndef KSTRING_T
7979 #define KSTRING_T kstring_t
80 typedef struct __kstring_t {
81 size_t l, m;
82 char *s;
80 typedef struct __kstring_t
81 {
82 size_t l, m;
83 char *s;
8384 } kstring_t;
8485 #endif
8586
8788 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
8889 #endif
8990
90 #define __KS_GETUNTIL(__read, __bufsize) \
91 static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
92 { \
93 if (dret) *dret = 0; \
94 str->l = append? str->l : 0; \
95 if (ks->begin >= ks->end && ks->is_eof) return -1; \
96 for (;;) { \
97 int i; \
98 if (ks->begin >= ks->end) { \
99 if (!ks->is_eof) { \
100 ks->begin = 0; \
101 ks->end = __read(ks->f, ks->buf, __bufsize); \
102 if (ks->end < __bufsize) ks->is_eof = 1; \
103 if (ks->end == 0) break; \
104 } else break; \
105 } \
106 if (delimiter == KS_SEP_LINE) { \
107 for (i = ks->begin; i < ks->end; ++i) \
108 if (ks->buf[i] == '\n') break; \
109 } else if (delimiter > KS_SEP_MAX) { \
110 for (i = ks->begin; i < ks->end; ++i) \
111 if (ks->buf[i] == delimiter) break; \
112 } else if (delimiter == KS_SEP_SPACE) { \
113 for (i = ks->begin; i < ks->end; ++i) \
114 if (isspace(ks->buf[i])) break; \
115 } else if (delimiter == KS_SEP_TAB) { \
116 for (i = ks->begin; i < ks->end; ++i) \
117 if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
118 } else i = 0; /* never come to here! */ \
119 if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
120 str->m = str->l + (i - ks->begin) + 1; \
121 kroundup32(str->m); \
122 str->s = (char*)realloc(str->s, str->m); \
123 } \
124 memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
125 str->l = str->l + (i - ks->begin); \
126 ks->begin = i + 1; \
127 if (i < ks->end) { \
128 if (dret) *dret = ks->buf[i]; \
129 break; \
130 } \
131 } \
132 if (str->s == 0) { \
133 str->m = 1; \
134 str->s = (char*)calloc(1, 1); \
135 } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
136 str->s[str->l] = '\0'; \
137 return str->l; \
138 } \
139 static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
140 { return ks_getuntil2(ks, delimiter, str, dret, 0); }
91 #define __KS_GETUNTIL(__read, __bufsize) \
92 static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
93 { \
94 if (dret) *dret = 0; \
95 str->l = append? str->l : 0; \
96 if (ks->begin >= ks->end && ks->is_eof) return -1; \
97 for (;;) { \
98 int i; \
99 if (ks->begin >= ks->end) { \
100 if (!ks->is_eof) { \
101 ks->begin = 0; \
102 ks->end = __read(ks->f, ks->buf, __bufsize); \
103 if (ks->end < __bufsize) ks->is_eof = 1; \
104 if (ks->end == 0) break; \
105 } else break; \
106 } \
107 if (delimiter == KS_SEP_LINE) { \
108 for (i = ks->begin; i < ks->end; ++i) \
109 if (ks->buf[i] == '\n') break; \
110 } else if (delimiter > KS_SEP_MAX) { \
111 for (i = ks->begin; i < ks->end; ++i) \
112 if (ks->buf[i] == delimiter) break; \
113 } else if (delimiter == KS_SEP_SPACE) { \
114 for (i = ks->begin; i < ks->end; ++i) \
115 if (isspace(ks->buf[i])) break; \
116 } else if (delimiter == KS_SEP_TAB) { \
117 for (i = ks->begin; i < ks->end; ++i) \
118 if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
119 } else i = 0; /* never come to here! */ \
120 if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
121 str->m = str->l + (i - ks->begin) + 1; \
122 kroundup32(str->m); \
123 str->s = (char*)realloc(str->s, str->m); \
124 } \
125 memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
126 str->l = str->l + (i - ks->begin); \
127 ks->begin = i + 1; \
128 if (i < ks->end) { \
129 if (dret) *dret = ks->buf[i]; \
130 break; \
131 } \
132 } \
133 if (str->s == 0) { \
134 str->m = 1; \
135 str->s = (char*)calloc(1, 1); \
136 } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
137 str->s[str->l] = '\0'; \
138 return str->l; \
139 } \
140 static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
141 { return ks_getuntil2(ks, delimiter, str, dret, 0); }
141142
142143 #define KSTREAM_INIT(type_t, __read, __bufsize) \
143 __KS_TYPE(type_t) \
144 __KS_BASIC(type_t, __bufsize) \
145 __KS_GETC(__read, __bufsize) \
146 __KS_GETUNTIL(__read, __bufsize)
144 __KS_TYPE(type_t) \
145 __KS_BASIC(type_t, __bufsize) \
146 __KS_GETC(__read, __bufsize) \
147 __KS_GETUNTIL(__read, __bufsize)
147148
148149 #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
149150
150 #define __KSEQ_BASIC(SCOPE, type_t) \
151 SCOPE kseq_t *kseq_init(type_t fd) \
152 { \
153 kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
154 s->f = ks_init(fd); \
155 return s; \
156 } \
157 SCOPE void kseq_destroy(kseq_t *ks) \
158 { \
159 if (!ks) return; \
160 free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
161 ks_destroy(ks->f); \
162 free(ks); \
163 }
151 #define __KSEQ_BASIC(SCOPE, type_t) \
152 SCOPE kseq_t *kseq_init(type_t fd) \
153 { \
154 kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
155 s->f = ks_init(fd); \
156 return s; \
157 } \
158 SCOPE void kseq_destroy(kseq_t *ks) \
159 { \
160 if (!ks) return; \
161 free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
162 ks_destroy(ks->f); \
163 free(ks); \
164 }
164165
165166 /* Return value:
166167 >=0 length of the sequence (normal)
168169 -2 truncated quality string
169170 */
170171 #define __KSEQ_READ(SCOPE) \
171 SCOPE int kseq_read(kseq_t *seq) \
172 { \
173 int c; \
174 kstream_t *ks = seq->f; \
175 if (seq->last_char == 0) { /* then jump to the next header line */ \
176 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
177 if (c == -1) return -1; /* end of file */ \
178 seq->last_char = c; \
179 } /* else: the first header char has been read in the previous call */ \
180 seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
181 if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
182 if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
183 if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
184 seq->seq.m = 256; \
185 seq->seq.s = (char*)malloc(seq->seq.m); \
186 } \
187 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
188 if (c == '\n') continue; /* skip empty lines */ \
189 seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
190 ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
191 } \
192 if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
193 if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
194 seq->seq.m = seq->seq.l + 2; \
195 kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
196 seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
197 } \
198 seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
199 if (c != '+') return seq->seq.l; /* FASTA */ \
200 if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
201 seq->qual.m = seq->seq.m; \
202 seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
203 } \
204 while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
205 if (c == -1) return -2; /* error: no quality string */ \
206 while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
207 seq->last_char = 0; /* we have not come to the next header line */ \
208 if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
209 return seq->seq.l; \
210 }
211
212 #define __KSEQ_TYPE(type_t) \
213 typedef struct { \
214 kstring_t name, comment, seq, qual; \
215 int last_char; \
216 kstream_t *f; \
217 } kseq_t;
218
219 #define KSEQ_INIT2(SCOPE, type_t, __read) \
220 KSTREAM_INIT(type_t, __read, 2048) \
221 __KSEQ_TYPE(type_t) \
222 __KSEQ_BASIC(SCOPE, type_t) \
223 __KSEQ_READ(SCOPE)
172 SCOPE int kseq_read(kseq_t *seq) \
173 { \
174 int c; \
175 kstream_t *ks = seq->f; \
176 if (seq->last_char == 0) { /* then jump to the next header line */ \
177 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
178 if (c == -1) return -1; /* end of file */ \
179 seq->last_char = c; \
180 } /* else: the first header char has been read in the previous call */ \
181 seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
182 if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
183 if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
184 if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
185 seq->seq.m = 256; \
186 seq->seq.s = (char*)malloc(seq->seq.m); \
187 } \
188 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
189 if (c == '\n') continue; /* skip empty lines */ \
190 seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
191 ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
192 } \
193 if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
194 if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
195 seq->seq.m = seq->seq.l + 2; \
196 kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
197 seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
198 } \
199 seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
200 if (c != '+') return seq->seq.l; /* FASTA */ \
201 if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
202 seq->qual.m = seq->seq.m; \
203 seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
204 } \
205 while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
206 if (c == -1) return -2; /* error: no quality string */ \
207 while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
208 seq->last_char = 0; /* we have not come to the next header line */ \
209 if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
210 return seq->seq.l; \
211 }
212
213 #define __KSEQ_TYPE(type_t) \
214 typedef struct { \
215 kstring_t name, comment, seq, qual; \
216 int last_char; \
217 kstream_t *f; \
218 } kseq_t;
219
220 #define KSEQ_INIT2(SCOPE, type_t, __read) \
221 KSTREAM_INIT(type_t, __read, 2048) \
222 __KSEQ_TYPE(type_t) \
223 __KSEQ_BASIC(SCOPE, type_t) \
224 __KSEQ_READ(SCOPE)
224225
225226 #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
226227
227228 #define KSEQ_DECLARE(type_t) \
228 __KS_TYPE(type_t) \
229 __KSEQ_TYPE(type_t) \
230 extern kseq_t *kseq_init(type_t fd); \
231 void kseq_destroy(kseq_t *ks); \
232 int kseq_read(kseq_t *seq);
229 __KS_TYPE(type_t) \
230 __KSEQ_TYPE(type_t) \
231 extern kseq_t *kseq_init(type_t fd); \
232 void kseq_destroy(kseq_t *ks); \
233 int kseq_read(kseq_t *seq);
233234
234235 #endif
3131
3232 static void print_usage()
3333 {
34 printf("Usage: snp-sites [-rmvpcbhV] [-o output_filename] <file>\n");
35 printf("This program finds snp sites from a multi FASTA alignment file.\n");
36 printf(" -r output internal pseudo reference sequence\n");
37 printf(" -m output a multi fasta alignment file (default)\n");
38 printf(" -v output a VCF file\n");
39 printf(" -p output a phylip file\n");
40 printf(" -o STR specify an output filename [STDOUT]\n");
41 printf(" -c only output columns containing exclusively ACGT\n");
42 printf(" -C only output count of constant sites (suitable for IQ-TREE -fconst) and nothing else\n");
43 printf(" -b output monomorphic sites, used for BEAST\n");
44 printf(" -h this help message\n");
45 printf(" -V print version and exit\n");
46 printf(" <file> input alignment file which can optionally be gzipped\n\n");
47
48 printf("Example: creating files for BEAST\n");
49 printf(" snp-sites -cb -o outputfile.aln inputfile.aln\n\n");
34 printf("Usage: snp-sites [-rmvpcbhV] [-o output_filename] <file>\n");
35 printf("This program finds snp sites from a multi FASTA alignment file.\n");
36 printf(" -r output internal pseudo reference sequence\n");
37 printf(" -m output a multi fasta alignment file (default)\n");
38 printf(" -v output a VCF file\n");
39 printf(" -p output a phylip file\n");
40 printf(" -o STR specify an output filename [STDOUT]\n");
41 printf(" -c only output columns containing exclusively ACGT\n");
42 printf(" -C only output count of constant sites (suitable for IQ-TREE -fconst) and nothing else\n");
43 printf(" -b output monomorphic sites, used for BEAST\n");
44 printf(" -h this help message\n");
45 printf(" -V print version and exit\n");
46 printf(" <file> input alignment file which can optionally be gzipped\n\n");
5047
51 printf("If you use this program, please cite:\n");
52 printf("\"SNP-sites: rapid efficient extraction of SNPs from multi-FASTA alignments\",\n");
53 printf("Andrew J. Page, Ben Taylor, Aidan J. Delaney, Jorge Soares, Torsten Seemann, Jacqueline A. Keane, Simon R. Harris,\n");
54 printf("Microbial Genomics 2(4), (2016). http://dx.doi.org/10.1099/mgen.0.000056\n");
55
48 printf("Example: creating files for BEAST\n");
49 printf(" snp-sites -cb -o outputfile.aln inputfile.aln\n\n");
50
51 printf("If you use this program, please cite:\n");
52 printf("\"SNP-sites: rapid efficient extraction of SNPs from multi-FASTA alignments\",\n");
53 printf("Andrew J. Page, Ben Taylor, Aidan J. Delaney, Jorge Soares, Torsten Seemann, Jacqueline A. Keane, Simon R. Harris,\n");
54 printf("Microbial Genomics 2(4), (2016). http://dx.doi.org/10.1099/mgen.0.000056\n");
55
5656 }
5757
5858 static void print_version()
5959 {
60 printf("%s %s\n", PROGRAM_NAME, PROGRAM_VERSION);
60 printf("%s %s\n", PROGRAM_NAME, PROGRAM_VERSION);
6161 }
6262
63 int main (int argc, char **argv) {
64 char multi_fasta_filename[FILENAME_MAX] = {""};
65 char output_filename[FILENAME_MAX] = {"/dev/stdout"};
63 int main(int argc, char **argv)
64 {
65 char multi_fasta_filename[FILENAME_MAX] = {""};
66 char output_filename[FILENAME_MAX] = {"/dev/stdout"};
6667
67 int c;
68 int index;
69 int output_multi_fasta_file = 0;
70 int output_vcf_file = 0;
71 int output_phylip_file = 0;
72 int output_reference = 0;
73 int output_constant_site_counts = 0;
74 int pure_mode = 0;
75 int output_monomorphic =0;
76
77 while ((c = getopt (argc, argv, "mvrbpcCo:V")) != -1)
78 switch (c)
79 {
80 case 'm':
81 output_multi_fasta_file = 1;
82 break;
83 case 'v':
84 output_vcf_file = 1;
85 break;
86 case 'V':
87 print_version();
88 return 0;
89 case 'p':
90 output_phylip_file = 1;
91 break;
92 case 'r':
93 output_reference = 1;
94 break;
95 case 'c':
96 pure_mode = 1;
97 break;
98 case 'C':
99 output_constant_site_counts = 1;
100 break;
101 case 'b':
102 output_monomorphic = 1;
103 break;
104 case 'o':
105 strncpy(output_filename, optarg, FILENAME_MAX);
106 break;
107 case 'h':
108 print_usage();
109 exit(EXIT_SUCCESS);
110 default:
111 output_multi_fasta_file = 1;
112 }
113
114
115 if(optind < argc)
116 {
117 // check to see if the input alignment file exists
118 if( access( argv[optind], F_OK ) == -1 ) {
119 fprintf(stderr,"ERROR: cannot access input alignment file '%s'\n", argv[optind]);
120 fflush(stderr);
121 exit(EXIT_FAILURE);
68 int c;
69 int index;
70 int output_multi_fasta_file = 0;
71 int output_vcf_file = 0;
72 int output_phylip_file = 0;
73 int output_reference = 0;
74 int output_constant_site_counts = 0;
75 int pure_mode = 0;
76 int output_monomorphic = 0;
77
78 while ((c = getopt(argc, argv, "mvrbpcCo:V")) != -1)
79 switch (c) {
80 case 'm':
81 output_multi_fasta_file = 1;
82 break;
83 case 'v':
84 output_vcf_file = 1;
85 break;
86 case 'V':
87 print_version();
88 return 0;
89 case 'p':
90 output_phylip_file = 1;
91 break;
92 case 'r':
93 output_reference = 1;
94 break;
95 case 'c':
96 pure_mode = 1;
97 break;
98 case 'C':
99 output_constant_site_counts = 1;
100 break;
101 case 'b':
102 output_monomorphic = 1;
103 break;
104 case 'o':
105 strncpy(output_filename, optarg, FILENAME_MAX);
106 break;
107 case 'h':
108 print_usage();
109 exit(EXIT_SUCCESS);
110 default:
111 output_multi_fasta_file = 1;
112 }
113
114
115 if (optind < argc) {
116 // check to see if the input alignment file exists
117 if (access(argv[optind], F_OK) == -1) {
118 fprintf(stderr, "ERROR: cannot access input alignment file '%s'\n", argv[optind]);
119 fflush(stderr);
120 exit(EXIT_FAILURE);
121 }
122
123 strncpy(multi_fasta_filename, argv[optind], FILENAME_MAX);
124
125 if (output_constant_site_counts) {
126 count_constant_sites(multi_fasta_filename, output_filename);
127 } else if (pure_mode || output_monomorphic) {
128 generate_snp_sites_with_ref_pure_mono(multi_fasta_filename,
129 output_multi_fasta_file,
130 output_vcf_file, output_phylip_file,
131 output_filename, output_reference, pure_mode, output_monomorphic);
132 } else if (output_reference) {
133 generate_snp_sites_with_ref(multi_fasta_filename,
134 output_multi_fasta_file,
135 output_vcf_file, output_phylip_file,
136 output_filename);
137 } else {
138 generate_snp_sites(multi_fasta_filename, output_multi_fasta_file,
139 output_vcf_file, output_phylip_file,
140 output_filename);
141 }
142 } else {
143 print_usage();
122144 }
123
124 strncpy(multi_fasta_filename, argv[optind], FILENAME_MAX);
125145
126 if (output_constant_site_counts) {
127 count_constant_sites(multi_fasta_filename, output_filename);
128 } else if( pure_mode || output_monomorphic)
129 {
130 generate_snp_sites_with_ref_pure_mono(multi_fasta_filename,
131 output_multi_fasta_file,
132 output_vcf_file, output_phylip_file,
133 output_filename,output_reference,pure_mode,output_monomorphic);
134 }
135 else if(output_reference ) {
136 generate_snp_sites_with_ref(multi_fasta_filename,
137 output_multi_fasta_file,
138 output_vcf_file, output_phylip_file,
139 output_filename);
140 } else {
141 generate_snp_sites(multi_fasta_filename, output_multi_fasta_file,
142 output_vcf_file, output_phylip_file,
143 output_filename);
144 }
145 }
146 else
147 {
148 print_usage();
149 }
150
151 exit(EXIT_SUCCESS);
146 exit(EXIT_SUCCESS);
152147 }
2424 #include "phylib-of-snp-sites.h"
2525
2626
27 void create_phylib_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int output_reference, char * pseudo_reference_sequence, int snp_locations[])
27 void create_phylib_of_snp_sites(char filename[], int number_of_snps, char **bases_for_snps, char **sequence_names,
28 int number_of_samples, int output_reference, char *pseudo_reference_sequence,
29 int snp_locations[])
2830 {
29 FILE *phylip_file_pointer;
30 int sample_counter;
31 int snp_counter;
31 FILE *phylip_file_pointer;
32 int sample_counter;
33 int snp_counter;
3234
33 phylip_file_pointer = fopen(filename, "w");
34
35 if(output_reference == 1)
36 {
37 fprintf( phylip_file_pointer, "%d %d\n", number_of_samples+1, number_of_snps);
38 fprintf( phylip_file_pointer, "pseudo_reference_sequence\t");
39 for(snp_counter=0; snp_counter< number_of_snps; snp_counter++)
40 {
41 fputc( pseudo_reference_sequence[snp_locations[snp_counter]], phylip_file_pointer );
35 phylip_file_pointer = fopen(filename, "w");
36
37 if (output_reference == 1) {
38 fprintf(phylip_file_pointer, "%d %d\n", number_of_samples + 1, number_of_snps);
39 fprintf(phylip_file_pointer, "pseudo_reference_sequence\t");
40 for (snp_counter = 0; snp_counter < number_of_snps; snp_counter++) {
41 fputc(pseudo_reference_sequence[snp_locations[snp_counter]], phylip_file_pointer);
42 }
43 fprintf(phylip_file_pointer, "\n");
44 } else {
45 fprintf(phylip_file_pointer, "%d %d\n", number_of_samples, number_of_snps);
4246 }
43 fprintf( phylip_file_pointer, "\n");
44 }
45 else
46 {
47 fprintf( phylip_file_pointer, "%d %d\n", number_of_samples, number_of_snps);
48 }
49
50 for(sample_counter=0; sample_counter< number_of_samples; sample_counter++)
51 {
52 // sequence_name can be more than 10 (relaxed phylib format) and contain [\w\s]
53 //TODO check for illegal characters [^\w\s]
54 fprintf( phylip_file_pointer, "%s\t", sequence_names[sample_counter]);
55
56 for(snp_counter=0; snp_counter< number_of_snps; snp_counter++)
57 {
58 fputc( bases_for_snps[snp_counter][sample_counter], phylip_file_pointer);
59 }
60 fprintf( phylip_file_pointer, "\n");
61 }
6247
63 fclose(phylip_file_pointer);
48 for (sample_counter = 0; sample_counter < number_of_samples; sample_counter++) {
49 // sequence_name can be more than 10 (relaxed phylib format) and contain [\w\s]
50 //TODO check for illegal characters [^\w\s]
51 fprintf(phylip_file_pointer, "%s\t", sequence_names[sample_counter]);
52
53 for (snp_counter = 0; snp_counter < number_of_snps; snp_counter++) {
54 fputc(bases_for_snps[snp_counter][sample_counter], phylip_file_pointer);
55 }
56 fprintf(phylip_file_pointer, "\n");
57 }
58
59 fclose(phylip_file_pointer);
6460 }
2323
2424 #include <stdio.h>
2525
26 void create_phylib_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int output_reference, char * pseudo_reference_sequence, int snp_locations[]);
26 void create_phylib_of_snp_sites(char filename[], int number_of_snps, char **bases_for_snps, char **sequence_names,
27 int number_of_samples, int output_reference, char *pseudo_reference_sequence,
28 int snp_locations[]);
2729
2830 #endif
2626 #include "phylib-of-snp-sites.h"
2727 #include "fasta-of-snp-sites.h"
2828
29 char ** bases_for_snps;
29 char **bases_for_snps;
3030
3131 static int generate_snp_sites_generic(char filename[],
3232 int output_multi_fasta_file,
3636 int output_reference, int pure_mode,
3737 int output_monomorphic)
3838 {
39 int i;
40 detect_snps(filename, pure_mode, output_monomorphic);
39 int i;
40 detect_snps(filename, pure_mode, output_monomorphic);
4141
42 bases_for_snps = calloc(get_number_of_snps()+1, sizeof(char*));
43
44 for(i = 0; i < get_number_of_snps(); i++)
45 {
46 bases_for_snps[i] = calloc(get_number_of_samples()+1, sizeof(char));
47 }
48
49 get_bases_for_each_snp(filename, bases_for_snps);
50
51 char output_filename_base[FILENAME_MAX];
52 char filename_without_directory[FILENAME_MAX];
53 strip_directory_from_filename(filename, filename_without_directory);
54 strncpy(output_filename_base, filename_without_directory, FILENAME_MAX);
55
56 if(output_filename != NULL && *output_filename != '\0')
57 {
58 strncpy(output_filename_base, output_filename, FILENAME_MAX);
59 }
42 bases_for_snps = calloc(get_number_of_snps() + 1, sizeof(char *));
6043
61 if(output_vcf_file)
62 {
63 char vcf_output_filename[FILENAME_MAX];
64 strncpy(vcf_output_filename, output_filename_base, FILENAME_MAX);
65 if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') )
66 {
67 strcat(vcf_output_filename, ".vcf");
68 }
69
70 create_vcf_file(vcf_output_filename, get_snp_locations(), get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), get_length_of_genome(), get_pseudo_reference_sequence());
71 }
44 for (i = 0; i < get_number_of_snps(); i++) {
45 bases_for_snps[i] = calloc(get_number_of_samples() + 1, sizeof(char));
46 }
7247
73
74 if(output_phylip_file)
75 {
76 char phylip_output_filename[FILENAME_MAX];
77 strncpy(phylip_output_filename, output_filename_base, FILENAME_MAX);
78 if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') )
79 {
80 strcat(phylip_output_filename, ".phylip");
81 }
82 create_phylib_of_snp_sites(phylip_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations());
83 }
48 get_bases_for_each_snp(filename, bases_for_snps);
8449
85 if((output_multi_fasta_file) || (output_vcf_file ==0 && output_phylip_file == 0 && output_multi_fasta_file == 0))
86 {
87 char multi_fasta_output_filename[FILENAME_MAX];
88 strncpy(multi_fasta_output_filename, output_filename_base, FILENAME_MAX);
89 if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') )
90 {
91 strcat(multi_fasta_output_filename, ".snp_sites.aln");
92 }
93 create_fasta_of_snp_sites(multi_fasta_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations());
94 }
50 char output_filename_base[FILENAME_MAX];
51 char filename_without_directory[FILENAME_MAX];
52 strip_directory_from_filename(filename, filename_without_directory);
53 strncpy(output_filename_base, filename_without_directory, FILENAME_MAX);
9554
96 // free memory
97 free(get_snp_locations());
98 for(i = 0; i < get_number_of_samples(); i++)
99 {
100 // free(get_sequence_names().[i]);
101 }
102 for(i = 0; i < get_number_of_snps(); i++)
103 {
104 free(bases_for_snps[i]);
105 }
106 free(get_pseudo_reference_sequence());
55 if (output_filename != NULL && *output_filename != '\0') {
56 strncpy(output_filename_base, output_filename, FILENAME_MAX);
57 }
10758
108 return 1;
59 if (output_vcf_file) {
60 char vcf_output_filename[FILENAME_MAX];
61 strncpy(vcf_output_filename, output_filename_base, FILENAME_MAX);
62 if ((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 ||
63 (output_filename == NULL || *output_filename == '\0')) {
64 strcat(vcf_output_filename, ".vcf");
65 }
66
67 create_vcf_file(vcf_output_filename, get_snp_locations(), get_number_of_snps(), bases_for_snps,
68 get_sequence_names(), get_number_of_samples(), get_length_of_genome(),
69 get_pseudo_reference_sequence());
70 }
71
72
73 if (output_phylip_file) {
74 char phylip_output_filename[FILENAME_MAX];
75 strncpy(phylip_output_filename, output_filename_base, FILENAME_MAX);
76 if ((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 ||
77 (output_filename == NULL || *output_filename == '\0')) {
78 strcat(phylip_output_filename, ".phylip");
79 }
80 create_phylib_of_snp_sites(phylip_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(),
81 get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),
82 get_snp_locations());
83 }
84
85 if ((output_multi_fasta_file) ||
86 (output_vcf_file == 0 && output_phylip_file == 0 && output_multi_fasta_file == 0)) {
87 char multi_fasta_output_filename[FILENAME_MAX];
88 strncpy(multi_fasta_output_filename, output_filename_base, FILENAME_MAX);
89 if ((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 ||
90 (output_filename == NULL || *output_filename == '\0')) {
91 strcat(multi_fasta_output_filename, ".snp_sites.aln");
92 }
93 create_fasta_of_snp_sites(multi_fasta_output_filename, get_number_of_snps(), bases_for_snps,
94 get_sequence_names(), get_number_of_samples(), output_reference,
95 get_pseudo_reference_sequence(), get_snp_locations());
96 }
97
98 // free memory
99 free(get_snp_locations());
100 for (i = 0; i < get_number_of_samples(); i++) {
101 // free(get_sequence_names().[i]);
102 }
103 for (i = 0; i < get_number_of_snps(); i++) {
104 free(bases_for_snps[i]);
105 }
106 free(get_pseudo_reference_sequence());
107
108 return 1;
109109 }
110110
111111 int generate_snp_sites(char filename[], int output_multi_fasta_file,
112112 int output_vcf_file, int output_phylip_file,
113113 char output_filename[])
114114 {
115 return generate_snp_sites_generic(filename, output_multi_fasta_file,
116 output_vcf_file, output_phylip_file,
117 output_filename, 0,0,0);
115 return generate_snp_sites_generic(filename, output_multi_fasta_file,
116 output_vcf_file, output_phylip_file,
117 output_filename, 0, 0, 0);
118118 }
119119
120120 int generate_snp_sites_with_ref(char filename[], int output_multi_fasta_file,
123123 {
124124 return generate_snp_sites_generic(filename, output_multi_fasta_file,
125125 output_vcf_file, output_phylip_file,
126 output_filename, 1,0,0);
126 output_filename, 1, 0, 0);
127127 }
128128
129129 int generate_snp_sites_with_ref_pure_mono(char filename[],
130 int output_multi_fasta_file,
131 int output_vcf_file,
132 int output_phylip_file,
133 char output_filename[],
134 int output_reference,
135 int pure_mode,
136 int output_monomorphic)
130 int output_multi_fasta_file,
131 int output_vcf_file,
132 int output_phylip_file,
133 char output_filename[],
134 int output_reference,
135 int pure_mode,
136 int output_monomorphic)
137137 {
138 return generate_snp_sites_generic(filename, output_multi_fasta_file,
139 output_vcf_file, output_phylip_file,
140 output_filename, output_reference, pure_mode, output_monomorphic);
138 return generate_snp_sites_generic(filename, output_multi_fasta_file,
139 output_vcf_file, output_phylip_file,
140 output_filename, output_reference, pure_mode, output_monomorphic);
141141 }
142142
143 void count_constant_sites(char multi_fasta_filename[], char output_filename[]) {
143 void count_constant_sites(char multi_fasta_filename[], char output_filename[])
144 {
144145 char cwd[100];
145146 FILE *input_file;
146147 FILE *output_file;
166167 fclose(output_file);
167168 free(constant_site_counts);
168169 }
170
169171 // Inefficient
170 void strip_directory_from_filename(char * input_filename, char * output_filename)
172 void strip_directory_from_filename(char *input_filename, char *output_filename)
171173 {
172 int i;
173 int end_index = 0;
174 int last_forward_slash_index = -1;
175 for(i = 0; i< FILENAME_MAX; i++)
176 {
177 if(input_filename[i] == '/')
178 {
179 last_forward_slash_index = i;
174 int i;
175 int end_index = 0;
176 int last_forward_slash_index = -1;
177 for (i = 0; i < FILENAME_MAX; i++) {
178 if (input_filename[i] == '/') {
179 last_forward_slash_index = i;
180 }
181
182 if (input_filename[i] == '\0' || input_filename[i] == '\n') {
183 end_index = i;
184 break;
185 }
180186 }
181
182 if(input_filename[i] == '\0' || input_filename[i] == '\n')
183 {
184 end_index = i;
185 break;
187
188 int current_index = 0;
189 for (i = last_forward_slash_index + 1; i < end_index; i++) {
190 output_filename[current_index] = input_filename[i];
191 current_index++;
186192 }
187 }
188
189 int current_index = 0;
190 for(i = last_forward_slash_index+1; i< end_index; i++)
191 {
192 output_filename[current_index] = input_filename[i];
193 current_index++;
194 }
195 output_filename[current_index] = '\0';
193 output_filename[current_index] = '\0';
196194 }
197195
198196
3030 int output_vcf_file,
3131 int output_phylip_file,
3232 char output_filename[]);
33
3334 int generate_snp_sites_with_ref(char filename[],
3435 int output_multi_fasta_file,
3536 int output_vcf_file,
3637 int output_phylip_file,
3738 char output_filename[]);
38
39
3940 int generate_snp_sites_with_ref_pure_mono(char filename[],
40 int output_multi_fasta_file,
41 int output_vcf_file,
42 int output_phylip_file,
43 char output_filename[],
44 int output_reference,
45 int pure_mode,
46 int output_monomorphic);
41 int output_multi_fasta_file,
42 int output_vcf_file,
43 int output_phylip_file,
44 char output_filename[],
45 int output_reference,
46 int pure_mode,
47 int output_monomorphic);
4748
4849 void count_constant_sites(char multi_fasta_filename[], char filename[]);
4950
2727 #include "snp-sites.h"
2828 #include <assert.h>
2929
30 void create_vcf_file(char filename[], int snp_locations[],int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, size_t length_of_genome, char pseudo_reference_sequence[])
31 {
32 FILE *vcf_file_pointer;
33 char * base_filename;
34 base_filename = (char *) malloc(FILENAME_MAX*sizeof(char));
35 strcpy(base_filename, filename);
36
37 vcf_file_pointer=fopen(base_filename, "w");
38 output_vcf_header(vcf_file_pointer,sequence_names, number_of_samples, (int) length_of_genome);
39 output_vcf_snps(vcf_file_pointer, bases_for_snps, snp_locations, number_of_snps, number_of_samples,pseudo_reference_sequence);
40 fclose(vcf_file_pointer);
41 free(base_filename);
42 }
43
44 void output_vcf_snps(FILE * vcf_file_pointer, char ** bases_for_snps, int * snp_locations, int number_of_snps, int number_of_samples, char pseudo_reference_sequence[])
45 {
46 int i;
47 for(i=0; i < number_of_snps; i++)
48 {
49 output_vcf_row(vcf_file_pointer, bases_for_snps[i], snp_locations[i], number_of_samples,pseudo_reference_sequence);
50 }
51 }
52
53 void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int number_of_samples, size_t length_of_genome)
54 {
55 int i;
56 fprintf( vcf_file_pointer, "##fileformat=VCFv4.1\n" );
57 fprintf( vcf_file_pointer, "##contig=<ID=1,length=%i>\n", (int) length_of_genome );
58 fprintf( vcf_file_pointer, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n" );
59 fprintf( vcf_file_pointer, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT" );
60
61 for(i=0; i<number_of_samples; i++)
62 {
63 fprintf( vcf_file_pointer, "\t%s", sequence_names[i]);
64 }
65 fprintf( vcf_file_pointer, "\n");
66 }
67
68 void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_location, int number_of_samples, char pseudo_reference_sequence[])
69 {
70 char reference_base = pseudo_reference_sequence[snp_location];
71 if(reference_base == '\0')
72 {
73 fprintf(stderr, "Couldnt get the reference base for coordinate %d\n",snp_location);
74 fflush(stderr);
75 exit(EXIT_FAILURE);
76 }
77
78 // Chromosome
79 fprintf( vcf_file_pointer, "1\t");
80
81 // Position
82 fprintf( vcf_file_pointer, "%d\t", (int) snp_location +1 );
83
84 //ID
85 fprintf( vcf_file_pointer, ".\t");
86
87 // REF
88 fprintf( vcf_file_pointer, "%c\t", (char) reference_base );
89
90 // ALT
91 // Need to look through list and find unique characters
92
93 char * alt_bases = alternative_bases(reference_base, bases_for_snp, number_of_samples);
94 char * alternative_bases_string = format_alternative_bases(alt_bases);
95 fprintf( vcf_file_pointer, "%s\t", alternative_bases_string );
96 free(alternative_bases_string);
97
98 // QUAL
99 fprintf( vcf_file_pointer, ".\t");
100
101 // FILTER
102 fprintf( vcf_file_pointer, ".\t");
103
104 // INFO
105 fprintf( vcf_file_pointer, ".\t");
106
107 // FORMAT
108 fprintf( vcf_file_pointer, "GT\t");
109
110 // Bases for each sample
111 output_vcf_row_samples_bases(vcf_file_pointer, reference_base, alt_bases, bases_for_snp, number_of_samples );
112 free(alt_bases);
113
114 fprintf( vcf_file_pointer, "\n");
115
116 }
117
118
119 char * alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples)
120 {
121 int i;
122 int num_alt_bases = 0;
123 char * alt_bases = calloc(MAXIMUM_NUMBER_OF_ALT_BASES+1, sizeof(char));
124 for(i=0; i< number_of_samples; i++ )
125 {
126 char current_base = bases_for_snp[i];
127 if(current_base != reference_base)
128 {
129 if(is_unknown(current_base))
130 {
131 // VCF spec only allows ACGTN* for alts
132 current_base = '*';
133 }
134
135 if(check_if_char_in_string(alt_bases, current_base, num_alt_bases) == 0)
136 {
137 if (num_alt_bases >= MAXIMUM_NUMBER_OF_ALT_BASES)
138 {
139 fprintf(stderr, "Unexpectedly large number of alternative bases found between sequences. Please check input file is not corrupted\n\n");
140 fflush(stderr);
141 exit(EXIT_FAILURE);
142 }
143 alt_bases[num_alt_bases] = current_base;
144 num_alt_bases++;
145 }
146 }
147 }
148 alt_bases[num_alt_bases] = '\0';
149 return alt_bases;
150 }
151
152 char * format_allele_index(char base, char reference_base, char * alt_bases)
153 {
154 int length_of_alt_bases = strlen(alt_bases);
155 assert(length_of_alt_bases < 100);
156 char * result = calloc(5, sizeof(char));
157 int index;
158
159 if(is_unknown(base))
160 {
161 base = '*';
162 }
163
164 if (reference_base == base)
165 {
166 sprintf(result, "0");
167 }
168 else
169 {
170 sprintf(result, ".");
171 for (index = 1; index <= length_of_alt_bases; index++)
172 {
173 if (alt_bases[index-1] == base)
174 {
175 sprintf(result, "%i", index);
176 break;
177 }
178 }
179 }
180 return result;
181 }
182
183 char * format_alternative_bases(char * alt_bases)
184 {
185 int number_of_alt_bases = strlen(alt_bases);
186 assert( number_of_alt_bases < MAXIMUM_NUMBER_OF_ALT_BASES );
187
188 if(number_of_alt_bases ==0)
189 {
190 char * formatted_alt_bases = calloc(2 + 1, sizeof(char));
191 formatted_alt_bases[0] = '.';
30 void
31 create_vcf_file(char filename[], int snp_locations[], int number_of_snps, char **bases_for_snps, char **sequence_names,
32 int number_of_samples, size_t length_of_genome, char pseudo_reference_sequence[])
33 {
34 FILE *vcf_file_pointer;
35 char *base_filename;
36 base_filename = (char *) malloc(FILENAME_MAX * sizeof(char));
37 strcpy(base_filename, filename);
38
39 vcf_file_pointer = fopen(base_filename, "w");
40 output_vcf_header(vcf_file_pointer, sequence_names, number_of_samples, (int) length_of_genome);
41 output_vcf_snps(vcf_file_pointer, bases_for_snps, snp_locations, number_of_snps, number_of_samples,
42 pseudo_reference_sequence);
43 fclose(vcf_file_pointer);
44 free(base_filename);
45 }
46
47 void output_vcf_snps(FILE *vcf_file_pointer, char **bases_for_snps, int *snp_locations, int number_of_snps,
48 int number_of_samples, char pseudo_reference_sequence[])
49 {
50 int i;
51 for (i = 0; i < number_of_snps; i++) {
52 output_vcf_row(vcf_file_pointer, bases_for_snps[i], snp_locations[i], number_of_samples,
53 pseudo_reference_sequence);
54 }
55 }
56
57 void output_vcf_header(FILE *vcf_file_pointer, char **sequence_names, int number_of_samples, size_t length_of_genome)
58 {
59 int i;
60 fprintf(vcf_file_pointer, "##fileformat=VCFv4.1\n");
61 fprintf(vcf_file_pointer, "##contig=<ID=1,length=%i>\n", (int) length_of_genome);
62 fprintf(vcf_file_pointer, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
63 fprintf(vcf_file_pointer, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
64
65 for (i = 0; i < number_of_samples; i++) {
66 fprintf(vcf_file_pointer, "\t%s", sequence_names[i]);
67 }
68 fprintf(vcf_file_pointer, "\n");
69 }
70
71 void output_vcf_row(FILE *vcf_file_pointer, char *bases_for_snp, int snp_location, int number_of_samples,
72 char pseudo_reference_sequence[])
73 {
74 char reference_base = pseudo_reference_sequence[snp_location];
75 if (reference_base == '\0') {
76 fprintf(stderr, "Couldnt get the reference base for coordinate %d\n", snp_location);
77 fflush(stderr);
78 exit(EXIT_FAILURE);
79 }
80
81 // Chromosome
82 fprintf(vcf_file_pointer, "1\t");
83
84 // Position
85 fprintf(vcf_file_pointer, "%d\t", (int) snp_location + 1);
86
87 //ID
88 fprintf(vcf_file_pointer, ".\t");
89
90 // REF
91 fprintf(vcf_file_pointer, "%c\t", (char) reference_base);
92
93 // ALT
94 // Need to look through list and find unique characters
95
96 char *alt_bases = alternative_bases(reference_base, bases_for_snp, number_of_samples);
97 char *alternative_bases_string = format_alternative_bases(alt_bases);
98 fprintf(vcf_file_pointer, "%s\t", alternative_bases_string);
99 free(alternative_bases_string);
100
101 // QUAL
102 fprintf(vcf_file_pointer, ".\t");
103
104 // FILTER
105 fprintf(vcf_file_pointer, ".\t");
106
107 // INFO
108 fprintf(vcf_file_pointer, ".\t");
109
110 // FORMAT
111 fprintf(vcf_file_pointer, "GT\t");
112
113 // Bases for each sample
114 output_vcf_row_samples_bases(vcf_file_pointer, reference_base, alt_bases, bases_for_snp, number_of_samples);
115 free(alt_bases);
116
117 fprintf(vcf_file_pointer, "\n");
118
119 }
120
121
122 char *alternative_bases(char reference_base, char *bases_for_snp, int number_of_samples)
123 {
124 int i;
125 int num_alt_bases = 0;
126 char *alt_bases = calloc(MAXIMUM_NUMBER_OF_ALT_BASES + 1, sizeof(char));
127 for (i = 0; i < number_of_samples; i++) {
128 char current_base = bases_for_snp[i];
129 if (current_base != reference_base) {
130 if (is_unknown(current_base)) {
131 // VCF spec only allows ACGTN* for alts
132 current_base = '*';
133 }
134
135 if (check_if_char_in_string(alt_bases, current_base, num_alt_bases) == 0) {
136 if (num_alt_bases >= MAXIMUM_NUMBER_OF_ALT_BASES) {
137 fprintf(stderr,
138 "Unexpectedly large number of alternative bases found between sequences. Please check input file is not corrupted\n\n");
139 fflush(stderr);
140 exit(EXIT_FAILURE);
141 }
142 alt_bases[num_alt_bases] = current_base;
143 num_alt_bases++;
144 }
145 }
146 }
147 alt_bases[num_alt_bases] = '\0';
148 return alt_bases;
149 }
150
151 char *format_allele_index(char base, char reference_base, char *alt_bases)
152 {
153 int length_of_alt_bases = strlen(alt_bases);
154 assert(length_of_alt_bases < 100);
155 char *result = calloc(5, sizeof(char));
156 int index;
157
158 if (is_unknown(base)) {
159 base = '*';
160 }
161
162 if (reference_base == base) {
163 sprintf(result, "0");
164 } else {
165 sprintf(result, ".");
166 for (index = 1; index <= length_of_alt_bases; index++) {
167 if (alt_bases[index - 1] == base) {
168 sprintf(result, "%i", index);
169 break;
170 }
171 }
172 }
173 return result;
174 }
175
176 char *format_alternative_bases(char *alt_bases)
177 {
178 int number_of_alt_bases = strlen(alt_bases);
179 assert(number_of_alt_bases < MAXIMUM_NUMBER_OF_ALT_BASES);
180
181 if (number_of_alt_bases == 0) {
182 char *formatted_alt_bases = calloc(2 + 1, sizeof(char));
183 formatted_alt_bases[0] = '.';
184 return formatted_alt_bases;
185 }
186
187 char *formatted_alt_bases = calloc(number_of_alt_bases * 2 + 1, sizeof(char));
188 int i;
189 formatted_alt_bases[0] = alt_bases[0];
190 for (i = 1; i < number_of_alt_bases; i++) {
191 formatted_alt_bases[i * 2 - 1] = ',';
192 formatted_alt_bases[i * 2] = alt_bases[i];
193 }
192194 return formatted_alt_bases;
193 }
194
195 char * formatted_alt_bases = calloc(number_of_alt_bases*2 + 1, sizeof(char));
196 int i;
197 formatted_alt_bases[0] = alt_bases[0];
198 for (i = 1; i < number_of_alt_bases; i++)
199 {
200 formatted_alt_bases[i*2 - 1] = ',';
201 formatted_alt_bases[i*2] = alt_bases[i];
202 }
203 return formatted_alt_bases;
204195 }
205196
206197 int check_if_char_in_string(char search_string[], char target_char, int search_string_length)
207198 {
208 int i;
209 for(i=0; i < search_string_length ; i++ )
210 {
211 if(search_string[i] == target_char)
212 {
213 return 1;
214 }
215 }
216 return 0;
217 }
218
219 void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * alt_bases, char * bases_for_snp, int number_of_samples)
220 {
221 int i;
222 char * format;
223
224 for(i=0; i < number_of_samples ; i++ )
225 {
226 format = format_allele_index(bases_for_snp[i], reference_base, alt_bases);
227 fprintf( vcf_file_pointer, "%s", format);
228 free(format);
229 if(i+1 != number_of_samples)
230 {
231 fprintf( vcf_file_pointer, "\t");
232 }
233 }
234 }
235
236
237
238
199 int i;
200 for (i = 0; i < search_string_length; i++) {
201 if (search_string[i] == target_char) {
202 return 1;
203 }
204 }
205 return 0;
206 }
207
208 void output_vcf_row_samples_bases(FILE *vcf_file_pointer, char reference_base, char *alt_bases, char *bases_for_snp,
209 int number_of_samples)
210 {
211 int i;
212 char *format;
213
214 for (i = 0; i < number_of_samples; i++) {
215 format = format_allele_index(bases_for_snp[i], reference_base, alt_bases);
216 fprintf(vcf_file_pointer, "%s", format);
217 free(format);
218 if (i + 1 != number_of_samples) {
219 fprintf(vcf_file_pointer, "\t");
220 }
221 }
222 }
223
224
225
226
2222
2323 #include <stdio.h>
2424
25 void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int number_of_samples, size_t length_of_genome);
26 void create_vcf_file(char filename[], int snp_locations[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, size_t length_of_genome, char pseudo_reference_sequence[]);
27 void output_vcf_snps(FILE * vcf_file_pointer, char ** bases_for_snps, int * snp_locations, int number_of_snps, int number_of_samples, char pseudo_reference_sequence[]);
28 void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_location, int number_of_samples, char pseudo_reference_sequence[]);
29 void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * alt_bases, char * bases_for_snp, int number_of_samples);
30 char * alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples);
31 char * format_alternative_bases(char *);
32 char * format_allele_index(char, char, char *);
25 void output_vcf_header(FILE *vcf_file_pointer, char **sequence_names, int number_of_samples, size_t length_of_genome);
26
27 void
28 create_vcf_file(char filename[], int snp_locations[], int number_of_snps, char **bases_for_snps, char **sequence_names,
29 int number_of_samples, size_t length_of_genome, char pseudo_reference_sequence[]);
30
31 void output_vcf_snps(FILE *vcf_file_pointer, char **bases_for_snps, int *snp_locations, int number_of_snps,
32 int number_of_samples, char pseudo_reference_sequence[]);
33
34 void output_vcf_row(FILE *vcf_file_pointer, char *bases_for_snp, int snp_location, int number_of_samples,
35 char pseudo_reference_sequence[]);
36
37 void output_vcf_row_samples_bases(FILE *vcf_file_pointer, char reference_base, char *alt_bases, char *bases_for_snp,
38 int number_of_samples);
39
40 char *alternative_bases(char reference_base, char *bases_for_snp, int number_of_samples);
41
42 char *format_alternative_bases(char *);
43
44 char *format_allele_index(char, char, char *);
45
3346 int check_if_char_in_string(char search_string[], char target_char, int search_string_length);
3447
3548 #define MAXIMUM_NUMBER_OF_ALT_BASES 30
3434 #define actual_ref_seq_length2 10 //Former was 9
3535
3636 START_TEST (valid_alignment_with_one_line_per_sequence)
37 {
38 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",1,1,1,"");
39 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "alignment_file_one_line_per_sequence.aln.vcf" ) == 1, "Invalid VCF file for 1 line per seq" );
40 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "alignment_file_one_line_per_sequence.aln.phylip" ) == 1, "Invalid Phylip file for 1 line per seq" );
41 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1 , "Invalid ALN file for 1 line per seq");
42 remove("alignment_file_one_line_per_sequence.aln.vcf");
43 remove("alignment_file_one_line_per_sequence.aln.phylip");
44 remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln");
45 }
46 END_TEST
47
48
49 START_TEST (valid_alignment_with_n_as_gap)
50 {
51 generate_snp_sites("../tests/data/alignment_file_with_n.aln",1,1,1, "");
52 fail_unless( compare_files("../tests/data/alignment_file_with_n.aln.vcf", "alignment_file_with_n.aln.vcf" ) == 1, "Invalid VCF file for 1 line per seq" );
53 fail_unless( compare_files("../tests/data/alignment_file_with_n.aln.phylip", "alignment_file_with_n.aln.phylip" ) == 1, "Invalid Phylip file for 1 line per seq" );
54 fail_unless( compare_files("../tests/data/alignment_file_with_n.aln.snp_sites.aln","alignment_file_with_n.aln.snp_sites.aln" ) == 1 , "Invalid ALN file for 1 line per seq");
55 remove("alignment_file_with_n.aln.vcf");
56 remove("alignment_file_with_n.aln.phylip");
57 remove("alignment_file_with_n.aln.snp_sites.aln");
58 }
59 END_TEST
60
61
62
63 START_TEST (valid_alignment_with_one_line_per_sequence_gzipped)
64 {
65 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln.gz",1,1,1, NULL);
66 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "alignment_file_one_line_per_sequence.aln.gz.vcf" ) == 1, "Invalid VCF file for 1 line per seq" );
67 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "alignment_file_one_line_per_sequence.aln.gz.phylip" ) == 1, "Invalid Phylip file for 1 line per seq" );
68 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_one_line_per_sequence.aln.gz.snp_sites.aln" ) == 1 , "Invalid ALN file for 1 line per seq");
69 remove("alignment_file_one_line_per_sequence.aln.gz.vcf");
70 remove("alignment_file_one_line_per_sequence.aln.gz.phylip");
71 remove("alignment_file_one_line_per_sequence.aln.gz.snp_sites.aln");
72 }
73 END_TEST
74
75 START_TEST (valid_alignment_with_multiple_lines_per_sequence)
76 {
77 generate_snp_sites("../tests/data/alignment_file_multiple_lines_per_sequence.aln",1,1,1,"");
78 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "alignment_file_multiple_lines_per_sequence.aln.vcf" ) == 1, "Invalid VCF file for multiple lines per seq" );
79 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "alignment_file_multiple_lines_per_sequence.aln.phylip" ) == 1, "Invalid Phylip file for multiple lines per seq" );
80 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_multiple_lines_per_sequence.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq");
37 {
38 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 1, 1, 1, "");
39 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "alignment_file_one_line_per_sequence.aln.vcf" ) == 1, "Invalid VCF file for 1 line per seq" );
40 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "alignment_file_one_line_per_sequence.aln.phylip" ) == 1, "Invalid Phylip file for 1 line per seq" );
41 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", "alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1, "Invalid ALN file for 1 line per seq");
42 remove("alignment_file_one_line_per_sequence.aln.vcf");
43 remove("alignment_file_one_line_per_sequence.aln.phylip");
44 remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln");
45 }
46
47 END_TEST
48
49
50 START_TEST(valid_alignment_with_n_as_gap)
51 {
52 generate_snp_sites("../tests/data/alignment_file_with_n.aln", 1, 1, 1, "");
53 fail_unless(compare_files("../tests/data/alignment_file_with_n.aln.vcf", "alignment_file_with_n.aln.vcf") == 1,
54 "Invalid VCF file for 1 line per seq");
55 fail_unless(
56 compare_files("../tests/data/alignment_file_with_n.aln.phylip", "alignment_file_with_n.aln.phylip") == 1,
57 "Invalid Phylip file for 1 line per seq");
58 fail_unless(compare_files("../tests/data/alignment_file_with_n.aln.snp_sites.aln",
59 "alignment_file_with_n.aln.snp_sites.aln") == 1, "Invalid ALN file for 1 line per seq");
60 remove("alignment_file_with_n.aln.vcf");
61 remove("alignment_file_with_n.aln.phylip");
62 remove("alignment_file_with_n.aln.snp_sites.aln");
63 }
64
65 END_TEST
66
67
68 START_TEST(valid_alignment_with_one_line_per_sequence_gzipped)
69 {
70 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln.gz", 1, 1, 1, NULL);
71 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf",
72 "alignment_file_one_line_per_sequence.aln.gz.vcf") == 1,
73 "Invalid VCF file for 1 line per seq");
74 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip",
75 "alignment_file_one_line_per_sequence.aln.gz.phylip") == 1,
76 "Invalid Phylip file for 1 line per seq");
77 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln",
78 "alignment_file_one_line_per_sequence.aln.gz.snp_sites.aln") == 1,
79 "Invalid ALN file for 1 line per seq");
80 remove("alignment_file_one_line_per_sequence.aln.gz.vcf");
81 remove("alignment_file_one_line_per_sequence.aln.gz.phylip");
82 remove("alignment_file_one_line_per_sequence.aln.gz.snp_sites.aln");
83 }
84
85 END_TEST
86
87 START_TEST(valid_alignment_with_multiple_lines_per_sequence)
88 {
89 generate_snp_sites("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 1, 1, 1, "");
90 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf",
91 "alignment_file_multiple_lines_per_sequence.aln.vcf") == 1,
92 "Invalid VCF file for multiple lines per seq");
93 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip",
94 "alignment_file_multiple_lines_per_sequence.aln.phylip") == 1,
95 "Invalid Phylip file for multiple lines per seq");
96 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln",
97 "alignment_file_multiple_lines_per_sequence.aln.snp_sites.aln") == 1,
98 "Invalid ALN file for multiple lines per seq");
8199 remove("alignment_file_multiple_lines_per_sequence.aln.vcf");
82100 remove("alignment_file_multiple_lines_per_sequence.aln.phylip");
83101 remove("alignment_file_multiple_lines_per_sequence.aln.snp_sites.aln");
84102 }
85 END_TEST
86
87 START_TEST (valid_alignment_with_pure_mode)
88 {
89 generate_snp_sites_with_ref_pure_mono("../tests/data/pure_mode_alignment.aln",1,1,1,"",0,1,0);
90 fail_unless( compare_files("../tests/data/pure_mode_alignment.aln.vcf", "pure_mode_alignment.aln.vcf" ) == 1, "Invalid VCF file for multiple lines per seq" );
91 fail_unless( compare_files("../tests/data/pure_mode_alignment.aln.phylip", "pure_mode_alignment.aln.phylip" ) == 1, "Invalid Phylip file for multiple lines per seq" );
92 fail_unless( compare_files("../tests/data/pure_mode_alignment.aln.snp_sites.aln","pure_mode_alignment.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq");
103
104 END_TEST
105
106 START_TEST(valid_alignment_with_pure_mode)
107 {
108 generate_snp_sites_with_ref_pure_mono("../tests/data/pure_mode_alignment.aln", 1, 1, 1, "", 0, 1, 0);
109 fail_unless(compare_files("../tests/data/pure_mode_alignment.aln.vcf", "pure_mode_alignment.aln.vcf") == 1,
110 "Invalid VCF file for multiple lines per seq");
111 fail_unless(compare_files("../tests/data/pure_mode_alignment.aln.phylip", "pure_mode_alignment.aln.phylip") == 1,
112 "Invalid Phylip file for multiple lines per seq");
113 fail_unless(compare_files("../tests/data/pure_mode_alignment.aln.snp_sites.aln",
114 "pure_mode_alignment.aln.snp_sites.aln") == 1,
115 "Invalid ALN file for multiple lines per seq");
93116 remove("pure_mode_alignment.aln.vcf");
94117 remove("pure_mode_alignment.aln.phylip");
95118 remove("pure_mode_alignment.aln.snp_sites.aln");
96119 }
97 END_TEST
98
99 START_TEST (valid_alignment_with_monomorphic_sites)
100 {
101 generate_snp_sites_with_ref_pure_mono("../tests/data/pure_mode_monomorphic_alignment.aln",1,1,1,"",0,1,1);
102 fail_unless( compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.vcf", "pure_mode_monomorphic_alignment.aln.vcf" ) == 1, "Invalid VCF file for multiple lines per seq" );
103 fail_unless( compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.phylip", "pure_mode_monomorphic_alignment.aln.phylip" ) == 1, "Invalid Phylip file for multiple lines per seq" );
104 fail_unless( compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.snp_sites.aln","pure_mode_monomorphic_alignment.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq");
120
121 END_TEST
122
123 START_TEST(valid_alignment_with_monomorphic_sites)
124 {
125 generate_snp_sites_with_ref_pure_mono("../tests/data/pure_mode_monomorphic_alignment.aln", 1, 1, 1, "", 0, 1, 1);
126 fail_unless(compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.vcf",
127 "pure_mode_monomorphic_alignment.aln.vcf") == 1,
128 "Invalid VCF file for multiple lines per seq");
129 fail_unless(compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.phylip",
130 "pure_mode_monomorphic_alignment.aln.phylip") == 1,
131 "Invalid Phylip file for multiple lines per seq");
132 fail_unless(compare_files("../tests/data/pure_mode_monomorphic_alignment.aln.snp_sites.aln",
133 "pure_mode_monomorphic_alignment.aln.snp_sites.aln") == 1,
134 "Invalid ALN file for multiple lines per seq");
105135 remove("pure_mode_monomorphic_alignment.aln.vcf");
106136 remove("pure_mode_monomorphic_alignment.aln.phylip");
107137 remove("pure_mode_monomorphic_alignment.aln.snp_sites.aln");
108138 }
109 END_TEST
110
111
112 START_TEST (valid_with_only_aln_file_output_default)
113 {
114 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",0,0,0,"");
115 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq");
116 remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln");
117 }
118 END_TEST
119
120 START_TEST (valid_with_only_aln_file_output)
121 {
122 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",1,0,0,"");
123 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq");
124 remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln");
125 }
126 END_TEST
127
128 START_TEST (valid_with_only_aln_file_output_with_custom_name)
129 {
130 char output_filename[100] = {"some_custom_name"};
131 generate_snp_sites("../tests/data/alignment_file_multiple_lines_per_sequence.aln",1,0,0,output_filename);
132 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","some_custom_name" ) == 1 ,"Only aln with custom name should be outputted");
133 remove("some_custom_name");
134 }
135 END_TEST
136
137 START_TEST (valid_with_phylip_outputted_with_custom_name)
138 {
139 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",0,0,1,"some_custom_name");
140 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "some_custom_name" ) == 1, "Custom name needs extra extension for phylip" );
141 remove("some_custom_name");
142
143 }
144 END_TEST
145
146 START_TEST (invalid_with_uneven_file_lengths)
147 {
148 generate_snp_sites("../tests/data/uneven_alignment.aln",1,0,0,"");
149 }
150 END_TEST
151
152
153 START_TEST (valid_with_all_outputted_with_custom_name)
154 {
155 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln",1,1,1,"some_custom_name");
156 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "some_custom_name.vcf" ) == 1, "Custom name needs extra extension for VCF" );
157 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "some_custom_name.phylip" ) == 1, "Custom name needs extra extension for phylip" );
158 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln","some_custom_name.snp_sites.aln" ) == 1 , "Custom name needs extra extension for ALN");
159 remove("some_custom_name.vcf");
160 remove("some_custom_name.phylip");
161 remove("some_custom_name.snp_sites.aln");
162 }
163 END_TEST
164
165
166 START_TEST (valid_aln_plus_reference)
167 {
168 generate_snp_sites_with_ref("../tests/data/alignment_file_one_line_per_sequence.aln",1,0,0,"");
169 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence_reference.snp_sites.aln","alignment_file_one_line_per_sequence.aln.snp_sites.aln" ) == 1 ,"Invalid ALN file for multiple lines per seq");
170 remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln");
171 }
172 END_TEST
173
174 START_TEST (valid_phylip_plus_reference)
175 {
176 generate_snp_sites_with_ref("../tests/data/alignment_file_one_line_per_sequence.aln",0,0,1,"");
177 fail_unless( compare_files("../tests/data/alignment_file_one_line_per_sequence_reference.aln.phylip", "alignment_file_one_line_per_sequence.aln.phylip" ) == 1, "invalid phylib with reference" );
178 remove("alignment_file_one_line_per_sequence.aln.phylip");
179
180 }
181 END_TEST
182
183 START_TEST (valid_genome_length)
184 {
185 detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln",0,0);
186 fail_unless( get_length_of_genome() == 2000 );
187 }
188 END_TEST
189
190 START_TEST (valid_genome_length_with_multiple_lines_per_sequence)
191 {
192 detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0);
193 fail_unless( get_length_of_genome() == 2000 );
194 }
195 END_TEST
196
197 START_TEST (valid_number_of_sequences_in_file)
198 {
199 detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln",0,0);
200 fail_unless( get_number_of_samples() == 109 );
201 }
202 END_TEST
203
204 START_TEST (valid_number_of_sequences_in_file_with_multiple_lines_per_sequence)
205 {
206 detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0);
207 fail_unless( get_number_of_samples() == 109 );
208 }
209 END_TEST
210
211 START_TEST (number_of_snps_detected)
212 {
213 detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0);
214 fail_unless( get_number_of_snps() == 5);
215 }
216 END_TEST
217
218 START_TEST (number_of_snps_detected_small)
219 {
220 detect_snps("../tests/data/small_alignment.aln",0,0);
221 fail_unless( get_number_of_snps() == 1);
222 }
223 END_TEST
224
225 START_TEST (detect_snps_pure_mode)
226 {
227 detect_snps("../tests/data/pure_mode_alignment.aln",1,0);
228 fail_unless( get_number_of_snps() == 2);
229 }
230 END_TEST
231
232
233 START_TEST (detect_snps_pure_mode_monomorphic)
234 {
235 detect_snps("../tests/data/pure_mode_monomorphic_alignment.aln",1,1);
236 fail_unless( get_number_of_snps() == 3);
237 }
238 END_TEST
239
240 START_TEST (sample_names_from_alignment_file)
241 {
242 detect_snps("../tests/data/small_alignment.aln",0,0);
243 char ** current_sequence_names = get_sequence_names();
244
245 fail_unless(strcmp(current_sequence_names[0],"reference_sequence") == 0);
246 fail_unless(strcmp(current_sequence_names[1],"comparison_sequence") == 0);
247 fail_unless(strcmp(current_sequence_names[2],"another_comparison_sequence") == 0);
248 }
249 END_TEST
250
251 START_TEST (check_strip_directory_from_filename_without_directory)
252 {
253 char *input_filename_without_directory = "my_file_name.aln";
254 char output_filename[30];
255 memset(output_filename,'\0',30);
256 strip_directory_from_filename(input_filename_without_directory, output_filename);
257 fail_unless( strcmp(input_filename_without_directory, output_filename) ==0 );
258 }
259 END_TEST
260
261 START_TEST (check_strip_directory_from_filename_with_directory)
262 {
263 char *input_filename_without_directory = "/some/directory/name/my_file_name.aln";
264 char output_filename[30];
265 memset(output_filename,'\0',30);
266 strip_directory_from_filename(input_filename_without_directory, output_filename);
267 fail_unless( strcmp("my_file_name.aln", output_filename) ==0 );
268 }
269 END_TEST
270
271 START_TEST (check_count_constant_sites)
139
140 END_TEST
141
142
143 START_TEST(valid_with_only_aln_file_output_default)
144 {
145 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0, 0, "");
146 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln",
147 "alignment_file_one_line_per_sequence.aln.snp_sites.aln") == 1,
148 "Invalid ALN file for multiple lines per seq");
149 remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln");
150 }
151
152 END_TEST
153
154 START_TEST(valid_with_only_aln_file_output)
155 {
156 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 1, 0, 0, "");
157 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln",
158 "alignment_file_one_line_per_sequence.aln.snp_sites.aln") == 1,
159 "Invalid ALN file for multiple lines per seq");
160 remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln");
161 }
162
163 END_TEST
164
165 START_TEST(valid_with_only_aln_file_output_with_custom_name)
166 {
167 char output_filename[100] = {"some_custom_name"};
168 generate_snp_sites("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 1, 0, 0, output_filename);
169 fail_unless(
170 compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln", "some_custom_name") ==
171 1, "Only aln with custom name should be outputted");
172 remove("some_custom_name");
173 }
174
175 END_TEST
176
177 START_TEST(valid_with_phylip_outputted_with_custom_name)
178 {
179 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0, 1, "some_custom_name");
180 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "some_custom_name") == 1,
181 "Custom name needs extra extension for phylip");
182 remove("some_custom_name");
183
184 }
185
186 END_TEST
187
188 START_TEST(invalid_with_uneven_file_lengths)
189 {
190 generate_snp_sites("../tests/data/uneven_alignment.aln", 1, 0, 0, "");
191 }
192
193 END_TEST
194
195
196 START_TEST(valid_with_all_outputted_with_custom_name)
197 {
198 generate_snp_sites("../tests/data/alignment_file_one_line_per_sequence.aln", 1, 1, 1, "some_custom_name");
199 fail_unless(
200 compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.vcf", "some_custom_name.vcf") == 1,
201 "Custom name needs extra extension for VCF");
202 fail_unless(
203 compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.phylip", "some_custom_name.phylip") ==
204 1, "Custom name needs extra extension for phylip");
205 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence.aln.snp_sites.aln",
206 "some_custom_name.snp_sites.aln") == 1, "Custom name needs extra extension for ALN");
207 remove("some_custom_name.vcf");
208 remove("some_custom_name.phylip");
209 remove("some_custom_name.snp_sites.aln");
210 }
211
212 END_TEST
213
214
215 START_TEST(valid_aln_plus_reference)
216 {
217 generate_snp_sites_with_ref("../tests/data/alignment_file_one_line_per_sequence.aln", 1, 0, 0, "");
218 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence_reference.snp_sites.aln",
219 "alignment_file_one_line_per_sequence.aln.snp_sites.aln") == 1,
220 "Invalid ALN file for multiple lines per seq");
221 remove("alignment_file_one_line_per_sequence.aln.snp_sites.aln");
222 }
223
224 END_TEST
225
226 START_TEST(valid_phylip_plus_reference)
227 {
228 generate_snp_sites_with_ref("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0, 1, "");
229 fail_unless(compare_files("../tests/data/alignment_file_one_line_per_sequence_reference.aln.phylip",
230 "alignment_file_one_line_per_sequence.aln.phylip") == 1, "invalid phylib with reference");
231 remove("alignment_file_one_line_per_sequence.aln.phylip");
232
233 }
234
235 END_TEST
236
237 START_TEST(valid_genome_length)
238 {
239 detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0);
240 fail_unless(get_length_of_genome() == 2000);
241 }
242
243 END_TEST
244
245 START_TEST(valid_genome_length_with_multiple_lines_per_sequence)
246 {
247 detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 0, 0);
248 fail_unless(get_length_of_genome() == 2000);
249 }
250
251 END_TEST
252
253 START_TEST(valid_number_of_sequences_in_file)
254 {
255 detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln", 0, 0);
256 fail_unless(get_number_of_samples() == 109);
257 }
258
259 END_TEST
260
261 START_TEST(valid_number_of_sequences_in_file_with_multiple_lines_per_sequence)
262 {
263 detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 0, 0);
264 fail_unless(get_number_of_samples() == 109);
265 }
266
267 END_TEST
268
269 START_TEST(number_of_snps_detected)
270 {
271 detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln", 0, 0);
272 fail_unless(get_number_of_snps() == 5);
273 }
274
275 END_TEST
276
277 START_TEST(number_of_snps_detected_small)
278 {
279 detect_snps("../tests/data/small_alignment.aln", 0, 0);
280 fail_unless(get_number_of_snps() == 1);
281 }
282
283 END_TEST
284
285 START_TEST(detect_snps_pure_mode)
286 {
287 detect_snps("../tests/data/pure_mode_alignment.aln", 1, 0);
288 fail_unless(get_number_of_snps() == 2);
289 }
290
291 END_TEST
292
293
294 START_TEST(detect_snps_pure_mode_monomorphic)
295 {
296 detect_snps("../tests/data/pure_mode_monomorphic_alignment.aln", 1, 1);
297 fail_unless(get_number_of_snps() == 3);
298 }
299
300 END_TEST
301
302 START_TEST(sample_names_from_alignment_file)
303 {
304 detect_snps("../tests/data/small_alignment.aln", 0, 0);
305 char **current_sequence_names = get_sequence_names();
306
307 fail_unless(strcmp(current_sequence_names[0], "reference_sequence") == 0);
308 fail_unless(strcmp(current_sequence_names[1], "comparison_sequence") == 0);
309 fail_unless(strcmp(current_sequence_names[2], "another_comparison_sequence") == 0);
310 }
311
312 END_TEST
313
314 START_TEST(check_strip_directory_from_filename_without_directory)
315 {
316 char *input_filename_without_directory = "my_file_name.aln";
317 char output_filename[30];
318 memset(output_filename, '\0', 30);
319 strip_directory_from_filename(input_filename_without_directory, output_filename);
320 fail_unless(strcmp(input_filename_without_directory, output_filename) == 0);
321 }
322
323 END_TEST
324
325 START_TEST(check_strip_directory_from_filename_with_directory)
326 {
327 char *input_filename_without_directory = "/some/directory/name/my_file_name.aln";
328 char output_filename[30];
329 memset(output_filename, '\0', 30);
330 strip_directory_from_filename(input_filename_without_directory, output_filename);
331 fail_unless(strcmp("my_file_name.aln", output_filename) == 0);
332 }
333
334 END_TEST
335
336 START_TEST(check_count_constant_sites)
272337 {
273338 count_constant_sites("../tests/data/small_alignment.aln", "small_alignment.constant_site_counts.txt");
274 fail_unless(compare_files("../tests/data/small_alignment.constant_site_counts.txt", "small_alignment.constant_site_counts.txt"));
339 fail_unless(compare_files("../tests/data/small_alignment.constant_site_counts.txt",
340 "small_alignment.constant_site_counts.txt"));
275341 remove("small_alignment.constant_site_counts.txt");
276342 }
277 END_TEST
278
279 Suite * snp_sites_suite (void)
280 {
281 Suite *s = suite_create ("Creating_SNP_Sites");
282
283 TCase *tc_alignment_file = tcase_create ("alignment_file");
284 tcase_add_test (tc_alignment_file, valid_genome_length);
285 tcase_add_test (tc_alignment_file, valid_genome_length_with_multiple_lines_per_sequence);
286 tcase_add_test (tc_alignment_file, valid_number_of_sequences_in_file);
287 tcase_add_test (tc_alignment_file, valid_number_of_sequences_in_file_with_multiple_lines_per_sequence);
288 tcase_add_test (tc_alignment_file, number_of_snps_detected_small);
289 tcase_add_test (tc_alignment_file, number_of_snps_detected);
290 tcase_add_test (tc_alignment_file, sample_names_from_alignment_file);
291 tcase_add_test (tc_alignment_file, detect_snps_pure_mode);
292 tcase_add_test (tc_alignment_file, check_strip_directory_from_filename_without_directory);
293 tcase_add_test (tc_alignment_file, check_strip_directory_from_filename_with_directory);
294 tcase_add_test (tc_alignment_file, detect_snps_pure_mode_monomorphic);
295
296 suite_add_tcase (s, tc_alignment_file);
297
298 TCase *tc_snp_sites = tcase_create ("snp_sites");
299 tcase_add_test (tc_snp_sites, valid_alignment_with_one_line_per_sequence);
300 tcase_add_test (tc_snp_sites, valid_alignment_with_multiple_lines_per_sequence);
301 tcase_add_test (tc_snp_sites, valid_alignment_with_one_line_per_sequence_gzipped);
302 tcase_add_test (tc_snp_sites, valid_alignment_with_n_as_gap);
303 tcase_add_test (tc_snp_sites, valid_with_only_aln_file_output);
304 tcase_add_test (tc_snp_sites, valid_with_only_aln_file_output_with_custom_name);
305 tcase_add_test (tc_snp_sites, valid_with_all_outputted_with_custom_name);
306 tcase_add_test (tc_snp_sites, valid_with_only_aln_file_output_default);
307 tcase_add_test (tc_snp_sites, valid_with_phylip_outputted_with_custom_name);
308 tcase_add_test (tc_snp_sites, valid_aln_plus_reference);
309 tcase_add_test (tc_snp_sites, valid_phylip_plus_reference);
310 tcase_add_test (tc_snp_sites, valid_alignment_with_pure_mode);
311 tcase_add_test (tc_snp_sites, valid_alignment_with_monomorphic_sites);
312 tcase_add_test (tc_snp_sites, check_count_constant_sites);
313
314 tcase_add_exit_test(tc_snp_sites, invalid_with_uneven_file_lengths,EXIT_FAILURE);
315 remove("uneven_alignment.aln.snp_sites.aln");
316 suite_add_tcase (s, tc_snp_sites);
317
318 return s;
319 }
320
343
344 END_TEST
345
346 Suite
347 *
348
349 snp_sites_suite(void)
350 {
351 Suite * s = suite_create("Creating_SNP_Sites");
352
353 TCase *tc_alignment_file = tcase_create("alignment_file");
354 tcase_add_test(tc_alignment_file, valid_genome_length);
355 tcase_add_test(tc_alignment_file, valid_genome_length_with_multiple_lines_per_sequence);
356 tcase_add_test(tc_alignment_file, valid_number_of_sequences_in_file);
357 tcase_add_test(tc_alignment_file, valid_number_of_sequences_in_file_with_multiple_lines_per_sequence);
358 tcase_add_test(tc_alignment_file, number_of_snps_detected_small);
359 tcase_add_test(tc_alignment_file, number_of_snps_detected);
360 tcase_add_test(tc_alignment_file, sample_names_from_alignment_file);
361 tcase_add_test(tc_alignment_file, detect_snps_pure_mode);
362 tcase_add_test(tc_alignment_file, check_strip_directory_from_filename_without_directory);
363 tcase_add_test(tc_alignment_file, check_strip_directory_from_filename_with_directory);
364 tcase_add_test(tc_alignment_file, detect_snps_pure_mode_monomorphic);
365
366 suite_add_tcase(s, tc_alignment_file);
367
368 TCase *tc_snp_sites = tcase_create("snp_sites");
369 tcase_add_test(tc_snp_sites, valid_alignment_with_one_line_per_sequence);
370 tcase_add_test(tc_snp_sites, valid_alignment_with_multiple_lines_per_sequence);
371 tcase_add_test(tc_snp_sites, valid_alignment_with_one_line_per_sequence_gzipped);
372 tcase_add_test(tc_snp_sites, valid_alignment_with_n_as_gap);
373 tcase_add_test(tc_snp_sites, valid_with_only_aln_file_output);
374 tcase_add_test(tc_snp_sites, valid_with_only_aln_file_output_with_custom_name);
375 tcase_add_test(tc_snp_sites, valid_with_all_outputted_with_custom_name);
376 tcase_add_test(tc_snp_sites, valid_with_only_aln_file_output_default);
377 tcase_add_test(tc_snp_sites, valid_with_phylip_outputted_with_custom_name);
378 tcase_add_test(tc_snp_sites, valid_aln_plus_reference);
379 tcase_add_test(tc_snp_sites, valid_phylip_plus_reference);
380 tcase_add_test(tc_snp_sites, valid_alignment_with_pure_mode);
381 tcase_add_test(tc_snp_sites, valid_alignment_with_monomorphic_sites);
382 tcase_add_test(tc_snp_sites, check_count_constant_sites);
383
384 tcase_add_exit_test(tc_snp_sites, invalid_with_uneven_file_lengths, EXIT_FAILURE);
385 remove("uneven_alignment.aln.snp_sites.aln");
386 suite_add_tcase(s, tc_snp_sites);
387
388 return s;
389 }
390
1919 #ifndef _CHECK_SNP_SITES_H_
2020 #define _CHECK_SNP_SITES_H_
2121
22 Suite * snp_sites_suite (void);
22 Suite *snp_sites_suite(void);
23
2324 #endif
2425
2526
2727 #include "check-vcf.h"
2828 #include "vcf.h"
2929
30 void check_alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples, char * expected_result)
30 void check_alternative_bases(char reference_base, char *bases_for_snp, int number_of_samples, char *expected_result)
3131 {
32 char * result;
33 result = alternative_bases(reference_base, bases_for_snp, number_of_samples);
34 ck_assert_str_eq(result, expected_result);
35 free(result);
32 char *result;
33 result = alternative_bases(reference_base, bases_for_snp, number_of_samples);
34 ck_assert_str_eq(result, expected_result);
35 free(result);
3636 }
3737
3838 START_TEST (alternative_bases_test)
39 {
40 check_alternative_bases('A', "AGCT-nN", 6, "GCT*");
41 }
39 {
40 check_alternative_bases('A', "AGCT-nN", 6, "GCT*");
41 }
42
4243 END_TEST
4344
44 void check_format_alternative_bases(char * test_case, char * expected_result)
45 void check_format_alternative_bases(char *test_case, char *expected_result)
4546 {
46 char * result;
47 result = format_alternative_bases(test_case);
48 ck_assert_str_eq(result, expected_result);
49 free(result);
47 char *result;
48 result = format_alternative_bases(test_case);
49 ck_assert_str_eq(result, expected_result);
50 free(result);
5051 }
5152
5253 START_TEST (format_alternative_bases_test)
53 {
54 check_format_alternative_bases("", ".");
55 check_format_alternative_bases("A", "A");
56 check_format_alternative_bases("AC", "A,C");
57 check_format_alternative_bases("ACT", "A,C,T");
58 }
54 {
55 check_format_alternative_bases("", ".");
56 check_format_alternative_bases("A", "A");
57 check_format_alternative_bases("AC", "A,C");
58 check_format_alternative_bases("ACT", "A,C,T");
59 }
60
5961 END_TEST
6062
61 void check_format_allele_index(char test_base, char reference_base, char * alt_bases, char * expected_result)
63 void check_format_allele_index(char test_base, char reference_base, char *alt_bases, char *expected_result)
6264 {
63 char * result;
64 result = format_allele_index(test_base, reference_base, alt_bases);
65 ck_assert_str_eq(result, expected_result);
66 free(result);
65 char *result;
66 result = format_allele_index(test_base, reference_base, alt_bases);
67 ck_assert_str_eq(result, expected_result);
68 free(result);
6769 }
6870
6971 START_TEST (format_allele_index_test)
70 {
71 check_format_allele_index('A', 'A', "", "0");
72 check_format_allele_index('A', 'A', "C", "0");
73 check_format_allele_index('A', 'A', "CA", "0");
72 {
73 check_format_allele_index('A', 'A', "", "0");
74 check_format_allele_index('A', 'A', "C", "0");
75 check_format_allele_index('A', 'A', "CA", "0");
7476
75 check_format_allele_index('A', 'C', "A", "1");
76 check_format_allele_index('A', 'C', "GA", "2");
77 check_format_allele_index('A', 'C', "A", "1");
78 check_format_allele_index('A', 'C', "GA", "2");
7779
78 check_format_allele_index('A', 'C', "", ".");
79 check_format_allele_index('A', 'C', "G", ".");
80 check_format_allele_index('A', 'C', "", ".");
81 check_format_allele_index('A', 'C', "G", ".");
8082
81 check_format_allele_index('A', 'B', "CDEFGHIJKLMNOPAQRST", "15");
83 check_format_allele_index('A', 'B', "CDEFGHIJKLMNOPAQRST", "15");
8284
83 check_format_allele_index('-', 'A', "C*", "2");
84 check_format_allele_index('N', 'A', "C*", "2");
85 check_format_allele_index('n', 'A', "C*", "2");
86 }
85 check_format_allele_index('-', 'A', "C*", "2");
86 check_format_allele_index('N', 'A', "C*", "2");
87 check_format_allele_index('n', 'A', "C*", "2");
88 }
8789 END_TEST
8890
89 Suite * vcf_suite (void)
91 Suite
92 *
93
94 vcf_suite(void)
9095 {
91 Suite *s = suite_create ("Creating_VCF_file");
96 Suite * s = suite_create("Creating_VCF_file");
9297
93 TCase *tc_vcf_file = tcase_create ("vcf_file");
94 tcase_add_test (tc_vcf_file, alternative_bases_test);
95 tcase_add_test (tc_vcf_file, format_alternative_bases_test);
96 tcase_add_test (tc_vcf_file, format_allele_index_test);
97 suite_add_tcase (s, tc_vcf_file);
98 TCase *tc_vcf_file = tcase_create("vcf_file");
99 tcase_add_test(tc_vcf_file, alternative_bases_test);
100 tcase_add_test(tc_vcf_file, format_alternative_bases_test);
101 tcase_add_test(tc_vcf_file, format_allele_index_test);
102 suite_add_tcase(s, tc_vcf_file);
98103
99 return s;
104 return s;
100105 }
101106
102107
2020 #define _CHECK_VCF_H_
2121
2222 void check_alternative_bases(char, char *, int, char *);
23
2324 void check_format_alternative_bases(char *, char *);
25
2426 void check_format_allele_index(char, char, char *, char *);
25 Suite * vcf_suite (void);
27
28 Suite *vcf_suite(void);
29
2630 #endif
2731
2832
2626 #include "helper-methods.h"
2727
2828
29 int compare_files(char expected_output_filename[],char actual_output_filename[] )
29 int compare_files(char expected_output_filename[], char actual_output_filename[])
3030 {
31 FILE *expected_output_fh;
32 FILE *actual_output_fh;
33
34 char *expected_buffer;
35 char *actual_buffer;
36 long numbytes;
31 FILE *expected_output_fh;
32 FILE *actual_output_fh;
3733
38 size_t fsize_expected;
39 size_t fsize_actual;
40
41 expected_output_fh = fopen(expected_output_filename, "r");
42 actual_output_fh = fopen(actual_output_filename, "r");
43
44 fseek(expected_output_fh, 0L, SEEK_END);
45 numbytes = ftell(expected_output_fh);
46 fseek(expected_output_fh, 0L, SEEK_SET);
47 expected_buffer = (char*)calloc(numbytes +1, sizeof(char));
48 fsize_expected = fread(expected_buffer, sizeof(char), numbytes, expected_output_fh);
49 fclose(expected_output_fh);
50
51 fseek(actual_output_fh, 0L, SEEK_END);
52 numbytes = ftell(actual_output_fh);
53 fseek(actual_output_fh, 0L, SEEK_SET);
54 actual_buffer = (char*)calloc(numbytes +1, sizeof(char));
55 fsize_actual = fread(actual_buffer, sizeof(char), numbytes, actual_output_fh);
56 fclose(actual_output_fh);
57
58 if(strcmp(expected_buffer,actual_buffer) == 0)
59 {
34 char *expected_buffer;
35 char *actual_buffer;
36 long numbytes;
37
38 size_t fsize_expected;
39 size_t fsize_actual;
40
41 expected_output_fh = fopen(expected_output_filename, "r");
42 actual_output_fh = fopen(actual_output_filename, "r");
43
44 fseek(expected_output_fh, 0L, SEEK_END);
45 numbytes = ftell(expected_output_fh);
46 fseek(expected_output_fh, 0L, SEEK_SET);
47 expected_buffer = (char *) calloc(numbytes + 1, sizeof(char));
48 fsize_expected = fread(expected_buffer, sizeof(char), numbytes, expected_output_fh);
49 fclose(expected_output_fh);
50
51 fseek(actual_output_fh, 0L, SEEK_END);
52 numbytes = ftell(actual_output_fh);
53 fseek(actual_output_fh, 0L, SEEK_SET);
54 actual_buffer = (char *) calloc(numbytes + 1, sizeof(char));
55 fsize_actual = fread(actual_buffer, sizeof(char), numbytes, actual_output_fh);
56 fclose(actual_output_fh);
57
58 if (strcmp(expected_buffer, actual_buffer) == 0) {
59 free(expected_buffer);
60 free(actual_buffer);
61 return 1;
62 }
63
6064 free(expected_buffer);
6165 free(actual_buffer);
62 return 1;
63 }
6466
65 free(expected_buffer);
66 free(actual_buffer);
67
68 return 0;
67 return 0;
6968 }
1919 #ifndef _HELPER_METHODS_H_
2020 #define _HELPER_METHODS_H_
2121
22 int compare_files(char expected_output_filename[],char actual_output_filename[] );
22 int compare_files(char expected_output_filename[], char actual_output_filename[]);
23
2324 #endif
2425
2424 #include "check-vcf.h"
2525
2626
27 int main(void)
28 {
29 int number_failed;
30 Suite *s;
31 SRunner *sr;
2732
28 int main (void)
29 {
30 int number_failed;
31 Suite *s;
32 SRunner *sr;
33 s = snp_sites_suite();
34 sr = srunner_create(s);
35 srunner_run_all(sr, CK_NORMAL);
36 number_failed = srunner_ntests_failed(sr);
37 srunner_free(sr);
3338
34 s = snp_sites_suite ();
35 sr = srunner_create (s);
36 srunner_run_all (sr, CK_NORMAL);
37 number_failed = srunner_ntests_failed (sr);
38 srunner_free (sr);
39 s = vcf_suite();
40 sr = srunner_create(s);
41 srunner_run_all(sr, CK_NORMAL);
42 number_failed += srunner_ntests_failed(sr);
43 srunner_free(sr);
3944
40 s = vcf_suite ();
41 sr = srunner_create (s);
42 srunner_run_all (sr, CK_NORMAL);
43 number_failed += srunner_ntests_failed (sr);
44 srunner_free (sr);
45
46 return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
45 return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
4746 }