Merge tag 'upstream/3.02'
Upstream version 3.02
Andreas Tille
6 years ago
0 | Preamble | |
1 | ||
2 | The intent of this document is to state the conditions under which a | |
3 | Package may be copied, such that the Copyright Holder maintains some | |
4 | semblance of artistic control over the development of the package, while | |
5 | giving the users of the package the right to use and distribute the | |
6 | Package in a more-or-less customary fashion, plus the right to make | |
7 | reasonable modifications. | |
8 | ||
9 | Definitions: | |
10 | ||
11 | * "Package" refers to the collection of files distributed by the | |
12 | Copyright Holder, and derivatives of that collection of files | |
13 | created through textual modification. | |
14 | * "Standard Version" refers to such a Package if it has not been | |
15 | modified, or has been modified in accordance with the wishes of | |
16 | the Copyright Holder. | |
17 | * "Copyright Holder" is whoever is named in the copyright or | |
18 | copyrights for the package. | |
19 | * "You" is you, if you're thinking about copying or distributing this Package. | |
20 | * "Reasonable copying fee" is whatever you can justify on the basis | |
21 | of media cost, duplication charges, time of people involved, and | |
22 | so on. (You will not be required to justify it to the Copyright | |
23 | Holder, but only to the computing community at large as a market | |
24 | that must bear the fee.) | |
25 | * "Freely Available" means that no fee is charged for the item | |
26 | itself, though there may be fees involved in handling the item. It | |
27 | also means that recipients of the item may redistribute it under | |
28 | the same conditions they received it. | |
29 | ||
30 | 1. You may make and give away verbatim copies of the source form of the | |
31 | Standard Version of this Package without restriction, provided that | |
32 | you duplicate all of the original copyright notices and associated | |
33 | disclaimers. | |
34 | ||
35 | 2. You may apply bug fixes, portability fixes and other modifications | |
36 | derived from the Public Domain or from the Copyright Holder. A | |
37 | Package modified in such a way shall still be considered the Standard | |
38 | Version. | |
39 | ||
40 | 3. You may otherwise modify your copy of this Package in any way, | |
41 | provided that you insert a prominent notice in each changed file | |
42 | stating how and when you changed that file, and provided that you do | |
43 | at least ONE of the following: | |
44 | ||
45 | a) place your modifications in the Public Domain or otherwise make | |
46 | them Freely Available, such as by posting said modifications to | |
47 | Usenet or an equivalent medium, or placing the modifications on a | |
48 | major archive site such as ftp.uu.net, or by allowing the Copyright | |
49 | Holder to include your modifications in the Standard Version of the | |
50 | Package. | |
51 | ||
52 | b) use the modified Package only within your corporation or | |
53 | organization. | |
54 | ||
55 | c) rename any non-standard executables so the names do not conflict | |
56 | with standard executables, which must also be provided, and provide a | |
57 | separate manual page for each non-standard executable that clearly | |
58 | documents how it differs from the Standard Version. | |
59 | ||
60 | d) make other distribution arrangements with the Copyright Holder. | |
61 | ||
62 | 4. You may distribute the programs of this Package in object code or | |
63 | executable form, provided that you do at least ONE of the following: | |
64 | ||
65 | a) distribute a Standard Version of the executables and library | |
66 | files, together with instructions (in the manual page or equivalent) | |
67 | on where to get the Standard Version. | |
68 | ||
69 | b) accompany the distribution with the machine-readable source of | |
70 | the Package with your modifications. | |
71 | ||
72 | c) accompany any non-standard executables with their corresponding | |
73 | Standard Version executables, giving the non-standard executables | |
74 | non-standard names, and clearly documenting the differences in | |
75 | manual pages (or equivalent), together with instructions on where to | |
76 | get the Standard Version. | |
77 | ||
78 | d) make other distribution arrangements with the Copyright Holder. | |
79 | ||
80 | 5. You may charge a reasonable copying fee for any distribution of this | |
81 | Package. You may charge any fee you choose for support of this | |
82 | Package. You may not charge a fee for this Package itself. However, | |
83 | you may distribute this Package in aggregate with other (possibly | |
84 | commercial) programs as part of a larger (possibly commercial) | |
85 | software distribution provided that you do not advertise this | |
86 | Package as a product of your own. | |
87 | ||
88 | 6. The scripts and library files supplied as input to or produced as | |
89 | output from the programs of this Package do not automatically fall | |
90 | under the copyright of this Package, but belong to whomever | |
91 | generated them, and may be sold commercially, and may be aggregated | |
92 | with this Package. | |
93 | ||
94 | 7. C or perl subroutines supplied by you and linked into this Package | |
95 | shall not be considered part of this Package. | |
96 | ||
97 | 8. The name of the Copyright Holder may not be used to endorse or | |
98 | promote products derived from this software without specific prior | |
99 | written permission. | |
100 | ||
101 | 9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED | |
102 | WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF | |
103 | MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. | |
104 | ||
105 | The End⏎ |
0 | # Simple Makefile for Glimmer3 | |
1 | ||
2 | BINDIR = ../bin | |
3 | OBJDIR = ../obj | |
4 | LIBDIR = ../lib | |
5 | ||
6 | VPATH = $(BINDIR):$(OBJDIR):$(LIBDIR) | |
7 | ||
8 | COMMON_SRCS = delcher.cc fasta.cc gene.cc | |
9 | COMMON_OBJS = $(COMMON_SRCS:.cc=.o) | |
10 | ||
11 | GLIMMER_SRCS = anomaly.cc glimmer3.cc long-orfs.cc test.cc | |
12 | GLIMMER_OBJS = $(GLIMMER_SRCS:.cc=.o) | |
13 | GLIMMER_PROGS = $(GLIMMER_SRCS:.cc=) | |
14 | ||
15 | ICM_SRCS = icm.cc build-icm.cc build-fixed.cc score-fixed.cc | |
16 | ICM_OBJS = $(ICM_SRCS:.cc=.o) | |
17 | ICM_PROGS = build-icm build-fixed score-fixed | |
18 | ||
19 | UTIL_SRCS = entropy-profile.cc entropy-score.cc extract.cc multi-extract.cc \ | |
20 | start-codon-distrib.cc uncovered.cc window-acgt.cc | |
21 | UTIL_OBJS = $(UTIL_SRCS:.cc=.o) | |
22 | UTIL_PROGS = $(UTIL_SRCS:.cc=) | |
23 | ||
24 | SOURCES = $(COMMON_SRCS) $(GLIMMER_SRCS) $(ICM_SRCS) $(UTIL_SRCS) | |
25 | OBJECTS = $(COMMON_OBJS) $(GLIMMER_OBJS) $(ICM_OBJS) $(UTIL_OBJS) | |
26 | PROGS = $(COMMON_PROGS) $(GLIMMER_PROGS) $(ICM_PROGS) $(UTIL_PROGS) | |
27 | ||
28 | LIBRARIES = libGLMcommon.a libGLMicm.a | |
29 | ||
30 | AR = ar | |
31 | ARFLAGS = -rsv | |
32 | CPPC = g++ | |
33 | CFLAGS = -g | |
34 | LDFLAGS = -lm -L $(LIBDIR) | |
35 | ||
36 | DEPEND_FILES = *.cc *.hh | |
37 | CLEANABLE_FILES = $(OBJDIR)/*.o *~ | |
38 | ||
39 | ALL = $(LIBRARIES) $(PROGS) | |
40 | ||
41 | .SUFFIXES: .cc | |
42 | ||
43 | .cc.o: | |
44 | $(CPPC) $(CFLAGS) -c $*.cc -o $(OBJDIR)/$*.o | |
45 | ||
46 | all: $(ALL) | |
47 | ||
48 | anomaly: anomaly.o libGLMcommon.a | |
49 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/anomaly.o $(LDFLAGS) -lGLMcommon | |
50 | ||
51 | build-fixed: build-fixed.o libGLMicm.a libGLMcommon.a | |
52 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/build-fixed.o $(LDFLAGS) -lGLMicm -lGLMcommon | |
53 | ||
54 | build-icm: build-icm.o libGLMicm.a libGLMcommon.a | |
55 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/build-icm.o $(LDFLAGS) -lGLMicm -lGLMcommon | |
56 | ||
57 | entropy-profile: entropy-profile.o libGLMcommon.a | |
58 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/entropy-profile.o $(LDFLAGS) -lGLMcommon | |
59 | ||
60 | entropy-score: entropy-score.o libGLMcommon.a | |
61 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/entropy-score.o $(LDFLAGS) -lGLMcommon | |
62 | ||
63 | extract: extract.o libGLMcommon.a | |
64 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/extract.o $(LDFLAGS) -lGLMcommon | |
65 | ||
66 | glimmer3: glimmer3.o libGLMicm.a libGLMcommon.a | |
67 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/glimmer3.o $(LDFLAGS) -lGLMicm -lGLMcommon | |
68 | ||
69 | long-orfs: long-orfs.o libGLMcommon.a | |
70 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/long-orfs.o $(LDFLAGS) -lGLMcommon | |
71 | ||
72 | multi-extract: multi-extract.o libGLMcommon.a | |
73 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/multi-extract.o $(LDFLAGS) -lGLMcommon | |
74 | ||
75 | score-fixed: score-fixed.o libGLMicm.a libGLMcommon.a | |
76 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/score-fixed.o $(LDFLAGS) -lGLMicm -lGLMcommon | |
77 | ||
78 | start-codon-distrib: start-codon-distrib.o libGLMcommon.a | |
79 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/start-codon-distrib.o $(LDFLAGS) -lGLMcommon | |
80 | ||
81 | test: test.o libGLMcommon.a | |
82 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/test.o $(LDFLAGS) -lGLMcommon | |
83 | ||
84 | uncovered: uncovered.o libGLMcommon.a | |
85 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/uncovered.o $(LDFLAGS) -lGLMcommon | |
86 | ||
87 | window-acgt: window-acgt.o libGLMcommon.a | |
88 | $(CPPC) -o $(BINDIR)/$@ $(OBJDIR)/window-acgt.o $(LDFLAGS) -lGLMcommon | |
89 | ||
90 | ||
91 | libGLMcommon.a: $(COMMON_OBJS) | |
92 | $(AR) $(ARFLAGS) $(LIBDIR)/$@ $(OBJDIR)/delcher.o $(OBJDIR)/fasta.o $(OBJDIR)/gene.o | |
93 | ||
94 | libGLMicm.a: icm.o | |
95 | $(AR) $(ARFLAGS) $(LIBDIR)/$@ $(OBJDIR)/icm.o | |
96 | ||
97 | depend: | |
98 | makedepend $(DEPEND_FILES) | |
99 | ||
100 | clean: | |
101 | /bin/rm -f $(CLEANABLE_FILES) | |
102 | ||
103 | # DO NOT DELETE THIS LINE -- make depend depends on it. | |
104 |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: anomaly.cc | |
3 | // | |
4 | // Last Modified: Fri May 19 17:10:05 EDT 2006 | |
5 | // | |
6 | // This program reads the sequence in the first command-line | |
7 | // file and then takes the start and end positions specified in | |
8 | // the second command-line file and checks for anomalous | |
9 | // start/stop codons and frame shifts. | |
10 | ||
11 | ||
12 | #include "anomaly.hh" | |
13 | ||
14 | ||
15 | // Global variables | |
16 | ||
17 | static bool Check_Previous_Stop = false; | |
18 | // Determines whether to check if codon before start coordinate | |
19 | // is a valid stop codon | |
20 | static bool Check_Start_Codon = true; | |
21 | // Determines whether to check if first codon is a valid start | |
22 | static char * Coord_File_Name = NULL; | |
23 | // From the second command-line argument | |
24 | static int Num_Start_Codons; | |
25 | // Number of different start codon patterns | |
26 | static int Num_Stop_Codons; | |
27 | // Number of different stop codon patterns | |
28 | static char * Sequence_File_Name = NULL; | |
29 | // From the first command-line argument | |
30 | static vector <const char *> Start_Codon; | |
31 | // Sequences assumed to be start codons | |
32 | static vector <const char *> Stop_Codon; | |
33 | // Sequences assumed to be stop codons | |
34 | ||
35 | ||
36 | int main | |
37 | (int argc, char * argv []) | |
38 | ||
39 | { | |
40 | FILE * fp; | |
41 | string Data, hdr; | |
42 | char * Buffer, Line [MAX_LINE], Name [MAX_LINE]; | |
43 | char Codon [4] = "tag"; | |
44 | int Direction, Frame_Shift; | |
45 | long int Buffer_Len, Gene_Len; | |
46 | long int i, j, Begin, End, Len, Start, Stop; | |
47 | int problem_ct = 0, ok_ct = 0; | |
48 | ||
49 | try | |
50 | { | |
51 | Parse_Command_Line (argc, argv); | |
52 | ||
53 | Set_Start_And_Stop_Codons (); | |
54 | ||
55 | fp = File_Open (Sequence_File_Name, "r"); | |
56 | ||
57 | Buffer = (char *) Safe_malloc (INIT_SIZE); | |
58 | Buffer_Len = INIT_SIZE; | |
59 | ||
60 | Fasta_Read (fp, Data, hdr); | |
61 | ||
62 | fclose (fp); | |
63 | ||
64 | Len = Data . length (); | |
65 | Data . insert (Data . begin (), 'x'); | |
66 | // Put a dummy character at the front of Data so subscripts | |
67 | // will start at 1 | |
68 | ||
69 | fp = File_Open (Coord_File_Name, "r"); | |
70 | ||
71 | while (fgets (Line, MAX_LINE, fp) != NULL) | |
72 | { | |
73 | bool problem = false; | |
74 | ||
75 | if (sscanf (Line, "%s %ld %ld", Name, & Start, & End) != 3) | |
76 | { | |
77 | printf ("Bad line: %s\n...Skipping\n", Line); | |
78 | continue; | |
79 | } | |
80 | ||
81 | if (Start < End && End - Start <= Len / 2 || Start - End > Len / 2) | |
82 | { | |
83 | Direction = +1; | |
84 | Gene_Len = 1 + End - Start; | |
85 | if (Gene_Len < 0) | |
86 | Gene_Len += Len; | |
87 | ||
88 | if (Buffer_Len < Gene_Len + 1) | |
89 | Buffer = (char *) Safe_realloc (Buffer, 1 + Gene_Len); | |
90 | Buffer_Len = 1 + Gene_Len; | |
91 | for (i = 0; i < Gene_Len; i ++) | |
92 | { | |
93 | if (Start + i <= Len) | |
94 | j = Start + i; | |
95 | else | |
96 | j = Start + i - Len; | |
97 | Buffer [i] = tolower (Data [j]); | |
98 | } | |
99 | Buffer [i] = '\0'; | |
100 | } | |
101 | else | |
102 | { | |
103 | Direction = -1; | |
104 | Gene_Len = 1 + Start - End; | |
105 | if (Gene_Len < 0) | |
106 | Gene_Len += Len; | |
107 | ||
108 | if (Buffer_Len < Gene_Len + 1) | |
109 | Buffer = (char *) Safe_realloc (Buffer, 1 + Gene_Len); | |
110 | Buffer_Len = 1 + Gene_Len; | |
111 | for (i = 0; i < Gene_Len; i ++) | |
112 | { | |
113 | if (Start - i >= 1) | |
114 | j = Start - i; | |
115 | else | |
116 | j = Start - i + Len; | |
117 | Buffer [i] = Complement (tolower (Data [j])); | |
118 | } | |
119 | Buffer [i] = '\0'; | |
120 | } | |
121 | ||
122 | if (Check_Previous_Stop) | |
123 | { | |
124 | if (Direction == +1) | |
125 | { | |
126 | for (i = 3; i > 0; i --) | |
127 | if (Start - i < 1) | |
128 | Codon [i] = tolower (Data [Start - i + Len]); | |
129 | else | |
130 | Codon [i] = tolower (Data [Start - i]); | |
131 | } | |
132 | else | |
133 | { | |
134 | for (i = 3; i > 0; i --) | |
135 | if (Start + i > Len) | |
136 | Codon [i] = Complement (tolower (Data [Start + i - Len])); | |
137 | else | |
138 | Codon [i] = Complement (tolower (Data [Start + i])); | |
139 | } | |
140 | if (! Is_Stop_Codon (Codon)) | |
141 | { | |
142 | printf ("%-10s %8ld %8ld no stop before start\n", | |
143 | Name, Start, End); | |
144 | problem = true; | |
145 | } | |
146 | } | |
147 | if (Check_Start_Codon && ! Is_Start_Codon (Buffer)) | |
148 | { | |
149 | printf ("%-10s has bad start codon = %.3s\n", Name, Buffer); | |
150 | problem = true; | |
151 | } | |
152 | if (! Is_Stop_Codon (Buffer + Gene_Len - 3)) | |
153 | { | |
154 | printf ("%-10s has bad stop codon = %s\n", Name, Buffer + Gene_Len - 3); | |
155 | problem = true; | |
156 | for (j = Gene_Len; j < Len; j += 3) | |
157 | { | |
158 | for (i = 0; i < 3; i ++) | |
159 | if (Direction == +1) | |
160 | { | |
161 | if (Start + i + j > Len) | |
162 | Codon [i] = tolower (Data [Start + i + j - Len]); | |
163 | else | |
164 | Codon [i] = tolower (Data [Start + i + j]); | |
165 | } | |
166 | else | |
167 | { | |
168 | if (Start - i - j < 1) | |
169 | Codon [i] = Complement (tolower (Data [Start - i - j + Len])); | |
170 | else | |
171 | Codon [i] = Complement (tolower (Data [Start - i - j])); | |
172 | } | |
173 | if (Is_Stop_Codon (Codon)) | |
174 | break; | |
175 | } | |
176 | assert (j < Len); | |
177 | printf (" next stop occurs at offset %ld" | |
178 | " Gene_Len = %ld diff = %+ld\n", | |
179 | j, Gene_Len, j - Gene_Len + 3); | |
180 | } | |
181 | ||
182 | Frame_Shift = (Gene_Len % 3); | |
183 | if (Frame_Shift) | |
184 | { | |
185 | printf ("%-10s %8ld %8ld has %+d frame shift\n", | |
186 | Name, Start, End, Frame_Shift); | |
187 | problem = true; | |
188 | ||
189 | for (i = 0; i < Gene_Len - 3; i += 3) | |
190 | if (Is_Stop_Codon (Buffer + i)) | |
191 | break; | |
192 | if (i < Gene_Len - 3) | |
193 | { | |
194 | Stop = Start + Direction * (i - 1); | |
195 | if (Stop < 1) | |
196 | Stop += Len; | |
197 | else if (Stop > Len) | |
198 | Stop -= Len; | |
199 | printf (" Best prefix is %8ld %8ld Len = %ld\n", | |
200 | Start, Stop, i); | |
201 | } | |
202 | else | |
203 | { | |
204 | printf (" No stop found in start frame\n"); | |
205 | continue; | |
206 | } | |
207 | ||
208 | for (i = Gene_Len - 6; i >= 0; i -= 3) | |
209 | if (Is_Stop_Codon (Buffer + i)) | |
210 | break; | |
211 | i += 3; | |
212 | Begin = Start + Direction * i; | |
213 | if (Begin < 1) | |
214 | Begin += Len; | |
215 | else if (Stop > Len) | |
216 | Begin -= Len; | |
217 | printf (" Best suffix is %8ld %8ld Len = %ld\n", | |
218 | Begin, End, Gene_Len - i - 3); | |
219 | ||
220 | } | |
221 | else | |
222 | { | |
223 | for (i = 0; i < Gene_Len - 3; i += 3) | |
224 | if (Is_Stop_Codon (Buffer + i)) | |
225 | { | |
226 | printf ("%-10s has stop codon %.3s at offset %ld" | |
227 | " Gene_Len = %ld diff = %+ld\n", | |
228 | Name, Buffer + i, i, Gene_Len, Gene_Len - 3 - i); | |
229 | problem = true; | |
230 | } | |
231 | } | |
232 | if (problem) | |
233 | problem_ct ++; | |
234 | else | |
235 | ok_ct ++; | |
236 | } | |
237 | ||
238 | fprintf (stderr, " OK orfs = %7d\n", ok_ct); | |
239 | fprintf (stderr, "Problem orfs = %7d\n", problem_ct); | |
240 | } | |
241 | catch (std :: exception & e) | |
242 | { | |
243 | cerr << "** Standard Exception **" << endl; | |
244 | cerr << e << endl; | |
245 | exit (EXIT_FAILURE); | |
246 | } | |
247 | ||
248 | return 0; | |
249 | } | |
250 | ||
251 | ||
252 | ||
253 | static bool Is_Start_Codon | |
254 | (const char * p) | |
255 | ||
256 | // Return true iff the first 3 characters of p match a | |
257 | // string in global Start_Codon . | |
258 | ||
259 | { | |
260 | int i; | |
261 | ||
262 | for (i = 0; i < Num_Start_Codons; i ++) | |
263 | if (strncmp (p, Start_Codon [i], 3) == 0) | |
264 | return true; | |
265 | ||
266 | return false; | |
267 | } | |
268 | ||
269 | ||
270 | ||
271 | static bool Is_Stop_Codon | |
272 | (const char * p) | |
273 | ||
274 | // Return true iff the first 3 characters of p match a | |
275 | // string in global Stop_Codon . | |
276 | ||
277 | { | |
278 | int i; | |
279 | ||
280 | for (i = 0; i < Num_Stop_Codons; i ++) | |
281 | if (strncmp (p, Stop_Codon [i], 3) == 0) | |
282 | return true; | |
283 | ||
284 | return false; | |
285 | } | |
286 | ||
287 | ||
288 | ||
289 | static void Parse_Command_Line | |
290 | (int argc, char * argv []) | |
291 | ||
292 | // Get options and parameters from command line with argc | |
293 | // arguments in argv [0 .. (argc - 1)] . | |
294 | ||
295 | { | |
296 | char * p, * q; | |
297 | bool errflg = false; | |
298 | int ch; | |
299 | ||
300 | optarg = NULL; | |
301 | ||
302 | while (! errflg && ((ch = getopt (argc, argv, "A:stZ:")) != EOF)) | |
303 | switch (ch) | |
304 | { | |
305 | case 'A' : | |
306 | Start_Codon . clear (); | |
307 | for (p = strtok (optarg, ","); p != NULL; p = strtok (NULL, ",")) | |
308 | { | |
309 | q = strdup (p); | |
310 | Make_Lower_Case (q); | |
311 | Start_Codon . push_back (q); | |
312 | } | |
313 | break; | |
314 | ||
315 | case 's' : | |
316 | Check_Start_Codon = FALSE; | |
317 | break; | |
318 | ||
319 | case 't' : | |
320 | Check_Previous_Stop = TRUE; | |
321 | break; | |
322 | ||
323 | case 'Z' : | |
324 | Stop_Codon . clear (); | |
325 | for (p = strtok (optarg, ","); p != NULL; p = strtok (NULL, ",")) | |
326 | { | |
327 | q = strdup (p); | |
328 | Make_Lower_Case (q); | |
329 | Stop_Codon . push_back (q); | |
330 | } | |
331 | break; | |
332 | ||
333 | case '?' : | |
334 | fprintf (stderr, "Unrecognized option -%c\n", optopt); | |
335 | ||
336 | default : | |
337 | errflg = true; | |
338 | } | |
339 | ||
340 | if (errflg) | |
341 | { | |
342 | Usage (); | |
343 | exit (EXIT_FAILURE); | |
344 | } | |
345 | ||
346 | if (optind > argc - 2) | |
347 | { | |
348 | Usage (); | |
349 | exit (EXIT_FAILURE); | |
350 | } | |
351 | ||
352 | Sequence_File_Name = argv [optind ++]; | |
353 | Coord_File_Name = argv [optind ++]; | |
354 | ||
355 | return; | |
356 | } | |
357 | ||
358 | ||
359 | ||
360 | static void Set_Start_And_Stop_Codons | |
361 | (void) | |
362 | ||
363 | // Set globals Start_Codon and Stop_Codon to the sequences | |
364 | // that are allowed to be start and stop codons for genes. | |
365 | ||
366 | { | |
367 | int i, n; | |
368 | ||
369 | if (Start_Codon . size () == 0) | |
370 | { | |
371 | n = sizeof (DEFAULT_START_CODON) / sizeof (char *); | |
372 | for (i = 0; i < n; i ++) | |
373 | Start_Codon . push_back (DEFAULT_START_CODON [i]); | |
374 | } | |
375 | ||
376 | if (Stop_Codon . size () == 0) | |
377 | { | |
378 | n = sizeof (DEFAULT_STOP_CODON) / sizeof (char *); | |
379 | for (i = 0; i < n; i ++) | |
380 | Stop_Codon . push_back (DEFAULT_STOP_CODON [i]); | |
381 | } | |
382 | ||
383 | Num_Start_Codons = Start_Codon . size (); | |
384 | Num_Stop_Codons = Stop_Codon . size (); | |
385 | ||
386 | return; | |
387 | } | |
388 | ||
389 | ||
390 | ||
391 | static void Usage | |
392 | (void) | |
393 | ||
394 | // Print to stderr description of options and command line for | |
395 | // this program. | |
396 | ||
397 | { | |
398 | fprintf (stderr, | |
399 | "USAGE: anomaly [options] <sequence-file> <coord-file>\n" | |
400 | "\n" | |
401 | "Read DNA sequence in <sequence-file> and for each region specified\n" | |
402 | "by the coordinates in <coord-file>, check whether the region\n" | |
403 | "represents a normal gene, i.e., it begins with a start codon, ends\n" | |
404 | "with a stop codon, and has no frame shifts.\n" | |
405 | "Output goes to standard output.\n" | |
406 | "\n" | |
407 | "Options:\n" | |
408 | " -A <codon-list>\n" | |
409 | " Use comma-separated list of codons as start codons\n" | |
410 | " Sample format: -A atg,gtg\n" | |
411 | " Default start codons are atg,gtg,ttg\n" | |
412 | " -s\n" | |
413 | " Omit the check that the first codon is a start codon.\n" | |
414 | " -t\n" | |
415 | " Check whether the codon preceding the start coordinate position\n" | |
416 | " is a stop codon. This is useful if the coordinates represent\n" | |
417 | " the entire region between stop codons.\n" | |
418 | " -Z <codon-list>\n" | |
419 | " Use comma-separated list of codons as stop codons\n" | |
420 | " Sample format: -Z tag,tga,taa\n" | |
421 | "\n"); | |
422 | ||
423 | return; | |
424 | } | |
425 | ||
426 | ||
427 |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: anomaly.hh | |
3 | // | |
4 | // Last Modified: Fri May 19 17:10:05 EDT 2006 | |
5 | // | |
6 | // Declarations for anomaly.cc | |
7 | ||
8 | ||
9 | ||
10 | #ifndef __ANOMALY_HH_INCLUDED | |
11 | #define __ANOMALY_HH_INCLUDED | |
12 | ||
13 | ||
14 | #include "delcher.hh" | |
15 | #include "fasta.hh" | |
16 | #include "gene.hh" | |
17 | ||
18 | ||
19 | static bool Is_Start_Codon | |
20 | (const char * p); | |
21 | static bool Is_Stop_Codon | |
22 | (const char * p); | |
23 | static void Parse_Command_Line | |
24 | (int argc, char * argv []); | |
25 | static void Set_Start_And_Stop_Codons | |
26 | (void); | |
27 | static void Usage | |
28 | (void); | |
29 | ||
30 | ||
31 | #endif |
0 | // Programmer: Arthur L. Delcher | |
1 | // File: build-fixed.cc | |
2 | // Last Updated: Fri Jun 4 16:31:05 EDT 2004 | |
3 | // | |
4 | // This program reads (from stdin ) a set of fixed_length strings in | |
5 | // multi-fasta format. It then builds and outputs to stdout | |
6 | // a fixed-length interpolated context model (ICM) that matches the input. | |
7 | // | |
8 | // Copyright (c) 2006 University of Maryland Center for Bioinformatics | |
9 | // & Computational Biology | |
10 | ||
11 | ||
12 | #include "build-fixed.hh" | |
13 | ||
14 | ||
15 | static FILE * Index_File_fp = NULL; | |
16 | // File containing a list of subscripts of strings to train model | |
17 | static int Model_Depth = DEFAULT_MODEL_DEPTH; | |
18 | // Maximum number of positions to use in Markov context | |
19 | static int Model_Len = DEFAULT_MODEL_LEN; | |
20 | // Width of Markov context and character to be predicted | |
21 | static ICM_Model_t Model_Type = UNKNOWN_TYPE; | |
22 | // Type of model | |
23 | static int * Permutation = NULL; | |
24 | // Describes how to re-order the characters before building the model | |
25 | static int Permutation_Len = 0; | |
26 | // Length of above permutation; must match length of input strings | |
27 | static bool Print_Binary = true; | |
28 | // Print model as a binary file iff this is true; otherwise print | |
29 | // as text file | |
30 | static int Special_Position = -1; | |
31 | // Designated position in model, e.g., for splice junction | |
32 | static vector <char *> Training_Data; | |
33 | // Holds the training strings | |
34 | ||
35 | ||
36 | //**ALD Gets rid of make undefined reference error | |
37 | int Unused = Filter ('a'); | |
38 | ||
39 | ||
40 | ||
41 | int main | |
42 | (int argc, char * argv []) | |
43 | { | |
44 | int string_ct; | |
45 | // Number of strings read from training file | |
46 | int i; | |
47 | ||
48 | ||
49 | Parse_Command_Line (argc, argv); | |
50 | ||
51 | Read_Training_Data (stdin); | |
52 | string_ct = Training_Data . size (); | |
53 | ||
54 | if (string_ct <= 0) | |
55 | { | |
56 | fprintf (stderr, "ERROR: No strings read to train model\n"); | |
57 | exit (EXIT_FAILURE); | |
58 | } | |
59 | ||
60 | if (Index_File_fp != NULL) | |
61 | { | |
62 | // Read the file of subscripts, make a list of the strings | |
63 | // they refer to and use that for training. | |
64 | ||
65 | vector <char *> list; | |
66 | int sub; | |
67 | ||
68 | while (fscanf (Index_File_fp, "%d", & sub) == 1) | |
69 | list . push_back (Training_Data [sub]); | |
70 | ||
71 | Training_Data = list; | |
72 | string_ct = Training_Data . size (); | |
73 | } | |
74 | ||
75 | Model_Len = strlen (Training_Data [0]); | |
76 | for (i = 1; i < string_ct; i ++) | |
77 | if (int (strlen (Training_Data [i])) != Model_Len) | |
78 | { | |
79 | fprintf (stderr, "ERROR: String #%d has length = %d\n", | |
80 | i, int (strlen (Training_Data [i]))); | |
81 | fprintf (stderr, " different from string #0 length = %d\n", | |
82 | Model_Len); | |
83 | exit (EXIT_FAILURE); | |
84 | } | |
85 | if (Permutation != NULL && Permutation_Len != Model_Len) | |
86 | { | |
87 | fprintf (stderr, "ERROR: Permutation len = %d string_len = %d\n", | |
88 | Permutation_Len, Model_Len); | |
89 | exit (EXIT_FAILURE); | |
90 | } | |
91 | ||
92 | // create the model | |
93 | if (Special_Position > Model_Len) | |
94 | { | |
95 | fprintf (stderr, "ERROR: Bad special position = %d\n", | |
96 | Special_Position); | |
97 | } | |
98 | Fixed_Length_ICM_Training_t model (Model_Len, Model_Depth, Special_Position, | |
99 | Permutation, Model_Type); | |
100 | ||
101 | model . Train_Model (Training_Data); | |
102 | ||
103 | model . Output (stdout, Print_Binary); | |
104 | ||
105 | return 0; | |
106 | } | |
107 | ||
108 | ||
109 | ||
110 | static void Parse_Command_Line | |
111 | (int argc, char * argv []) | |
112 | ||
113 | // Get options and parameters from command line with argc | |
114 | // arguments in argv [0 .. (argc - 1)] . | |
115 | ||
116 | { | |
117 | char * p; | |
118 | int ch, errflg = FALSE; | |
119 | ||
120 | optarg = NULL; | |
121 | ||
122 | while (! errflg | |
123 | && ((ch = getopt (argc, argv, "bd:hi:p:s:tv:")) != EOF)) | |
124 | switch (ch) | |
125 | { | |
126 | case 'b' : | |
127 | Print_Binary = true; | |
128 | break; | |
129 | ||
130 | case 'd' : | |
131 | Model_Depth = int (strtol (optarg, & p, 10)); | |
132 | if (p == optarg || Model_Depth <= 0) | |
133 | { | |
134 | fprintf (stderr, "Bad model depth value \"%s\"\n", | |
135 | optarg); | |
136 | errflg = TRUE; | |
137 | } | |
138 | break; | |
139 | ||
140 | case 'h' : | |
141 | errflg = TRUE; | |
142 | break; | |
143 | ||
144 | case 'i' : | |
145 | Index_File_fp = File_Open (optarg, "r"); | |
146 | break; | |
147 | ||
148 | case 'p' : | |
149 | { | |
150 | vector <int> perm; | |
151 | int i, j, n; | |
152 | ||
153 | for (p = strtok (optarg, ", "); p != NULL; p = strtok (NULL, ", ")) | |
154 | perm . push_back (atoi (p)); | |
155 | n = perm . size (); | |
156 | Permutation = (int *) Safe_calloc (n, sizeof (int), __FILE__, | |
157 | __LINE__); | |
158 | for (i = 0; i < n; i ++) | |
159 | if (Permutation [perm [i]] == 0) | |
160 | Permutation [perm [i]] = 1; | |
161 | else | |
162 | { | |
163 | fprintf (stderr, "ERROR: Illegal permutation\n"); | |
164 | for (j = 0; j <= i; j ++) | |
165 | fprintf (stderr, " %d", perm [j]); | |
166 | fprintf (stderr, " <-- duplicate\n"); | |
167 | exit (EXIT_FAILURE); | |
168 | } | |
169 | for (i = 0; i < n; i ++) | |
170 | if (Permutation [i] == 0) | |
171 | { | |
172 | fprintf (stderr, "ERROR: Illegal permutation--missing %d\n", i); | |
173 | exit (EXIT_FAILURE); | |
174 | } | |
175 | for (i = 0; i < n; i ++) | |
176 | Permutation [i] = perm [i]; | |
177 | Permutation_Len = n; | |
178 | } | |
179 | break; | |
180 | ||
181 | case 's' : | |
182 | Special_Position = strtol (optarg, NULL, 10); | |
183 | break; | |
184 | ||
185 | #if 0 // ALD removed on 22 May 2006 | |
186 | case 'T' : | |
187 | Model_Type = ICM_Model_t (strtol (optarg, NULL, 10)); | |
188 | break; | |
189 | #endif | |
190 | ||
191 | case 't' : | |
192 | Print_Binary = false; | |
193 | break; | |
194 | ||
195 | case 'v' : | |
196 | Verbose = int (strtol (optarg, & p, 10)); | |
197 | if (p == optarg) | |
198 | { | |
199 | fprintf (stderr, "Bad verbose value \"%s\"\n", | |
200 | optarg); | |
201 | errflg = TRUE; | |
202 | } | |
203 | break; | |
204 | ||
205 | case '?' : | |
206 | fprintf (stderr, "Unrecognized option -%c\n", optopt); | |
207 | ||
208 | default : | |
209 | errflg = TRUE; | |
210 | } | |
211 | ||
212 | if (errflg || optind != argc - 0) | |
213 | { | |
214 | Usage (argv [0]); | |
215 | exit (EXIT_FAILURE); | |
216 | } | |
217 | ||
218 | return; | |
219 | } | |
220 | ||
221 | ||
222 | ||
223 | static int Read_String | |
224 | (FILE * fp, char * & s, long int & s_size, char * & tag, long int & tag_size) | |
225 | ||
226 | // Read next string from fp (assuming FASTA format) into s [0 .. ] | |
227 | // which has s_size characters. Allocate extra memory if needed | |
228 | // and adjust s_size accordingly. Return TRUE if successful, FALSE | |
229 | // otherwise (e.g., EOF). Put FASTA header line into tag [0 .. ] | |
230 | // (and adjust tag_size if needed). | |
231 | ||
232 | { | |
233 | int ch, ct; | |
234 | ||
235 | while ((ch = fgetc (fp)) != EOF && ch != '>') | |
236 | ; | |
237 | ||
238 | if (ch == EOF) | |
239 | return FALSE; | |
240 | ||
241 | ct = 0; | |
242 | while ((ch = fgetc (fp)) != EOF && ch != '\n' && isspace (ch)) | |
243 | ; | |
244 | if (ch == EOF) | |
245 | return FALSE; | |
246 | if (ch != '\n' && ! isspace (ch)) | |
247 | ungetc (ch, fp); | |
248 | while ((ch = fgetc (fp)) != EOF && ch != '\n') | |
249 | { | |
250 | if (ct >= tag_size - 1) | |
251 | { | |
252 | tag_size += INCR_SIZE; | |
253 | tag = (char *) Safe_realloc (tag, tag_size); | |
254 | } | |
255 | tag [ct ++] = char (ch); | |
256 | } | |
257 | tag [ct ++] = '\0'; | |
258 | ||
259 | ct = 0; | |
260 | while ((ch = fgetc (fp)) != EOF && ch != '>') | |
261 | { | |
262 | if (isspace (ch)) | |
263 | continue; | |
264 | ||
265 | if (ct >= s_size - 1) | |
266 | { | |
267 | s_size += INCR_SIZE; | |
268 | s = (char *) Safe_realloc (s, s_size); | |
269 | } | |
270 | s [ct ++] = char (ch); | |
271 | } | |
272 | s [ct ++] = '\0'; | |
273 | ||
274 | if (ch == '>') | |
275 | ungetc (ch, fp); | |
276 | ||
277 | return TRUE; | |
278 | } | |
279 | ||
280 | ||
281 | ||
282 | static void Read_Training_Data | |
283 | (FILE * fp) | |
284 | ||
285 | // Read in training strings from fp . Format is multifasta, i.e., for | |
286 | // each string a header line (starting with '>') followed by arbitrarily | |
287 | // many data lines. Save strings in global Training_Data | |
288 | ||
289 | { | |
290 | char * string = NULL, * tag = NULL; | |
291 | char * p; | |
292 | long int string_size = 0, tag_size = 0; | |
293 | ||
294 | while (Read_String (fp, string, string_size, tag, tag_size)) | |
295 | { | |
296 | p = strdup (string); | |
297 | Training_Data . push_back (p); | |
298 | } | |
299 | ||
300 | return; | |
301 | } | |
302 | ||
303 | ||
304 | ||
305 | static void Usage | |
306 | (char * command) | |
307 | ||
308 | // Print to stderr description of options and command line for | |
309 | // this program. command is the command that was used to | |
310 | // invoke it. | |
311 | ||
312 | { | |
313 | fprintf (stderr, | |
314 | "USAGE: %s [<options>] < <input-file> > <output-file>\n" | |
315 | "\n" | |
316 | "Read sequences from stdin and output to stdout \n" | |
317 | "the fixed-length interpolated context model built from them\n" | |
318 | "\n" | |
319 | "Options:\n" | |
320 | " -d <num> Set depth of model to <num>\n" | |
321 | " -h Print this message\n" | |
322 | " -i <fn> Train using strings specified by subscripts in file\n" | |
323 | " named <fn>\n" | |
324 | " -p n1,n2,...,nk Permutation describing re-ordering of\n" | |
325 | " character positions of input string to build model\n" | |
326 | " -s <num> Specify special position in model\n" | |
327 | " -t Output model as text (for debugging only)\n" | |
328 | " -v <num> Set verbose level; higher is more diagnostic printouts\n" | |
329 | "\n", | |
330 | command); | |
331 | ||
332 | return; | |
333 | } | |
334 | ||
335 | ||
336 |
0 | // Programmer: Arthur L. Delcher | |
1 | // File: build-fixed.hh | |
2 | // Last Updated: Fri Jun 4 16:33:12 EDT 2004 | |
3 | // | |
4 | // Declarations for build-fixed.cc | |
5 | ||
6 | ||
7 | #ifndef _BUILD_FIXED_HH | |
8 | #define _BUILD_FIXED_HH | |
9 | ||
10 | ||
11 | #include "icm.hh" | |
12 | ||
13 | ||
14 | static void Parse_Command_Line | |
15 | (int argc, char * argv []); | |
16 | static int Read_String | |
17 | (FILE * fp, char * & s, long int & s_size, char * & tag, | |
18 | long int & tag_size); | |
19 | static void Read_Training_Data | |
20 | (FILE * fp); | |
21 | static void Usage | |
22 | (char * command); | |
23 | ||
24 | ||
25 | ||
26 | #endif |
0 | // Programmers: Arthur L. Delcher | |
1 | // Doug Harmon | |
2 | // File: build-icm.cc | |
3 | // Last Updated: Mon Jun 12 15:34:00 EDT 2006 | |
4 | // | |
5 | // This program reads (from stdin ) a set of strings in | |
6 | // multi-fasta format. It then builds and outputs to stdout | |
7 | // an interpolated context model (ICM) that matches the input. | |
8 | // | |
9 | // Copyright (c) 2006 University of Maryland Center for Bioinformatics | |
10 | // & Computational Biology | |
11 | ||
12 | ||
13 | #include "build-icm.hh" | |
14 | ||
15 | ||
16 | static int Genbank_Xlate_Code = 0; | |
17 | // Holds the Genbank translation table number that determines | |
18 | // stop codons and codon translation. | |
19 | static int Model_Depth = DEFAULT_MODEL_DEPTH; | |
20 | // Maximum number of positions to use in Markov context | |
21 | static int Model_Len = DEFAULT_MODEL_LEN; | |
22 | // Width of Markov context and character to be predicted | |
23 | static int Model_Periodicity = DEFAULT_PERIODICITY; | |
24 | // Number of different models to cycle through | |
25 | static char * Output_Filename = NULL; | |
26 | // Name of file to which the ICM is written | |
27 | static bool Print_Binary = true; | |
28 | // Print model as a binary file iff this is true; otherwise print | |
29 | // as text file | |
30 | static bool Reverse_Strings = false; | |
31 | // If true, then use the reverse (not the reverse complement) | |
32 | // of input strings to train the model. | |
33 | static bool Skip_In_Frame_Stop_Strings = false; | |
34 | // If true then ignore any input string with an in-frame stop codon | |
35 | static vector <const char *> Stop_Codon; | |
36 | // Sequences assumed to be stop codons | |
37 | static vector <char *> Training_Data; | |
38 | // Holds the training strings | |
39 | ||
40 | ||
41 | ||
42 | ||
43 | //**ALD Gets rid of make undefined reference error | |
44 | int Unused = Filter ('a'); | |
45 | ||
46 | ||
47 | ||
48 | int main | |
49 | (int argc, char **argv) | |
50 | { | |
51 | FILE * output_fp; | |
52 | int string_ct; | |
53 | // Number of strings read from training file | |
54 | ||
55 | ||
56 | Parse_Command_Line (argc, argv); | |
57 | ||
58 | if (strcmp (Output_Filename, "-") == 0) | |
59 | output_fp = stdout; | |
60 | else if (Print_Binary) | |
61 | output_fp = File_Open (Output_Filename, "wb"); | |
62 | else | |
63 | output_fp = File_Open (Output_Filename, "w"); | |
64 | ||
65 | // create the model | |
66 | ICM_Training_t model (Model_Len, Model_Depth, Model_Periodicity); | |
67 | ||
68 | Read_Training_Data (stdin); | |
69 | string_ct = Training_Data . size (); | |
70 | if (string_ct == 0) | |
71 | { | |
72 | fprintf (stderr, "ERROR: Cannot create model--no input data\n"); | |
73 | fclose (output_fp); | |
74 | exit (EXIT_FAILURE); | |
75 | } | |
76 | ||
77 | if (Skip_In_Frame_Stop_Strings) | |
78 | { | |
79 | bool skip; | |
80 | int i, j, k, s, len, ct = 0; | |
81 | ||
82 | Set_Stop_Codons (); | |
83 | ||
84 | int num_stops = Stop_Codon . size (); | |
85 | ||
86 | for (i = k = 0; i < string_ct; i ++) | |
87 | { | |
88 | skip = false; | |
89 | ||
90 | // Assuming data has been converted to lower-case if needed | |
91 | ||
92 | len = strlen (Training_Data [i]); | |
93 | ||
94 | for (j = 0; j < len - 2 && ! skip; j += 3) | |
95 | for (s = 0; s < num_stops && ! skip; s ++) | |
96 | skip = (strncmp (Training_Data [i] + j, Stop_Codon [s], 3) == 0); | |
97 | ||
98 | if (skip) | |
99 | ct ++; | |
100 | else | |
101 | Training_Data [k ++] = Training_Data [i]; | |
102 | } | |
103 | ||
104 | fprintf (stderr, | |
105 | "Skipped %d strings with in-frame stops of %d total strings\n", | |
106 | ct, string_ct); | |
107 | Training_Data . resize (k); | |
108 | } | |
109 | ||
110 | if (Reverse_Strings) | |
111 | { | |
112 | int i, n; | |
113 | ||
114 | n = Training_Data . size (); | |
115 | for (i = 0; i < n; i ++) | |
116 | Reverse_String (Training_Data [i]); | |
117 | } | |
118 | ||
119 | model . Train_Model (Training_Data); | |
120 | ||
121 | model . Output (output_fp, Print_Binary); | |
122 | ||
123 | fclose (output_fp); | |
124 | ||
125 | return 0; | |
126 | } | |
127 | ||
128 | ||
129 | ||
130 | static void Parse_Command_Line | |
131 | (int argc, char * argv []) | |
132 | ||
133 | // Get options and parameters from command line with argc | |
134 | // arguments in argv [0 .. (argc - 1)] . | |
135 | ||
136 | { | |
137 | char * p, * q; | |
138 | bool errflg = false; | |
139 | int ch; | |
140 | ||
141 | optarg = NULL; | |
142 | ||
143 | #if ALLOW_LONG_OPTIONS | |
144 | int option_index = 0; | |
145 | static struct option long_options [] = { | |
146 | {"depth", 1, 0, 'd'}, | |
147 | {"no_stops", 0, 0, 'F'}, | |
148 | {"help", 0, 0, 'h'}, | |
149 | {"period", 1, 0, 'p'}, | |
150 | {"reverse", 0, 0, 'r'}, | |
151 | {"text", 0, 0, 't'}, | |
152 | {"verbose", 1, 0, 'v'}, | |
153 | {"width", 1, 0, 'w'}, | |
154 | {"trans_table", 1, 0, 'z'}, | |
155 | {"stop_codons", 1, 0, 'Z'}, | |
156 | {0, 0, 0, 0} | |
157 | }; | |
158 | ||
159 | while (! errflg && ((ch = getopt_long (argc, argv, | |
160 | "d:Fhp:rtv:w:z:Z:", | |
161 | long_options, & option_index)) != EOF)) | |
162 | #else | |
163 | while (! errflg && ((ch = getopt (argc, argv, | |
164 | "d:Fhp:rtv:w:z:Z:")) != EOF)) | |
165 | #endif | |
166 | ||
167 | switch (ch) | |
168 | { | |
169 | case 'd' : | |
170 | Model_Depth = int (strtol (optarg, & p, 10)); | |
171 | if (p == optarg || Model_Depth <= 0) | |
172 | { | |
173 | fprintf (stderr, "Bad model depth value \"%s\"\n", | |
174 | optarg); | |
175 | errflg = TRUE; | |
176 | } | |
177 | break; | |
178 | ||
179 | case 'F' : | |
180 | Skip_In_Frame_Stop_Strings = true; | |
181 | break; | |
182 | ||
183 | case 'h' : | |
184 | errflg = TRUE; | |
185 | break; | |
186 | ||
187 | case 'p' : | |
188 | Model_Periodicity = int (strtol (optarg, & p, 10)); | |
189 | if (p == optarg || Model_Periodicity <= 0) | |
190 | { | |
191 | fprintf (stderr, "Bad model period value \"%s\"\n", | |
192 | optarg); | |
193 | errflg = TRUE; | |
194 | } | |
195 | break; | |
196 | ||
197 | case 'r' : | |
198 | Reverse_Strings = true; | |
199 | break; | |
200 | ||
201 | case 't' : | |
202 | Print_Binary = false; | |
203 | break; | |
204 | ||
205 | case 'v' : | |
206 | Verbose = int (strtol (optarg, & p, 10)); | |
207 | if (p == optarg) | |
208 | { | |
209 | fprintf (stderr, "Bad verbose value \"%s\"\n", | |
210 | optarg); | |
211 | errflg = TRUE; | |
212 | } | |
213 | break; | |
214 | ||
215 | case 'w' : | |
216 | Model_Len = int (strtol (optarg, & p, 10)); | |
217 | if (p == optarg || Model_Len <= 0) | |
218 | { | |
219 | fprintf (stderr, "Bad model length value \"%s\"\n", | |
220 | optarg); | |
221 | errflg = TRUE; | |
222 | } | |
223 | break; | |
224 | ||
225 | case 'z' : | |
226 | Genbank_Xlate_Code = strtol (optarg, & p, 10); | |
227 | Set_Stop_Codons_By_Code (Stop_Codon, Genbank_Xlate_Code, errflg); | |
228 | break; | |
229 | ||
230 | case 'Z' : | |
231 | Stop_Codon . clear (); | |
232 | for (p = strtok (optarg, ","); p != NULL; p = strtok (NULL, ",")) | |
233 | { | |
234 | q = strdup (p); | |
235 | Make_Lower_Case (q); | |
236 | Stop_Codon . push_back (q); | |
237 | } | |
238 | break; | |
239 | ||
240 | case '?' : | |
241 | fprintf (stderr, "Unrecognized option -%c\n", optopt); | |
242 | ||
243 | default : | |
244 | errflg = TRUE; | |
245 | } | |
246 | ||
247 | if (errflg || optind != argc - 1) | |
248 | { | |
249 | Usage (argv [0]); | |
250 | exit (EXIT_FAILURE); | |
251 | } | |
252 | ||
253 | Output_Filename = argv [optind ++]; | |
254 | ||
255 | return; | |
256 | } | |
257 | ||
258 | ||
259 | ||
260 | static int Read_String | |
261 | (FILE * fp, char * & s, long int & s_size, char * & tag, long int & tag_size) | |
262 | ||
263 | // Read next string from fp (assuming FASTA format) into s [0 .. ] | |
264 | // which has s_size characters. Allocate extra memory if needed | |
265 | // and adjust s_size accordingly. Return TRUE if successful, FALSE | |
266 | // otherwise (e.g., EOF). Put FASTA header line into tag [0 .. ] | |
267 | // (and adjust tag_size if needed). | |
268 | ||
269 | { | |
270 | int ch, ct; | |
271 | ||
272 | while ((ch = fgetc (fp)) != EOF && ch != '>') | |
273 | ; | |
274 | ||
275 | if (ch == EOF) | |
276 | return FALSE; | |
277 | ||
278 | ct = 0; | |
279 | while ((ch = fgetc (fp)) != EOF && ch != '\n' && isspace (ch)) | |
280 | ; | |
281 | if (ch == EOF) | |
282 | return FALSE; | |
283 | if (ch != '\n' && ! isspace (ch)) | |
284 | ungetc (ch, fp); | |
285 | while ((ch = fgetc (fp)) != EOF && ch != '\n') | |
286 | { | |
287 | if (ct >= tag_size - 1) | |
288 | { | |
289 | tag_size += INCR_SIZE; | |
290 | tag = (char *) Safe_realloc (tag, tag_size); | |
291 | } | |
292 | tag [ct ++] = char (ch); | |
293 | } | |
294 | tag [ct ++] = '\0'; | |
295 | ||
296 | ct = 0; | |
297 | while ((ch = fgetc (fp)) != EOF && ch != '>') | |
298 | { | |
299 | if (isspace (ch)) | |
300 | continue; | |
301 | ||
302 | if (ct >= s_size - 1) | |
303 | { | |
304 | s_size += INCR_SIZE; | |
305 | s = (char *) Safe_realloc (s, s_size); | |
306 | } | |
307 | s [ct ++] = char (ch); | |
308 | } | |
309 | s [ct ++] = '\0'; | |
310 | ||
311 | if (ch == '>') | |
312 | ungetc (ch, fp); | |
313 | ||
314 | return TRUE; | |
315 | } | |
316 | ||
317 | ||
318 | ||
319 | static void Read_Training_Data | |
320 | (FILE * fp) | |
321 | ||
322 | // Read in training strings from fp . Format is multifasta, i.e., for | |
323 | // each string a header line (starting with '>') followed by arbitrarily | |
324 | // many data lines. Save strings in global Training_Data | |
325 | ||
326 | { | |
327 | char * string = NULL, * tag = NULL; | |
328 | char * p; | |
329 | long int string_size = 0, tag_size = 0; | |
330 | ||
331 | while (Read_String (fp, string, string_size, tag, tag_size)) | |
332 | { | |
333 | p = strdup (string); | |
334 | Make_Lower_Case (p); | |
335 | Training_Data . push_back (p); | |
336 | } | |
337 | ||
338 | return; | |
339 | } | |
340 | ||
341 | ||
342 | ||
343 | static void Set_Stop_Codons | |
344 | (void) | |
345 | ||
346 | // Set global Stop_Codon to the sequences | |
347 | // that are allowed to be stop codons for genes, if | |
348 | // not already set. | |
349 | ||
350 | { | |
351 | int i, n; | |
352 | ||
353 | if (Stop_Codon . size () == 0) | |
354 | { | |
355 | n = sizeof (DEFAULT_STOP_CODON) / sizeof (char *); | |
356 | for (i = 0; i < n; i ++) | |
357 | Stop_Codon . push_back (DEFAULT_STOP_CODON [i]); | |
358 | } | |
359 | ||
360 | return; | |
361 | } | |
362 | ||
363 | ||
364 | ||
365 | static void Usage | |
366 | (char * command) | |
367 | ||
368 | // Print to stderr description of options and command line for | |
369 | // this program. command is the command that was used to | |
370 | // invoke it. | |
371 | ||
372 | { | |
373 | fprintf (stderr, | |
374 | "USAGE: build-icm [options] output_file < input-file\n" | |
375 | "\n" | |
376 | "Read sequences from standard input and output to output-file\n" | |
377 | "the interpolated context model built from them.\n" | |
378 | "Input also can be piped into the program, e.g.,\n" | |
379 | " cat abc.in | build-icm xyz.icm\n" | |
380 | "If <output-file> is \"-\", then output goes to standard output\n" | |
381 | "\n" | |
382 | "Options:\n" | |
383 | " -d <num>\n" | |
384 | " Set depth of model to <num>\n" | |
385 | " -F\n" | |
386 | " Ignore input strings with in-frame stop codons\n" | |
387 | " -h\n" | |
388 | " Print this message\n" | |
389 | " -p <num>\n" | |
390 | " Set period of model to <num>\n" | |
391 | " -r\n" | |
392 | " Use the reverse of input strings to build the model\n" | |
393 | " -t\n" | |
394 | " Output model as text (for debugging only)\n" | |
395 | " -v <num>\n" | |
396 | " Set verbose level; higher is more diagnostic printouts\n" | |
397 | " -w <num>\n" | |
398 | " Set length of model window to <num>\n" | |
399 | "\n"); | |
400 | ||
401 | return; | |
402 | } | |
403 | ||
404 | ||
405 |
0 | // Programmers: Arthur L. Delcher | |
1 | // Doug Harmon | |
2 | // File: build-icm.hh | |
3 | // Last Updated: Mon May 3 08:07:22 EDT 2004 | |
4 | // | |
5 | // Declarations for build-icm.cc | |
6 | ||
7 | ||
8 | #ifndef _BUILD_ICM_HH | |
9 | #define _BUILD_ICM_HH | |
10 | ||
11 | ||
12 | #include "icm.hh" | |
13 | ||
14 | ||
15 | // The 16 possible base pair sequences | |
16 | const int AA_PAIR = 0; | |
17 | const int AC_PAIR = 1; | |
18 | const int AG_PAIR = 2; | |
19 | const int AT_PAIR = 3; | |
20 | const int CA_PAIR = 4; | |
21 | const int CC_PAIR = 5; | |
22 | const int CG_PAIR = 6; | |
23 | const int CT_PAIR = 7; | |
24 | const int GA_PAIR = 8; | |
25 | const int GC_PAIR = 9; | |
26 | const int GG_PAIR = 10; | |
27 | const int GT_PAIR = 11; | |
28 | const int TA_PAIR = 12; | |
29 | const int TC_PAIR = 13; | |
30 | const int TG_PAIR = 14; | |
31 | const int TT_PAIR = 15; | |
32 | ||
33 | ||
34 | // The 4 possible bases | |
35 | const int A_BASE = 0; | |
36 | const int C_BASE = 1; | |
37 | const int G_BASE = 2; | |
38 | const int T_BASE = 3; | |
39 | ||
40 | ||
41 | // Function prototypes | |
42 | ||
43 | static void Parse_Command_Line | |
44 | (int argc, char * argv []); | |
45 | static int Read_String | |
46 | (FILE * fp, char * & s, long int & s_size, char * & tag, long int & tag_size); | |
47 | static void Read_Training_Data | |
48 | (FILE * fp); | |
49 | static void Set_Stop_Codons | |
50 | (void); | |
51 | static void Usage | |
52 | (char * command); | |
53 | ||
54 | #endif |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: delcher.cc | |
3 | // | |
4 | // Last Modified: 23 October 2003 | |
5 | // | |
6 | // Common generic routines. | |
7 | ||
8 | ||
9 | #include "delcher.hh" | |
10 | ||
11 | ||
12 | const int COMMATIZE_BUFF_LEN = 50; | |
13 | // Length of buffer for creating string with commas | |
14 | ||
15 | ||
16 | char Clean_Exit_Msg_Line [MAX_ERROR_MSG_LEN]; | |
17 | // String to write error messages to before exiting | |
18 | int Global_Debug_Flag = 0; | |
19 | // Used to set different debugging levels | |
20 | int Verbose = 0; | |
21 | // Flag to determine level of debugging output | |
22 | ||
23 | ||
24 | const char * Commatize | |
25 | (long int n) | |
26 | ||
27 | // Return a string representing the value of n with commas | |
28 | // every three digits. | |
29 | ||
30 | { | |
31 | static char buff [COMMATIZE_BUFF_LEN]; | |
32 | bool is_negative = false; | |
33 | int i, ct; | |
34 | ||
35 | buff [COMMATIZE_BUFF_LEN - 1] = '\0'; | |
36 | ||
37 | if (n == 0) | |
38 | { | |
39 | buff [COMMATIZE_BUFF_LEN - 2] = '0'; | |
40 | return buff + 48; | |
41 | } | |
42 | ||
43 | i = COMMATIZE_BUFF_LEN - 2; | |
44 | if (n < 0) | |
45 | { | |
46 | is_negative = true; | |
47 | n *= -1; | |
48 | } | |
49 | ||
50 | for (ct = 0; n > 0; ct ++) | |
51 | { | |
52 | if (ct == 3) | |
53 | { | |
54 | buff [i --] = ','; | |
55 | ct = 0; | |
56 | } | |
57 | buff [i --] = char ('0' + n % 10); | |
58 | n /= 10; | |
59 | } | |
60 | ||
61 | if (is_negative) | |
62 | buff [i --] = '-'; | |
63 | ||
64 | return buff + i + 1; | |
65 | } | |
66 | ||
67 | ||
68 | ||
69 | void Clean_Exit | |
70 | (const char * msg, const char * src_fname, size_t line_num) | |
71 | ||
72 | // Write string msg to stderr and also a line indicating | |
73 | // the error happen in source file src_fname at line line_num | |
74 | // if they are not NULL and 0 respectively. | |
75 | // Then exit with an error condition. | |
76 | ||
77 | { | |
78 | fprintf (stderr, "%s\n", msg); | |
79 | if (src_fname != NULL) | |
80 | fprintf (stderr, " in file %s", src_fname); | |
81 | if (line_num != 0) | |
82 | fprintf (stderr, " at line %lu", (long unsigned) (line_num)); | |
83 | fprintf (stderr, " errno = %d\n", errno); | |
84 | ||
85 | exit (EXIT_FAILURE); | |
86 | } | |
87 | ||
88 | ||
89 | ||
90 | FILE * File_Open | |
91 | (const string & fname, const string & mode, const char * src_fname, | |
92 | size_t line_num) | |
93 | ||
94 | // Open fname in mode and return a pointer to its control | |
95 | // block. If fail, print a message and exit, assuming the call came from | |
96 | // source file src_fname at line line_num . | |
97 | ||
98 | { | |
99 | FILE * fp; | |
100 | ||
101 | fp = fopen (fname . c_str (), mode . c_str ()); | |
102 | if (fp == NULL) | |
103 | { | |
104 | sprintf (Clean_Exit_Msg_Line, | |
105 | "ERROR: Could not open file %s", fname . c_str ()); | |
106 | Clean_Exit (Clean_Exit_Msg_Line, src_fname, line_num); | |
107 | } | |
108 | ||
109 | return fp; | |
110 | } | |
111 | ||
112 | ||
113 | ||
114 | char First_Non_Blank | |
115 | (const char * s) | |
116 | ||
117 | // Return the first non-white-space character in s . | |
118 | // If none, return ' ' . | |
119 | ||
120 | { | |
121 | const char * p; | |
122 | ||
123 | for (p = s; * p != '\0'; p ++) | |
124 | if (! isspace (* p)) | |
125 | return * p; | |
126 | ||
127 | return ' '; | |
128 | } | |
129 | ||
130 | ||
131 | ||
132 | void Make_Lower_Case | |
133 | (char * s) | |
134 | ||
135 | // Convert all letters in s to lower-case. | |
136 | ||
137 | { | |
138 | char * p; | |
139 | ||
140 | for (p = s; * p != '\0'; p ++) | |
141 | * p = tolower (* p); | |
142 | } | |
143 | ||
144 | ||
145 | ||
146 | void Make_Upper_Case | |
147 | (char * s) | |
148 | ||
149 | // Convert all letters in s to upper-case. | |
150 | ||
151 | { | |
152 | char * p; | |
153 | ||
154 | for (p = s; * p != '\0'; p ++) | |
155 | * p = toupper (* p); | |
156 | } | |
157 | ||
158 | ||
159 | ||
160 | int Int_Power | |
161 | (int a, int b) | |
162 | ||
163 | // Return a raised to the power b . Both are assumed | |
164 | // to be non-negative; | |
165 | ||
166 | { | |
167 | int result = 1; | |
168 | int p = a; | |
169 | ||
170 | while (b > 0) | |
171 | { | |
172 | if (b & 1) | |
173 | result *= p; | |
174 | p = p * p; | |
175 | b >>= 1; | |
176 | } | |
177 | ||
178 | return result; | |
179 | } | |
180 | ||
181 | ||
182 | ||
183 | const char * Num_Or_Max | |
184 | (int n, int mx) | |
185 | ||
186 | // Return a string representation of n or else "max" | |
187 | // if n >= mx . | |
188 | ||
189 | { | |
190 | static char buff [20]; | |
191 | ||
192 | if (n >= mx) | |
193 | return "max"; | |
194 | ||
195 | sprintf (buff, "%d", n); | |
196 | return buff; | |
197 | } | |
198 | ||
199 | ||
200 | ||
201 | double Percent | |
202 | (double a, double b) | |
203 | ||
204 | // Return a / b as a percentage. Return 0.0 if b = 0.0 . | |
205 | ||
206 | { | |
207 | if (b == 0.0) | |
208 | return 0.0; | |
209 | ||
210 | return 100.0 * (a / b); | |
211 | } | |
212 | ||
213 | ||
214 | ||
215 | const char * Printable | |
216 | (bool b) | |
217 | ||
218 | // Return a string representing the value of b . | |
219 | ||
220 | { | |
221 | if (b) | |
222 | return "true"; | |
223 | else | |
224 | return "false"; | |
225 | } | |
226 | ||
227 | ||
228 | ||
229 | const char * Printable | |
230 | (char * s) | |
231 | ||
232 | // Return "none" if s is NULL ; otherwise, return s itself. | |
233 | ||
234 | { | |
235 | if (s == NULL) | |
236 | return "none"; | |
237 | else | |
238 | return s; | |
239 | } | |
240 | ||
241 | ||
242 | ||
243 | double Pseudo_Normal | |
244 | (void) | |
245 | ||
246 | // Return a simple approximation to a standard normally distributed value, | |
247 | // i.e., mean = 0.0 and s.d. = 1.0 | |
248 | ||
249 | { | |
250 | double sum = 0.0; | |
251 | int i; | |
252 | ||
253 | for (i = 0; i < 12; i ++) | |
254 | sum += drand48 (); | |
255 | ||
256 | return sum - 6.0; | |
257 | } | |
258 | ||
259 | ||
260 | ||
261 | double Ratio | |
262 | (double a, double b) | |
263 | ||
264 | // Return a / b , or 0.0 if b = 0.0 . | |
265 | ||
266 | { | |
267 | if (b == 0.0) | |
268 | return 0.0; | |
269 | ||
270 | return a / b; | |
271 | } | |
272 | ||
273 | ||
274 | ||
275 | void Reverse_String | |
276 | (char * s) | |
277 | ||
278 | // Reverse the order of characters in string s . | |
279 | ||
280 | { | |
281 | int i, j, n; | |
282 | ||
283 | n = strlen (s); | |
284 | for (i = 0, j = n - 1; i < j; i ++, j --) | |
285 | { | |
286 | char ch; | |
287 | ||
288 | ch = s [i]; | |
289 | s [i] = s [j]; | |
290 | s [j] = ch; | |
291 | } | |
292 | ||
293 | return; | |
294 | } | |
295 | ||
296 | ||
297 | ||
298 | void Reverse_String | |
299 | (string & s) | |
300 | ||
301 | // Reverse the order of characters in string s . | |
302 | ||
303 | { | |
304 | int i, j, n; | |
305 | ||
306 | n = s . length (); | |
307 | for (i = 0, j = n - 1; i < j; i ++, j --) | |
308 | { | |
309 | char ch; | |
310 | ||
311 | ch = s [i]; | |
312 | s [i] = s [j]; | |
313 | s [j] = ch; | |
314 | } | |
315 | ||
316 | return; | |
317 | } | |
318 | ||
319 | ||
320 | ||
321 | void * Safe_calloc | |
322 | (size_t n, size_t len, const char * src_fname, size_t line_num) | |
323 | ||
324 | // Allocate and return a pointer to enough memory to hold an | |
325 | // array with n entries of len bytes each. All memory is | |
326 | // cleared to 0. If fail, print a message and exit, assuming the | |
327 | // call came from source file src_fname at line line_num . | |
328 | ||
329 | { | |
330 | void * p; | |
331 | ||
332 | p = calloc (n, len); | |
333 | if (p == NULL) | |
334 | { | |
335 | sprintf (Clean_Exit_Msg_Line, | |
336 | "ERROR: calloc failed %lu x %lu", | |
337 | (long unsigned) (n), (long unsigned) (len)); | |
338 | Clean_Exit (Clean_Exit_Msg_Line, src_fname, line_num); | |
339 | } | |
340 | ||
341 | return p; | |
342 | } | |
343 | ||
344 | ||
345 | ||
346 | void * Safe_malloc | |
347 | (size_t len, const char * src_fname, size_t line_num) | |
348 | ||
349 | // Allocate and return a pointer to len bytes of memory. | |
350 | // If fail, print a message and exit, assuming the call came from | |
351 | // source file src_fname at line line_num . | |
352 | ||
353 | { | |
354 | void * p; | |
355 | ||
356 | p = malloc (len); | |
357 | if (p == NULL) | |
358 | { | |
359 | sprintf (Clean_Exit_Msg_Line, | |
360 | "ERROR: malloc failed %lu bytes", | |
361 | (long unsigned) (len)); | |
362 | Clean_Exit (Clean_Exit_Msg_Line, src_fname, line_num); | |
363 | } | |
364 | ||
365 | return p; | |
366 | } | |
367 | ||
368 | ||
369 | ||
370 | void * Safe_realloc | |
371 | (void * q, size_t len, const char * src_fname, size_t line_num) | |
372 | ||
373 | // Reallocate memory for q to len bytes and return a | |
374 | // pointer to the new memory. If fail, print a message and exit, | |
375 | // assuming the call came from source file src_fname at line line_num . | |
376 | ||
377 | { | |
378 | void * p; | |
379 | ||
380 | p = realloc (q, len); | |
381 | if (p == NULL) | |
382 | { | |
383 | sprintf (Clean_Exit_Msg_Line, | |
384 | "ERROR: realloc failed %lu bytes", | |
385 | (long unsigned) (len)); | |
386 | Clean_Exit (Clean_Exit_Msg_Line, src_fname, line_num); | |
387 | } | |
388 | ||
389 | return p; | |
390 | } | |
391 | ||
392 | ||
393 | ||
394 | char * Strip_Trailing | |
395 | (char * s, char ch) | |
396 | ||
397 | // Remove all occurrences of ch at the end of s by writing | |
398 | // '\0's over them. Return s so this can be used as a function. | |
399 | ||
400 | { | |
401 | int i, len; | |
402 | ||
403 | len = strlen (s); | |
404 | ||
405 | for (i = len - 1; i >= 0 && s [i] == ch; i --) | |
406 | s [i] = '\0'; | |
407 | ||
408 | return s; | |
409 | } | |
410 | ||
411 | ||
412 |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: delcher.hh | |
3 | // | |
4 | // Last Modified: 23 October 2003 | |
5 | // | |
6 | // Common generic routines declarations | |
7 | ||
8 | ||
9 | #ifndef __DELCHER_HH_INCLUDED | |
10 | #define __DELCHER_HH_INCLUDED | |
11 | ||
12 | #define ALLOW_LONG_OPTIONS 1 | |
13 | ||
14 | #include <stdio.h> | |
15 | #include <stdlib.h> | |
16 | #include <math.h> | |
17 | #include <ctype.h> | |
18 | #include <float.h> | |
19 | #include <time.h> | |
20 | #include <assert.h> | |
21 | #include <errno.h> | |
22 | #include <getopt.h> | |
23 | #include <limits.h> | |
24 | #include <algorithm> | |
25 | #include <string> | |
26 | #include <new> | |
27 | #include <cstdlib> | |
28 | #include <iostream> | |
29 | #include <iomanip> | |
30 | #include <fstream> | |
31 | #include <vector> | |
32 | #include <string> | |
33 | ||
34 | #include "exceptions.hh" | |
35 | ||
36 | ||
37 | using namespace std; | |
38 | ||
39 | ||
40 | #define TRUE 1 | |
41 | #define FALSE 0 | |
42 | #ifndef EXIT_FAILURE | |
43 | #define EXIT_FAILURE -1 | |
44 | #endif | |
45 | #ifndef EXIT_SUCCESS | |
46 | #define EXIT_SUCCESS 0 | |
47 | #endif | |
48 | ||
49 | ||
50 | #define SAFE_CALLOC(x,y) Safe_calloc (x, y, __FILE__, __LINE__) | |
51 | #define SAFE_MALLOC(x) Safe_malloc (x, __FILE__, __LINE__) | |
52 | #define SAFE_REALLOC(x,y) Safe_realloc (x, y, __FILE__, __LINE__) | |
53 | ||
54 | ||
55 | const int MAX_ERROR_MSG_LEN = 1000; | |
56 | // Length of longest possible error message | |
57 | ||
58 | ||
59 | extern char Clean_Exit_Msg_Line [MAX_ERROR_MSG_LEN]; | |
60 | // String to write error messages to before exiting | |
61 | extern int Verbose; | |
62 | // Flag to determine level of debugging output | |
63 | ||
64 | ||
65 | const char * Commatize | |
66 | (long int n); | |
67 | void Clean_Exit | |
68 | (const char * msg, const char * src_fname = NULL, size_t line_num = 0); | |
69 | FILE * File_Open | |
70 | (const string & fname, const string & mode, const char * src_fname = NULL, | |
71 | size_t line_num = 0); | |
72 | char First_Non_Blank | |
73 | (const char * s); | |
74 | int Int_Power | |
75 | (int a, int b); | |
76 | void Make_Lower_Case | |
77 | (char * s); | |
78 | void Make_Upper_Case | |
79 | (char * s); | |
80 | const char * Num_Or_Max | |
81 | (int n, int mx = INT_MAX); | |
82 | double Percent | |
83 | (double a, double b); | |
84 | const char * Printable | |
85 | (bool b); | |
86 | const char * Printable | |
87 | (char * s); | |
88 | double Pseudo_Normal | |
89 | (void); | |
90 | double Ratio | |
91 | (double a, double b); | |
92 | void Reverse_String | |
93 | (char * s); | |
94 | void Reverse_String | |
95 | (string & s); | |
96 | void * Safe_calloc | |
97 | (size_t n, size_t len, const char * src_fname = NULL, | |
98 | size_t line_num = 0); | |
99 | void * Safe_malloc | |
100 | (size_t len, const char * src_fname = NULL, size_t line_num = 0); | |
101 | void * Safe_realloc | |
102 | (void * q, size_t len, const char * src_fname = NULL, | |
103 | size_t line_num = 0); | |
104 | char * Strip_Trailing | |
105 | (char * s, char ch); | |
106 | ||
107 | ||
108 | template <class DT> void Incr_Limited | |
109 | (DT & A, DT limit); | |
110 | template <class DT> DT Max | |
111 | (DT, DT); | |
112 | template <class DT> DT Min | |
113 | (DT, DT); | |
114 | template <class DT> void Swap | |
115 | (DT &, DT &); | |
116 | ||
117 | ||
118 | ||
119 | template <class DT> void Incr_Limited | |
120 | (DT & A, DT limit) | |
121 | ||
122 | // Increment A by 1, but only if it's less than limit . | |
123 | ||
124 | { | |
125 | if (A < limit) | |
126 | A ++; | |
127 | ||
128 | return; | |
129 | } | |
130 | ||
131 | ||
132 | ||
133 | template <class DT> DT Max | |
134 | (DT A, DT B) | |
135 | ||
136 | // Return the larger of A and B . | |
137 | ||
138 | { | |
139 | if (A > B) | |
140 | return A; | |
141 | else | |
142 | return B; | |
143 | } | |
144 | ||
145 | ||
146 | ||
147 | template <class DT> DT Min | |
148 | (DT A, DT B) | |
149 | ||
150 | // Return the smaller of A and B . | |
151 | ||
152 | { | |
153 | if (A < B) | |
154 | return A; | |
155 | else | |
156 | return B; | |
157 | } | |
158 | ||
159 | ||
160 | ||
161 | template <class DT> void Reverse | |
162 | (vector <DT> & v) | |
163 | ||
164 | // Reverse the order of entries in v . | |
165 | ||
166 | { | |
167 | DT s; | |
168 | int i, j, n; | |
169 | ||
170 | n = v . size (); | |
171 | for (i = 0, j = n - 1; i < j; i ++, j --) | |
172 | { | |
173 | s = v [i]; | |
174 | v [i] = v [j]; | |
175 | v [j] = s; | |
176 | } | |
177 | ||
178 | return; | |
179 | } | |
180 | ||
181 | ||
182 | ||
183 | template <class DT> void Swap | |
184 | (DT & A, DT & B) | |
185 | ||
186 | // Swap the values in A and B . | |
187 | ||
188 | { | |
189 | DT Save; | |
190 | ||
191 | Save = A; | |
192 | A = B; | |
193 | B = Save; | |
194 | ||
195 | return; | |
196 | } | |
197 | ||
198 | ||
199 | ||
200 | #endif |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: entropy-profile.cc | |
3 | // | |
4 | // Last Modified: Tue May 23 10:30:35 EDT 2006 | |
5 | // | |
6 | // This program reads a multifasta file of gene sequences. | |
7 | // It translates each sequence to it protein product and | |
8 | // computes and prints the natural and entropy distributions | |
9 | // of the amino acids. It also translates the reverse-complement | |
10 | // of each sequence, and prints the same distributions for | |
11 | // those sequences. | |
12 | ||
13 | ||
14 | ||
15 | #include "entropy-profile.hh" | |
16 | ||
17 | ||
18 | // External variables | |
19 | ||
20 | extern int Verbose; | |
21 | extern int Global_Debug_Flag; | |
22 | ||
23 | ||
24 | // Global variables | |
25 | ||
26 | static bool Brief_Output = false; | |
27 | // Determines output format | |
28 | static long int Min_Len = 0; | |
29 | // Sequences shorter than this are ignored | |
30 | ||
31 | ||
32 | ||
33 | int main | |
34 | (int argc, char * argv []) | |
35 | ||
36 | { | |
37 | string sequence, hdr; | |
38 | string rev_sequence; | |
39 | double entropy [26], prob [26]; | |
40 | double ep [20]; | |
41 | double rev_entropy [26], rev_prob [26]; | |
42 | double rev_ep [20]; | |
43 | int count [26] = {0}, rev_count [26] = {0}; | |
44 | int total = 0, rev_total = 0;; | |
45 | int i, j; | |
46 | ||
47 | Verbose = 0; | |
48 | ||
49 | Parse_Command_Line (argc, argv); | |
50 | ||
51 | while (Fasta_Read (stdin, sequence, hdr)) | |
52 | { | |
53 | const char * seq, * rev_seq; | |
54 | char aa; | |
55 | int i, n; | |
56 | ||
57 | n = sequence . length (); | |
58 | if (n < Min_Len || n % 3 != 0) | |
59 | continue; | |
60 | ||
61 | rev_sequence = seq; | |
62 | Reverse_Complement (rev_sequence); | |
63 | ||
64 | seq = sequence . c_str (); | |
65 | rev_seq = rev_sequence . c_str (); | |
66 | for (i = 0; i < n; i += 3) | |
67 | { | |
68 | aa = Codon_Translation (seq + i); | |
69 | if (aa != '*') | |
70 | count [aa - 'A'] ++; | |
71 | aa = Codon_Translation (rev_seq + i); | |
72 | if (aa != '*') | |
73 | rev_count [aa - 'A'] ++; | |
74 | } | |
75 | ||
76 | } | |
77 | ||
78 | for (i = 0; i < 26; i ++) | |
79 | if (IS_AMINO [i]) | |
80 | { | |
81 | total += count [i]; | |
82 | rev_total += rev_count [i]; | |
83 | } | |
84 | ||
85 | Counts_To_Entropy_Profile (count, ep); | |
86 | Counts_To_Entropy_Profile (rev_count, rev_ep); | |
87 | ||
88 | if (Brief_Output) | |
89 | { | |
90 | printf ("AA %8s %8s\n", "Positive", "Negative"); | |
91 | for (i = j = 0; i < 26; i ++) | |
92 | if (IS_AMINO [i]) | |
93 | { | |
94 | printf (" %c %8.5f %8.5f\n", 'A' + i, ep [j], rev_ep [j]); | |
95 | j ++; | |
96 | } | |
97 | } | |
98 | else | |
99 | { | |
100 | printf ("%2s %29s %29s\n", "", "--- Forward Translation ----", | |
101 | "--- Reverse Translation ----"); | |
102 | printf ("%2s %6s %6s %6s %6s %6s %6s %6s %6s\n", | |
103 | "AA", "Count", "Percen", "Entrpy", "EFrac", | |
104 | "Count", "Percen", "Entrpy", "EFrac"); | |
105 | for (i = 0; i < 26; i ++) | |
106 | if (IS_AMINO [i]) | |
107 | { | |
108 | prob [i] = double (count [i]) / total; | |
109 | entropy [i] = -1.0 * prob [i] * log (prob [i]); | |
110 | rev_prob [i] = double (rev_count [i]) / rev_total; | |
111 | rev_entropy [i] = -1.0 * rev_prob [i] * log (rev_prob [i]); | |
112 | } | |
113 | ||
114 | for (i = j = 0; i < 26; i ++) | |
115 | if (IS_AMINO [i]) | |
116 | { | |
117 | printf ("%c: %6d %5.1f%% %6.3f %6.3f %6d %5.1f%% %6.3f %6.3f\n", | |
118 | 'A' + i, count [i], Percent (count [i], total), | |
119 | entropy [i], ep [j], | |
120 | rev_count [i], Percent (rev_count [i], rev_total), | |
121 | rev_entropy [i], rev_ep [j]); | |
122 | j ++; | |
123 | } | |
124 | } | |
125 | ||
126 | return 0; | |
127 | } | |
128 | ||
129 | ||
130 | ||
131 | static void Parse_Command_Line | |
132 | (int argc, char * argv []) | |
133 | ||
134 | // Get options and parameters from command line with argc | |
135 | // arguments in argv [0 .. (argc - 1)] . | |
136 | ||
137 | { | |
138 | bool errflg = false; | |
139 | int ch; | |
140 | ||
141 | optarg = NULL; | |
142 | ||
143 | #if ALLOW_LONG_OPTIONS | |
144 | int option_index = 0; | |
145 | static struct option long_options [] = { | |
146 | {"brief", 0, 0, 'b'}, | |
147 | {"help", 0, 0, 'h'}, | |
148 | {"minlen", 1, 0, 'l'}, | |
149 | {0, 0, 0, 0} | |
150 | }; | |
151 | ||
152 | while (! errflg | |
153 | && ((ch = getopt_long (argc, argv, "bhl:", | |
154 | long_options, & option_index)) != EOF)) | |
155 | #else | |
156 | while (! errflg | |
157 | && ((ch = getopt (argc, argv, "bhl:")) != EOF)) | |
158 | #endif | |
159 | ||
160 | switch (ch) | |
161 | { | |
162 | case 'b' : | |
163 | Brief_Output = true; | |
164 | break; | |
165 | ||
166 | case 'h' : | |
167 | Usage (); | |
168 | exit (EXIT_SUCCESS); | |
169 | ||
170 | case 'l' : | |
171 | Min_Len = strtol (optarg, NULL, 10); | |
172 | break; | |
173 | ||
174 | default : | |
175 | errflg = true; | |
176 | } | |
177 | ||
178 | if (errflg || optind > argc - 0) | |
179 | { | |
180 | Usage (); | |
181 | exit (EXIT_FAILURE); | |
182 | } | |
183 | ||
184 | return; | |
185 | } | |
186 | ||
187 | ||
188 | ||
189 | static void Usage | |
190 | (void) | |
191 | ||
192 | // Print to stderr description of options and command line for | |
193 | // this program. | |
194 | ||
195 | { | |
196 | fprintf (stderr, | |
197 | "USAGE: entropy-profile [options] < input-file\n" | |
198 | "\n" | |
199 | "Read multi-fasta-format gene sequences from stdin.\n" | |
200 | "Translate each to its protein product and then print\n" | |
201 | "the natural and entropy distributions of the amino acids\n" | |
202 | "Output goes to stdout\n" | |
203 | "\n" | |
204 | "Options:\n" | |
205 | " -b\n" | |
206 | " --brief\n" | |
207 | " Brief output: 3 columns with header line\n" | |
208 | " -h\n" | |
209 | " --help\n" | |
210 | " Print this message\n" | |
211 | " -l <n>\n" | |
212 | " --minlen <n>\n" | |
213 | " Don't output any sequence shorter than <n> characters\n" | |
214 | "\n"); | |
215 | ||
216 | return; | |
217 | } | |
218 | ||
219 | ||
220 |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: entropy-profile.hh | |
3 | // | |
4 | // Last Modified: 28 July 2005 | |
5 | // | |
6 | // Declarations for entropy-profile.cc | |
7 | ||
8 | ||
9 | ||
10 | #ifndef __ENTROPY_PROFILE_HH_INCLUDED | |
11 | #define __ENTROPY_PROFILE_HH_INCLUDED | |
12 | ||
13 | ||
14 | #include "delcher.hh" | |
15 | #include "fasta.hh" | |
16 | #include "gene.hh" | |
17 | ||
18 | ||
19 | static void Parse_Command_Line | |
20 | (int argc, char * argv []); | |
21 | static void Usage | |
22 | (void); | |
23 | ||
24 | #endif |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: entropy-score.cc | |
3 | // | |
4 | // Last Modified: 29 July 2005 | |
5 | // | |
6 | // This program reads the sequence in the file named as the | |
7 | // first command-line argument and then scores specified | |
8 | // regions in it by entropy distance. Results are output | |
9 | // to stdout . | |
10 | ||
11 | ||
12 | ||
13 | #include "entropy-score.hh" | |
14 | ||
15 | ||
16 | // External variables | |
17 | ||
18 | extern int Verbose; | |
19 | extern int Global_Debug_Flag; | |
20 | ||
21 | ||
22 | // Global variables | |
23 | ||
24 | static char * Coord_Info = NULL; | |
25 | // Name of file with list of coordinates (or a single coordinate | |
26 | // specification). | |
27 | static bool Is_Circular = true; | |
28 | // Determines whether the input sequence is regarded as a circular | |
29 | // genome. | |
30 | static long int Min_Len = 0; | |
31 | // Output sequences shorter than this are not printed. | |
32 | static double Neg_Entropy_Profile [20] = DEFAULT_NEG_ENTROPY_PROF; | |
33 | // Entropy distribution of amino-acids in non-genes | |
34 | static double Pos_Entropy_Profile [20] = DEFAULT_POS_ENTROPY_PROF; | |
35 | // Entropy distribution of amino-acids in genes | |
36 | static char * Sequence_File_Name = NULL; | |
37 | // Name of file with input sequence | |
38 | static bool Skip_Start = false; | |
39 | // If set true (by -s option) then omit the first 3 letters | |
40 | // of each output sequence. | |
41 | static bool Skip_Stop = false; | |
42 | // If set true (by -t option) then omit the last 3 letters | |
43 | // of each output sequence. | |
44 | static bool Use_Direction = false; | |
45 | // If set true (by -d option) then use 4th field of coords | |
46 | // to determine direction in a circular genome. | |
47 | ||
48 | ||
49 | ||
50 | int main | |
51 | (int argc, char * argv []) | |
52 | ||
53 | { | |
54 | FILE * fp; | |
55 | string sequence, hdr; | |
56 | char line [MAX_LINE], tag [MAX_LINE]; | |
57 | long int seq_len; | |
58 | ||
59 | Verbose = 0; | |
60 | ||
61 | Parse_Command_Line (argc, argv); | |
62 | ||
63 | fp = File_Open (Sequence_File_Name, "r"); | |
64 | ||
65 | if (! Fasta_Read (fp, sequence, hdr)) | |
66 | { | |
67 | sprintf (Clean_Exit_Msg_Line, "ERROR: Failed to read file %s", | |
68 | Sequence_File_Name); | |
69 | Clean_Exit (Clean_Exit_Msg_Line, __FILE__, __LINE__); | |
70 | } | |
71 | fclose (fp); | |
72 | ||
73 | seq_len = sequence . length (); | |
74 | ||
75 | if (strcmp (Coord_Info, "-") == 0) | |
76 | fp = stdin; | |
77 | else | |
78 | fp = File_Open (Coord_Info, "r"); | |
79 | ||
80 | while (fgets (line, MAX_LINE, fp) != NULL) | |
81 | { | |
82 | string buff; | |
83 | long int i, start, end, extract_len; | |
84 | int dir, n; | |
85 | ||
86 | if (Use_Direction) | |
87 | { | |
88 | if (sscanf (line, "%s %ld %ld %d", tag, & start, & end, & dir) != 4) | |
89 | { | |
90 | fprintf (stderr, "ERROR: Skipped following coord line\n"); | |
91 | fputs (line, stderr); | |
92 | continue; | |
93 | } | |
94 | } | |
95 | else | |
96 | { | |
97 | if (sscanf (line, "%s %ld %ld", tag, & start, & end) != 3) | |
98 | { | |
99 | fprintf (stderr, "ERROR: Skipped following coord line\n"); | |
100 | fputs (line, stderr); | |
101 | continue; | |
102 | } | |
103 | if ((start < end && (! Is_Circular || end - start <= seq_len / 2)) | |
104 | || (Is_Circular && start - end > seq_len / 2)) | |
105 | dir = +1; | |
106 | else | |
107 | dir = -1; | |
108 | } | |
109 | ||
110 | if (dir > 0) | |
111 | { | |
112 | extract_len = 1 + end - start; | |
113 | if (extract_len < 0) | |
114 | extract_len += seq_len; | |
115 | ||
116 | i = start - 1; | |
117 | if (Skip_Start) | |
118 | { | |
119 | i += 3; | |
120 | extract_len -= 3; | |
121 | start += 3; | |
122 | } | |
123 | if (Skip_Stop) | |
124 | extract_len -= 3; | |
125 | ||
126 | if (extract_len < Min_Len) | |
127 | continue; | |
128 | ||
129 | Forward_Strand_Transfer (buff, sequence, On_Seq (i, seq_len), | |
130 | extract_len); | |
131 | } | |
132 | else | |
133 | { | |
134 | extract_len = 1 + start - end; | |
135 | if (extract_len < 0) | |
136 | extract_len += seq_len; | |
137 | ||
138 | i = start - 1; | |
139 | if (Skip_Start) | |
140 | { | |
141 | i -= 3; | |
142 | extract_len -= 3; | |
143 | start -= 3; | |
144 | } | |
145 | if (Skip_Stop) | |
146 | extract_len -= 3; | |
147 | ||
148 | if (extract_len < Min_Len) | |
149 | continue; | |
150 | ||
151 | Reverse_Strand_Transfer (buff, sequence, On_Seq (i, seq_len), | |
152 | extract_len); | |
153 | } | |
154 | ||
155 | n = strlen (line); | |
156 | if (line [n - 1] == '\n'); | |
157 | line [n - 1] = '\0'; | |
158 | printf ("%s \t%5.3f\n", line, Entropy_Distance_Ratio (buff)); | |
159 | } | |
160 | ||
161 | return 0; | |
162 | } | |
163 | ||
164 | ||
165 | ||
166 | static double Entropy_Distance_Ratio | |
167 | (const string & buff) | |
168 | ||
169 | // Return the distance ratio for the entropy profile for the | |
170 | // gene in buff . The ratio is the distance to global | |
171 | // Pos_Entropy_Profile over the distance to global Neg_Entropy_Profile . | |
172 | ||
173 | { | |
174 | int count [26] = {0}; | |
175 | double ep [20]; | |
176 | double pos_dist, neg_dist, ratio; | |
177 | char aa; | |
178 | int i, len; | |
179 | ||
180 | len = buff . length (); | |
181 | for (i = 0; i < len; i += 3) | |
182 | { | |
183 | aa = Codon_Translation (buff . c_str () + i); | |
184 | if (aa != '*') | |
185 | count [aa - 'A'] ++; | |
186 | } | |
187 | Counts_To_Entropy_Profile (count, ep); | |
188 | ||
189 | pos_dist = neg_dist = 0.0; | |
190 | for (i = 0; i < 20; i ++) | |
191 | { | |
192 | pos_dist += pow (ep [i] - Pos_Entropy_Profile [i], 2); | |
193 | neg_dist += pow (ep [i] - Neg_Entropy_Profile [i], 2); | |
194 | } | |
195 | ||
196 | pos_dist = sqrt (pos_dist); | |
197 | neg_dist = sqrt (neg_dist); | |
198 | if (neg_dist == 0.0) | |
199 | { | |
200 | if (pos_dist == 0.0) | |
201 | ratio = 1.0; | |
202 | else | |
203 | ratio = 1e3; | |
204 | } | |
205 | else | |
206 | ratio = pos_dist / neg_dist; | |
207 | ||
208 | return ratio; | |
209 | } | |
210 | ||
211 | ||
212 | ||
213 | static int On_Seq | |
214 | (int i, int seq_len) | |
215 | ||
216 | // Return the subscript equivalent to i on a sequence of | |
217 | // length seq_len (with subscripts starting at 0) | |
218 | // assuming circular wraparounds. | |
219 | ||
220 | { | |
221 | while (i < 0) | |
222 | i += seq_len; | |
223 | while (seq_len <= i) | |
224 | i -= seq_len; | |
225 | ||
226 | return i; | |
227 | } | |
228 | ||
229 | ||
230 | ||
231 | static void Parse_Command_Line | |
232 | (int argc, char * argv []) | |
233 | ||
234 | // Get options and parameters from command line with argc | |
235 | // arguments in argv [0 .. (argc - 1)] . | |
236 | ||
237 | { | |
238 | bool errflg = false; | |
239 | int ch; | |
240 | ||
241 | optarg = NULL; | |
242 | ||
243 | #if ALLOW_LONG_OPTIONS | |
244 | int option_index = 0; | |
245 | static struct option long_options [] = { | |
246 | {"dir", 0, 0, 'd'}, | |
247 | {"entropy", 1, 0, 'E'}, | |
248 | {"help", 0, 0, 'h'}, | |
249 | {"minlen", 1, 0, 'l'}, | |
250 | {"nostart", 0, 0, 's'}, | |
251 | {"nostop", 0, 0, 't'}, | |
252 | {"nowrap", 0, 0, 'w'}, | |
253 | {0, 0, 0, 0} | |
254 | }; | |
255 | ||
256 | while (! errflg | |
257 | && ((ch = getopt_long (argc, argv, "2dE:hl:sw", | |
258 | long_options, & option_index)) != EOF)) | |
259 | #else | |
260 | while (! errflg | |
261 | && ((ch = getopt (argc, argv, "2dE:hl:sw")) != EOF)) | |
262 | #endif | |
263 | ||
264 | switch (ch) | |
265 | { | |
266 | case 'd' : | |
267 | Use_Direction = true; | |
268 | break; | |
269 | ||
270 | case 'E' : | |
271 | Read_Entropy_Profiles (optarg, errflg); | |
272 | break; | |
273 | ||
274 | case 'h' : | |
275 | Usage (); | |
276 | exit (EXIT_SUCCESS); | |
277 | ||
278 | case 'l' : | |
279 | Min_Len = strtol (optarg, NULL, 10); | |
280 | break; | |
281 | ||
282 | case 's' : | |
283 | Skip_Start = true; | |
284 | break; | |
285 | ||
286 | case 't' : | |
287 | Skip_Stop = true; | |
288 | break; | |
289 | ||
290 | case 'w' : | |
291 | Is_Circular = false; | |
292 | break; | |
293 | ||
294 | default : | |
295 | errflg = true; | |
296 | } | |
297 | ||
298 | if (errflg || optind > argc - 2) | |
299 | { | |
300 | Usage (); | |
301 | exit (EXIT_FAILURE); | |
302 | } | |
303 | ||
304 | Sequence_File_Name = argv [optind ++]; | |
305 | Coord_Info = argv [optind ++]; | |
306 | ||
307 | return; | |
308 | } | |
309 | ||
310 | ||
311 | ||
312 | static void Read_Entropy_Profiles | |
313 | (const char * fn, bool & errflg) | |
314 | ||
315 | // Read positive and negative entropy profiles from the | |
316 | // file name fn . If not successful, set errflg to true . | |
317 | // Save the entropy profiles in globals Pos_Entropy_Profile | |
318 | // and Neg_Entropy_Profile . | |
319 | ||
320 | { | |
321 | FILE * fp; | |
322 | char line [MAX_LINE]; | |
323 | int i; | |
324 | ||
325 | fp = File_Open (fn, "r"); | |
326 | fgets (line, MAX_LINE, fp); // skip header line | |
327 | for (i = 0; i < 20; i ++) | |
328 | if (fscanf (fp, "%s %lf %lf\n", line, Pos_Entropy_Profile + i, | |
329 | Neg_Entropy_Profile + i) != 3) | |
330 | { | |
331 | errflg = true; | |
332 | return; | |
333 | } | |
334 | ||
335 | fclose (fp); | |
336 | ||
337 | return; | |
338 | } | |
339 | ||
340 | ||
341 | ||
342 | static void Usage | |
343 | (void) | |
344 | ||
345 | // Print to stderr description of options and command line for | |
346 | // this program. | |
347 | ||
348 | { | |
349 | fprintf (stderr, | |
350 | "USAGE: entropy-score [options] <sequence-file> <coords>\n" | |
351 | "\n" | |
352 | "Read fasta-format <sequence-file> and then score regions in\n" | |
353 | "it specified by <coords>. By default, <coords>\n" | |
354 | "is the name of a file containing lines of the form\n" | |
355 | " <tag> <start> <stop> [<frame>] ...\n" | |
356 | "Coordinates are inclusive counting from 1, e.g., \"1 3\"\n" | |
357 | "represents the 1st 3 characters of the sequence.\n" | |
358 | "Output is the same format as <coords> put with the entropy\n" | |
359 | "distance score appended to each line\n" | |
360 | "Output goes to stdout .\n" | |
361 | "\n" | |
362 | "Options:\n" | |
363 | " -d\n" | |
364 | " --dir\n" | |
365 | " Use the 4th column of each input line to specify the direction\n" | |
366 | " of the sequence. Positive is forward, negative is reverse\n" | |
367 | " The input sequence is assumed to be circular\n" | |
368 | " -E <filename>\n" | |
369 | " --entropy <filename>\n" | |
370 | " Read entropy profiles from <filename>. Format is one header\n" | |
371 | " line, then 20 lines of 3 columns each. Columns are amino acid,\n" | |
372 | " positive entropy, negative entropy. Rows must be in order\n" | |
373 | " by amino acid code letter\n" | |
374 | " -h\n" | |
375 | " --help\n" | |
376 | " Print this message\n" | |
377 | " -l <n>\n" | |
378 | " --minlen <n>\n" | |
379 | " Don't output any sequence shorter than <n> characters\n" | |
380 | " -s\n" | |
381 | " --nostart\n" | |
382 | " Omit the first 3 characters of each specified string\n" | |
383 | " -t\n" | |
384 | " --nostop\n" | |
385 | " Omit the last 3 characters of each specified string\n" | |
386 | " -w\n" | |
387 | " --nowrap\n" | |
388 | " Use the actual input coordinates without any wraparound\n" | |
389 | " that would be needed by a circular genome. Without this\n" | |
390 | " option, the output sequence is the shorter of the two ways\n" | |
391 | " around the circle. Use the -d option to specify direction\n" | |
392 | " explicitly.\n" | |
393 | "\n"); | |
394 | ||
395 | return; | |
396 | } | |
397 | ||
398 | ||
399 |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: entropy-score.hh | |
3 | // | |
4 | // Last Modified: 29 July 2005 | |
5 | // | |
6 | // Declarations for entropy-score.cc | |
7 | ||
8 | ||
9 | ||
10 | #ifndef __ENTROPY_SCORE_HH_INCLUDED | |
11 | #define __ENTROPY_SCORE_HH_INCLUDED | |
12 | ||
13 | ||
14 | #include "delcher.hh" | |
15 | #include "fasta.hh" | |
16 | #include "gene.hh" | |
17 | ||
18 | ||
19 | static double Entropy_Distance_Ratio | |
20 | (const string & buff); | |
21 | static int On_Seq | |
22 | (int i, int seq_len); | |
23 | static void Parse_Command_Line | |
24 | (int argc, char * argv []); | |
25 | static void Read_Entropy_Profiles | |
26 | (const char * fn, bool & errflg); | |
27 | static void Usage | |
28 | (void); | |
29 | ||
30 | #endif |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: exceptions.hh | |
3 | // | |
4 | // Last Modified: 13 June 2005 | |
5 | // | |
6 | // Include file for exception types | |
7 | ||
8 | ||
9 | ||
10 | #ifndef __EXCEPTIONS_HH_INCLUDED | |
11 | #define __EXCEPTIONS_HH_INCLUDED | |
12 | ||
13 | ||
14 | // Stolen from AMOS exceptions code | |
15 | ||
16 | ||
17 | ||
18 | const std :: string NULL_STRING = ""; //!< null string | |
19 | ||
20 | ||
21 | ||
22 | //================================================ Exception_t ================= | |
23 | //! \brief The base exception class | |
24 | //! | |
25 | //! All other exceptions will be derived from this class, so catching for | |
26 | //! this class should effectively catch all exceptions. | |
27 | //! | |
28 | //============================================================================== | |
29 | ||
30 | class Exception_t : public std :: exception | |
31 | { | |
32 | ||
33 | private: | |
34 | ||
35 | std :: string what_m; //!< description of exception | |
36 | int line_m; //!< line number of exception | |
37 | std :: string file_m; //!< file name of exception | |
38 | ||
39 | ||
40 | public: | |
41 | ||
42 | //---------------------------------------------- Exception_t ----------------- | |
43 | //! \brief Informative constructor | |
44 | //! | |
45 | //! \param what Brief description of the exception | |
46 | //! \param line Line number of the exception | |
47 | //! \param file File name of the exception | |
48 | //! | |
49 | Exception_t (const std :: string & what, | |
50 | int line = 0, | |
51 | const std :: string & file = NULL_STRING) | |
52 | : what_m (what), line_m (line), file_m (file) | |
53 | { } | |
54 | ||
55 | ||
56 | //---------------------------------------------- ~Exception_t ---------------- | |
57 | //! \brief Default destructor | |
58 | //! | |
59 | ~Exception_t ( ) throw() | |
60 | { } | |
61 | ||
62 | ||
63 | //---------------------------------------------- file ------------------------ | |
64 | //! \brief Returns the file (if available) of the exception | |
65 | //! | |
66 | virtual const char * file ( ) const | |
67 | { | |
68 | return file_m . c_str( ); | |
69 | } | |
70 | ||
71 | ||
72 | //---------------------------------------------- line ------------------------ | |
73 | //! \brief Returns the line number (if available) of the exception | |
74 | //! | |
75 | virtual int line ( ) const | |
76 | { | |
77 | return line_m; | |
78 | } | |
79 | ||
80 | ||
81 | //---------------------------------------------- what ------------------------ | |
82 | //! \brief Returns a short description (if available) of the exception | |
83 | //! | |
84 | virtual const char * what ( ) const throw( ) | |
85 | { | |
86 | return what_m . c_str( ); | |
87 | } | |
88 | ||
89 | }; | |
90 | ||
91 | ||
92 | ||
93 | //----------------------------------------------------- operator<< ------------- | |
94 | //! \brief Dump Exception_t info to an ostream | |
95 | //! | |
96 | inline std :: ostream & operator<< (std :: ostream & out, const Exception_t & e) | |
97 | { | |
98 | out << "WHAT: " << e . what( ) << std::endl; | |
99 | out << "LINE: " << e . line( ) << std::endl; | |
100 | out << "FILE: " << e . file( ) << std::endl; | |
101 | return out; | |
102 | } | |
103 | ||
104 | ||
105 | //----------------------------------------------------- operator<< ------------- | |
106 | //! \brief Dump exception info to an ostream | |
107 | //! | |
108 | inline std::ostream & operator<< (std::ostream & out, const std::exception & e) | |
109 | { | |
110 | out << "WHAT: " << e . what( ) << std::endl; | |
111 | return out; | |
112 | } | |
113 | ||
114 | ||
115 | ||
116 | //-- Helpful exception throw macros | |
117 | #define SIMPLE_THROW(A) throw Exception_t (A, __LINE__, __FILE__) | |
118 | ||
119 | ||
120 | #endif // #ifndef __EXCEPTIONS_HH_INCLUDED |
0 | // A. L. Delcher | |
1 | // | |
2 | // File: extract.cc | |
3 | // | |
4 | // Last Modified: 5 Aug 2005 | |
5 | // | |
6 | // This program reads the sequence in the file named as the | |
7 | // first command-line argument and then extracts from it | |
8 | // sequences specified by coordinates. The resulting sequences | |
9 | // are output (in multifasta or two-string format) to stdout. | |
10 | ||
11 | ||
12 | ||
13 | #include "extract.hh" | |
14 | ||
15 | ||
16 | // External variables | |
17 | ||
18 | extern int Verbose; | |
19 | extern int Global_Debug_Flag; | |
20 | ||
21 | ||
22 | // Global variables | |
23 | ||
24 | static char * Coord_Info = NULL; | |
25 | // Name of file with list of coordinates (or a single coordinate | |
26 | // specification). | |
27 | static bool Fasta_Output_Format = true; | |
28 | // Determines format of output. | |
29 | static bool Is_Circular = true; | |
30 | // Determines whether the input sequence is regarded as a circular | |
31 | // genome. | |
32 | static long int Min_Len = 0; | |
33 | // Output sequences shorter than this are not printed. | |
34 | static char * Sequence_File_Name = NULL; | |
35 | // Name of file with input sequence | |
36 | static bool Skip_Start = false; | |
37 | // If set true (by -s option) then omit the first 3 letters | |
38 | // of each output sequence. | |
39 | static bool Skip_Stop = false; | |
40 | // If set true (by -t option) then omit the last 3 letters | |
41 | // of each output sequence. | |
42 | static bool Use_Direction = false; | |
43 | // If set true (by -d option) then use 4th field of coords | |
44 | // to determine direction in a circular genome. | |
45 | ||
46 | ||
47 | ||
48 | int main | |
49 | (int argc, char * argv []) | |
50 | ||
51 | { | |
52 | FILE * fp; | |
53 | string sequence, hdr; | |
54 | char line [MAX_LINE], tag [MAX_LINE]; | |
55 | long int seq_len; | |
56 | ||
57 | Verbose = 0; | |
58 | ||
59 | Parse_Command_Line (argc, argv); | |
60 | ||
61 | fp = File_Open (Sequence_File_Name, "r"); | |
62 | ||
63 | if (! Fasta_Read (fp, sequence, hdr)) | |
64 | { | |
65 | sprintf (Clean_Exit_Msg_Line, "ERROR: Failed to read file %s", | |
66 | Sequence_File_Name); | |
67 | Clean_Exit (Clean_Exit_Msg_Line, __FILE__, __LINE__); | |
68 | } | |
69 | fclose (fp); | |
70 | ||
71 | seq_len = sequence . length (); | |
72 | ||
73 | if (strcmp (Coord_Info, "-") == 0) | |
74 | fp = stdin; | |
75 | else | |
76 | fp = File_Open (Coord_Info, "r"); | |
77 | ||
78 | while (fgets (line, MAX_LINE, fp) != NULL) | |
79 | { | |
80 | long int i, start, end, extract_len; | |
81 | int dir; | |
82 | ||
83 | if (Use_Direction) | |
84 | { | |
85 | if (sscanf (line, "%s %ld %ld %d", tag, & start, & end, & dir) != 4) | |
86 | { | |
87 | fprintf (stderr, "ERROR: Skipped following coord line\n"); | |
88 | fputs (line, stderr); | |
89 | continue; | |
90 | } | |
91 | } | |
92 | else | |
93 | { | |
94 | if (sscanf (line, "%s %ld %ld", tag, & start, & end) != 3) | |
95 | { | |
96 | fprintf (stderr, "ERROR: Skipped following coord line\n"); | |
97 | fputs (line, stderr); | |
98 | continue; | |
99 | } | |
100 | if ((start < end && (! Is_Circular || end - start <= seq_len / 2)) | |
101 | || (Is_Circular && start - end > seq_len / 2)) | |
102 | dir = +1; | |
103 | else | |
104 | dir = -1; | |
105 | } | |
106 | ||
107 | if (dir > 0) | |
108 | { | |
109 | extract_len = 1 + end - start; | |
110 | if (extract_len < 0) | |
111 | extract_len += seq_len; | |
112 | if (extract_len < Min_Len) | |
113 | continue; | |
114 | ||
115 | i = start - 1; | |
116 | if (Skip_Start) | |
117 | { | |
118 | i += 3; | |
119 | extract_len -= 3; | |
120 | start += 3; | |
121 | } | |
122 | if (Skip_Stop) | |
123 | extract_len -= 3; | |
124 | ||
125 | if (extract_len >= Min_Len) | |
126 | Output_Subsequence (sequence, i, extract_len, 1, tag, start, end); | |
127 | } | |
128 | else | |
129 | { | |
130 | extract_len = 1 + start - end; | |
131 | if (extract_len < 0) | |
132 | extract_len += seq_len; | |
133 | if (extract_len < Min_Len) | |
134 | continue; | |
135 | ||
136 | i = start - 1; | |
137 | if (Skip_Start) | |
138 | { | |
139 | i -= 3; | |
140 | extract_len -= 3; | |
141 | start -= 3; | |
142 | } | |
143 | if (Skip_Stop) | |
144 | extract_len -= 3; | |
145 | ||
146 | if (extract_len >= Min_Len) | |
147 | Output_Subsequence (sequence, i, extract_len, -1, tag, start, end); | |
148 | } | |
149 | ||
150 | } | |
151 | ||
152 | return 0; | |
153 | } | |
154 | ||
155 | ||
156 | ||
157 | static void Output_Subsequence | |
158 | (const string & seq, long int i, long int len, int incr, | |
159 | const char * tag, long int start, long int end) | |
160 | ||
161 | // Print to stdout the subsequence of seq beginning at subscript | |
162 | // i with length len . If incr is positive, output the forward | |
163 | // strand; otherwise, output the reverse complement strand. Use tag , | |
164 | // start and end to label the output. | |
165 | ||
166 | { | |
167 | const int fasta_width = 60; | |
168 | long int seq_len; | |
169 | long int ct = 0, line_ct = 0; | |
170 | ||
171 | if (Fasta_Output_Format) | |
172 | printf (">%s %ld %ld len=%ld\n", tag, start, end, len); | |
173 | else | |
174 | printf ("%-10s ", tag); | |
175 | ||
176 | if (incr > 0) | |
177 | incr = 1; | |
178 | else | |
179 | incr = -1; | |
180 | seq_len = seq . length (); | |
181 | ||
182 | while (ct < len) | |
183 | { | |
184 | if (i < 0) | |
185 | i += seq_len; | |
186 | else if (i >= seq_len) | |
187 | i -= seq_len; | |
188 | ||
189 | if (Fasta_Output_Format && line_ct == fasta_width) | |
190 | { | |
191 | putchar ('\n'); | |
192 | line_ct = 0; | |
193 | } | |
194 | if (incr > 0) | |
195 | putchar (seq [i]); | |
196 | else | |
197 | putchar (Complement (seq [i])); | |
198 | ||
199 | i += incr; | |
200 | ct ++; | |
201 | line_ct ++; | |
202 | } | |
203 | putchar ('\n'); | |
204 | ||
205 | return; | |
206 | } | |
207 | ||
208 | ||
209 | ||
210 | static void Parse_Command_Line | |
211 | (int argc, char * argv []) | |
212 | ||
213 | // Get options and parameters from command line with argc | |
214 | // arguments in argv [0 .. (argc - 1)] . | |
215 | ||
216 | { | |
217 | bool errflg = false; | |
218 | int ch; | |
219 | ||
220 | optarg = NULL; | |
221 | ||
222 | #if ALLOW_LONG_OPTIONS | |
223 | int option_index = 0; | |
224 | static struct option long_options [] = { | |
225 | {"2_fields", 0, 0, 'd'}, | |
226 | {"dir", 0, 0, 'd'}, | |
227 | {"help", 0, 0, 'h'}, | |
228 | {"minlen", 1, 0, 'l'}, | |
229 | {"nostart", 0, 0, 's'}, | |
230 | {"nostop", 0, 0, 't'}, | |
231 | {"nowrap", 0, 0, 'w'}, | |
232 | {0, 0, 0, 0} | |
233 | }; | |
234 | ||
235 | while (! errflg | |
236 | && ((ch = getopt_long (argc, argv, "2dhl:stw", | |
237 | long_options, & option_index)) != EOF)) | |
238 | #else | |
239 | while (! errflg | |
240 | && ((ch = getopt (argc, argv, "2dhl:stw")) != EOF)) | |
241 | #endif | |
242 | ||
243 | switch (ch) | |
244 | { | |
245 | case '2' : | |
246 | Fasta_Output_Format = false; | |
247 | break; | |
248 | ||
249 | case 'd' : | |
250 | Use_Direction = true; | |
251 | break; | |
252 | ||
253 | case 'h' : | |
254 | Usage (); | |
255 | exit (EXIT_SUCCESS); | |
256 | ||
257 | case 'l' : | |
258 | Min_Len = strtol (optarg, NULL, 10); | |
259 | break; | |
260 | ||
261 | case 's' : | |
262 | Skip_Start = true; | |