Reorganized executables: made bin/ dir and binaries/$OPSYS/ dirs.
Torsten Seemann
9 years ago
0 | #!/usr/bin/env perl | |
1 | use strict; | |
2 | use warnings; | |
3 | use Time::Piece; | |
4 | use FindBin; | |
5 | ||
6 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
7 | # global variables | |
8 | ||
9 | my $EXE = $FindBin::RealScript; | |
10 | my $VERSION = "0.3"; | |
11 | my $DESC = "Bacterial/Archaeal Ribosomal RNA Predictor"; | |
12 | my $AUTHOR = 'Torsten Seemann <torsten.seemann@monash.edu>'; | |
13 | my $URL = 'http://www.vicbioinformatics.com/'; | |
14 | my $DBDIR = "$FindBin::RealBin/db"; | |
15 | my $OPSYS = $^O; | |
16 | my $NHMMER = "$FindBin::RealBin/binaries/nhmmer.$OPSYS"; | |
17 | my %LENG = ("5S_rRNA"=>119, "16S_rRNA"=>1585, "23S_rRNA"=>3232); | |
18 | ||
19 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
20 | # command line options | |
21 | ||
22 | my(@Options, $quiet, $kingdom, $threads, $evalue, $lencutoff, $incseq); | |
23 | setOptions(); | |
24 | ||
25 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
26 | # check all is well | |
27 | ||
28 | msg("This is $EXE $VERSION"); | |
29 | msg("Written by $AUTHOR"); | |
30 | msg("Obtained from $URL"); | |
31 | msg("Detected operating system: $OPSYS"); | |
32 | ||
33 | if (! -x $NHMMER) { | |
34 | err("No binary for your OS '$OPSYS' is included. If you have one, copy it to $NHMMER."); | |
35 | } | |
36 | msg("Using HMMER binary: $NHMMER"); | |
37 | ||
38 | #if (! -r "$DATABASE.h3i") { | |
39 | # err("The pressed HMM files are missing. Go to the barnapp folder and type 'make database'"); | |
40 | #} | |
41 | #msg("Found database files: $DATABASE.*"); | |
42 | ||
43 | $threads > 0 or err("Invalid --threads $threads"); | |
44 | msg("Will use $threads threads"); | |
45 | ||
46 | $evalue > 0 or err("Invalid --evalue $evalue"); | |
47 | msg("Setting evalue cutoff to $evalue"); | |
48 | ||
49 | $lencutoff > 0 or err("Invalid --lencutoff $lencutoff"); | |
50 | msg("Will tag genes < $lencutoff x full length as partial genes"); | |
51 | ||
52 | if ($kingdom =~ m/^b/i) { | |
53 | $kingdom = 'bacteria'; | |
54 | } | |
55 | elsif ($kingdom =~ m/^a/i) { | |
56 | $kingdom = 'archaea'; | |
57 | } | |
58 | else { | |
59 | err("I don't recognise the --kingdom '$kingdom'. Try 'bacteria' or 'archaea'."); | |
60 | } | |
61 | ||
62 | my $hmmdb = "$DBDIR/$kingdom.hmm"; | |
63 | err("Can't find database: $hmmdb") unless -r $hmmdb; | |
64 | msg("Using database: $hmmdb"); | |
65 | ||
66 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
67 | # run the external command | |
68 | ||
69 | my $fasta = shift @ARGV; | |
70 | $fasta && -r $fasta or err("Usage: $EXE <file.fasta>"); | |
71 | ||
72 | msg("Running $NHMMER on $fasta ... please wait"); | |
73 | my $cmd = "$NHMMER --cpu $threads -E $evalue --w_length 2000 -o /dev/null --tblout /dev/stdout \Q$hmmdb\E \Q$fasta\E"; | |
74 | my @hits = qx($cmd 2>&1); | |
75 | ||
76 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
77 | # process the output | |
78 | ||
79 | my @feat; | |
80 | foreach (@hits) { | |
81 | chomp; | |
82 | err("nhmmer failed to run - $_") if m/fail|error|core dump|bus error/i; | |
83 | next if m/^#/; # comment line | |
84 | next if m/^\s*$/; # empty line | |
85 | my @x = split ' ', $_; | |
86 | err("bad line in nhmmer output - @x") unless defined $x[6] and $x[6] =~ m/^\d+$/; | |
87 | ||
88 | my($begin,$end,$strand) = $x[6] < $x[7] ? ($x[6],$x[7],'+') : ($x[7],$x[6],'-'); | |
89 | my($seqid, $gene, $prod) = ($x[0], $x[2], $x[2]); | |
90 | ||
91 | # assumes NAME field in .HMM file is of form "16s_rRNA" etc | |
92 | $prod =~ s/_r/ ribosomal /; # convert the short ID to an English string | |
93 | ||
94 | # check for incomplete alignments | |
95 | my $len = $end-$begin+1; | |
96 | if ($len < int($lencutoff * $LENG{$gene}) ) { | |
97 | $prod .= " (partial)"; | |
98 | } | |
99 | ||
100 | msg("Found:", $gene, $seqid, "L=$len/$LENG{$gene}", "$begin..$end", $strand, $prod); | |
101 | push @feat, [ | |
102 | $seqid, "$EXE:$VERSION", 'rRNA', $begin, $end, '.', $strand, '.', "Name=$gene;product=$prod" | |
103 | ]; | |
104 | } | |
105 | ||
106 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
107 | # output a sorted GFF3 | |
108 | ||
109 | sub gff_sort { | |
110 | # sort by seqid, then start pos | |
111 | return ($a->[0] cmp $b->[0]) || ($a->[3] <=> $b->[3]); | |
112 | } | |
113 | ||
114 | msg("Found", scalar(@feat), "ribosomal RNA features."); | |
115 | msg("Sorting features and outputting GFF3..."); | |
116 | print "##gff-version 3\n"; | |
117 | for my $row (sort { gff_sort } @feat) { | |
118 | print join("\t", @$row),"\n"; | |
119 | } | |
120 | if ($incseq) { | |
121 | print "##FASTA\n"; | |
122 | open FASTA, $fasta; | |
123 | print while (<FASTA>); # `cat $fasta` | |
124 | } | |
125 | ||
126 | #---------------------------------------------------------------------- | |
127 | ||
128 | sub msg { | |
129 | return if $quiet; | |
130 | my $t = localtime; | |
131 | my $line = "[".$t->hms."] @_\n"; | |
132 | print STDERR $line; | |
133 | } | |
134 | ||
135 | #---------------------------------------------------------------------- | |
136 | ||
137 | sub err { | |
138 | $quiet=0; | |
139 | msg(@_); | |
140 | exit(2); | |
141 | } | |
142 | ||
143 | #---------------------------------------------------------------------- | |
144 | ||
145 | sub version { | |
146 | print STDERR "$EXE $VERSION\n"; | |
147 | exit; | |
148 | } | |
149 | ||
150 | #---------------------------------------------------------------------- | |
151 | ||
152 | sub show_citation { | |
153 | print STDERR << "EOCITE"; | |
154 | ||
155 | If you use Barrnap in your work, please cite: | |
156 | ||
157 | Seemann T (2013) | |
158 | $EXE $VERSION : $DESC | |
159 | $URL | |
160 | ||
161 | Thank you. | |
162 | ||
163 | EOCITE | |
164 | ||
165 | exit; | |
166 | } | |
167 | ||
168 | #---------------------------------------------------------------------- | |
169 | # Option setting routines | |
170 | ||
171 | sub setOptions { | |
172 | use Getopt::Long; | |
173 | ||
174 | @Options = ( | |
175 | 'Options:', | |
176 | {OPT=>"help", VAR=>\&usage, DESC=>"This help"}, | |
177 | {OPT=>"version", VAR=>\&version, DESC=>"Print version and exit"}, | |
178 | {OPT=>"citation",VAR=>\&show_citation, DESC=>"Print citation for referencing $EXE"}, | |
179 | {OPT=>"kingdom=s", VAR=>\$kingdom, DEFAULT=>'bacteria', DESC=>"Kingdom: [b]acteria [a]rchaea"}, | |
180 | {OPT=>"quiet!", VAR=>\$quiet, DEFAULT=>0, DESC=>"No screen output"}, | |
181 | {OPT=>"threads=i", VAR=>\$threads, DEFAULT=>8, DESC=>"Number of threads/cores/CPUs to use"}, | |
182 | {OPT=>"lencutoff=f",VAR=>\$lencutoff, DEFAULT=>0.8, DESC=>"Proportional length threshold to tag as pseudo"}, | |
183 | {OPT=>"evalue=f",VAR=>\$evalue, DEFAULT=>1E-6, DESC=>"Similarity e-value cut-off"}, | |
184 | {OPT=>"incseq!", VAR=>\$incseq, DEFAULT=>0, DESC=>"Include FASTA input sequences in GFF3 output"}, | |
185 | ); | |
186 | ||
187 | (!@ARGV) && (usage()); | |
188 | ||
189 | &GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref } @Options) || usage(); | |
190 | ||
191 | # Now setup default values. | |
192 | foreach (@Options) { | |
193 | if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) { | |
194 | ${$_->{VAR}} = $_->{DEFAULT}; | |
195 | } | |
196 | } | |
197 | } | |
198 | ||
199 | #---------------------------------------------------------------------- | |
200 | ||
201 | sub usage { | |
202 | print STDERR "Synopsis:\n $EXE $VERSION - $DESC\n"; | |
203 | print STDERR "Author:\n $AUTHOR\n"; | |
204 | print STDERR "Usage:\n $EXE [options] <chromosomes.fasta>\n"; | |
205 | foreach (@Options) { | |
206 | if (ref) { | |
207 | my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : ""; | |
208 | $def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/; | |
209 | my $opt = $_->{OPT}; | |
210 | $opt =~ s/!$//; | |
211 | $opt =~ s/=s$/ [X]/; | |
212 | $opt =~ s/=i$/ [N]/; | |
213 | $opt =~ s/=f$/ [n.n]/; | |
214 | printf STDERR " --%-15s %s%s\n", $opt, $_->{DESC}, $def; | |
215 | } | |
216 | else { | |
217 | print STDERR "$_\n"; | |
218 | } | |
219 | } | |
220 | exit(1); | |
221 | } | |
222 | ||
223 | #---------------------------------------------------------------------- |
0 | #!/usr/bin/env perl | |
1 | use strict; | |
2 | use warnings; | |
3 | use Time::Piece; | |
4 | use FindBin; | |
5 | ||
6 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
7 | # global variables | |
8 | ||
9 | my $EXE = $FindBin::RealScript; | |
10 | my $VERSION = "0.4-dev"; | |
11 | my $DESC = "Bacterial/Archaeal Ribosomal RNA Predictor"; | |
12 | my $AUTHOR = 'Torsten Seemann <torsten.seemann@monash.edu>'; | |
13 | my $URL = 'http://www.vicbioinformatics.com/'; | |
14 | my $DBDIR = "$FindBin::RealBin/../db"; | |
15 | my $OPSYS = $^O; | |
16 | my $NHMMER = "$FindBin::RealBin/../binaries/$OPSYS/nhmmer"; | |
17 | my %LENG = ("5S_rRNA"=>119, "16S_rRNA"=>1585, "23S_rRNA"=>3232); | |
18 | ||
19 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
20 | # command line options | |
21 | ||
22 | my(@Options, $quiet, $kingdom, $threads, $evalue, $lencutoff, $incseq); | |
23 | setOptions(); | |
24 | ||
25 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
26 | # check all is well | |
27 | ||
28 | msg("This is $EXE $VERSION"); | |
29 | msg("Written by $AUTHOR"); | |
30 | msg("Obtained from $URL"); | |
31 | msg("Detected operating system: $OPSYS"); | |
32 | ||
33 | if (! -x $NHMMER) { | |
34 | err("No binary for your OS '$OPSYS' is included. If you have one, copy it to $NHMMER."); | |
35 | } | |
36 | msg("Using HMMER binary: $NHMMER"); | |
37 | ||
38 | #if (! -r "$DATABASE.h3i") { | |
39 | # err("The pressed HMM files are missing. Go to the barnapp folder and type 'make database'"); | |
40 | #} | |
41 | #msg("Found database files: $DATABASE.*"); | |
42 | ||
43 | $threads > 0 or err("Invalid --threads $threads"); | |
44 | msg("Will use $threads threads"); | |
45 | ||
46 | $evalue > 0 or err("Invalid --evalue $evalue"); | |
47 | msg("Setting evalue cutoff to $evalue"); | |
48 | ||
49 | $lencutoff > 0 or err("Invalid --lencutoff $lencutoff"); | |
50 | msg("Will tag genes < $lencutoff x full length as partial genes"); | |
51 | ||
52 | if ($kingdom =~ m/^b/i) { | |
53 | $kingdom = 'bacteria'; | |
54 | } | |
55 | elsif ($kingdom =~ m/^a/i) { | |
56 | $kingdom = 'archaea'; | |
57 | } | |
58 | else { | |
59 | err("I don't recognise the --kingdom '$kingdom'. Try 'bacteria' or 'archaea'."); | |
60 | } | |
61 | ||
62 | my $hmmdb = "$DBDIR/$kingdom.hmm"; | |
63 | err("Can't find database: $hmmdb") unless -r $hmmdb; | |
64 | msg("Using database: $hmmdb"); | |
65 | ||
66 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
67 | # run the external command | |
68 | ||
69 | my $fasta = shift @ARGV; | |
70 | $fasta && -r $fasta or err("Usage: $EXE <file.fasta>"); | |
71 | ||
72 | msg("Running $NHMMER on $fasta ... please wait"); | |
73 | my $cmd = "$NHMMER --cpu $threads -E $evalue --w_length 2000 -o /dev/null --tblout /dev/stdout \Q$hmmdb\E \Q$fasta\E"; | |
74 | my @hits = qx($cmd 2>&1); | |
75 | ||
76 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
77 | # process the output | |
78 | ||
79 | my @feat; | |
80 | foreach (@hits) { | |
81 | chomp; | |
82 | err("nhmmer failed to run - $_") if m/fail|error|core dump|bus error/i; | |
83 | next if m/^#/; # comment line | |
84 | next if m/^\s*$/; # empty line | |
85 | my @x = split ' ', $_; | |
86 | err("bad line in nhmmer output - @x") unless defined $x[6] and $x[6] =~ m/^\d+$/; | |
87 | ||
88 | my($begin,$end,$strand) = $x[6] < $x[7] ? ($x[6],$x[7],'+') : ($x[7],$x[6],'-'); | |
89 | my($seqid, $gene, $prod) = ($x[0], $x[2], $x[2]); | |
90 | ||
91 | # assumes NAME field in .HMM file is of form "16s_rRNA" etc | |
92 | $prod =~ s/_r/ ribosomal /; # convert the short ID to an English string | |
93 | ||
94 | # check for incomplete alignments | |
95 | my $len = $end-$begin+1; | |
96 | if ($len < int($lencutoff * $LENG{$gene}) ) { | |
97 | $prod .= " (partial)"; | |
98 | } | |
99 | ||
100 | msg("Found:", $gene, $seqid, "L=$len/$LENG{$gene}", "$begin..$end", $strand, $prod); | |
101 | push @feat, [ | |
102 | $seqid, "$EXE:$VERSION", 'rRNA', $begin, $end, '.', $strand, '.', "Name=$gene;product=$prod" | |
103 | ]; | |
104 | } | |
105 | ||
106 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | |
107 | # output a sorted GFF3 | |
108 | ||
109 | sub gff_sort { | |
110 | # sort by seqid, then start pos | |
111 | return ($a->[0] cmp $b->[0]) || ($a->[3] <=> $b->[3]); | |
112 | } | |
113 | ||
114 | msg("Found", scalar(@feat), "ribosomal RNA features."); | |
115 | msg("Sorting features and outputting GFF3..."); | |
116 | print "##gff-version 3\n"; | |
117 | for my $row (sort { gff_sort } @feat) { | |
118 | print join("\t", @$row),"\n"; | |
119 | } | |
120 | if ($incseq) { | |
121 | print "##FASTA\n"; | |
122 | open FASTA, $fasta; | |
123 | print while (<FASTA>); # `cat $fasta` | |
124 | } | |
125 | ||
126 | #---------------------------------------------------------------------- | |
127 | ||
128 | sub msg { | |
129 | return if $quiet; | |
130 | my $t = localtime; | |
131 | my $line = "[".$t->hms."] @_\n"; | |
132 | print STDERR $line; | |
133 | } | |
134 | ||
135 | #---------------------------------------------------------------------- | |
136 | ||
137 | sub err { | |
138 | $quiet=0; | |
139 | msg(@_); | |
140 | exit(2); | |
141 | } | |
142 | ||
143 | #---------------------------------------------------------------------- | |
144 | ||
145 | sub version { | |
146 | print STDERR "$EXE $VERSION\n"; | |
147 | exit; | |
148 | } | |
149 | ||
150 | #---------------------------------------------------------------------- | |
151 | ||
152 | sub show_citation { | |
153 | print STDERR << "EOCITE"; | |
154 | ||
155 | If you use Barrnap in your work, please cite: | |
156 | ||
157 | Seemann T (2013) | |
158 | $EXE $VERSION : $DESC | |
159 | $URL | |
160 | ||
161 | Thank you. | |
162 | ||
163 | EOCITE | |
164 | ||
165 | exit; | |
166 | } | |
167 | ||
168 | #---------------------------------------------------------------------- | |
169 | # Option setting routines | |
170 | ||
171 | sub setOptions { | |
172 | use Getopt::Long; | |
173 | ||
174 | @Options = ( | |
175 | 'Options:', | |
176 | {OPT=>"help", VAR=>\&usage, DESC=>"This help"}, | |
177 | {OPT=>"version", VAR=>\&version, DESC=>"Print version and exit"}, | |
178 | {OPT=>"citation",VAR=>\&show_citation, DESC=>"Print citation for referencing $EXE"}, | |
179 | {OPT=>"kingdom=s", VAR=>\$kingdom, DEFAULT=>'bacteria', DESC=>"Kingdom: [b]acteria [a]rchaea"}, | |
180 | {OPT=>"quiet!", VAR=>\$quiet, DEFAULT=>0, DESC=>"No screen output"}, | |
181 | {OPT=>"threads=i", VAR=>\$threads, DEFAULT=>8, DESC=>"Number of threads/cores/CPUs to use"}, | |
182 | {OPT=>"lencutoff=f",VAR=>\$lencutoff, DEFAULT=>0.8, DESC=>"Proportional length threshold to tag as pseudo"}, | |
183 | {OPT=>"evalue=f",VAR=>\$evalue, DEFAULT=>1E-6, DESC=>"Similarity e-value cut-off"}, | |
184 | {OPT=>"incseq!", VAR=>\$incseq, DEFAULT=>0, DESC=>"Include FASTA input sequences in GFF3 output"}, | |
185 | ); | |
186 | ||
187 | (!@ARGV) && (usage()); | |
188 | ||
189 | &GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref } @Options) || usage(); | |
190 | ||
191 | # Now setup default values. | |
192 | foreach (@Options) { | |
193 | if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) { | |
194 | ${$_->{VAR}} = $_->{DEFAULT}; | |
195 | } | |
196 | } | |
197 | } | |
198 | ||
199 | #---------------------------------------------------------------------- | |
200 | ||
201 | sub usage { | |
202 | print STDERR "Synopsis:\n $EXE $VERSION - $DESC\n"; | |
203 | print STDERR "Author:\n $AUTHOR\n"; | |
204 | print STDERR "Usage:\n $EXE [options] <chromosomes.fasta>\n"; | |
205 | foreach (@Options) { | |
206 | if (ref) { | |
207 | my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : ""; | |
208 | $def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/; | |
209 | my $opt = $_->{OPT}; | |
210 | $opt =~ s/!$//; | |
211 | $opt =~ s/=s$/ [X]/; | |
212 | $opt =~ s/=i$/ [N]/; | |
213 | $opt =~ s/=f$/ [n.n]/; | |
214 | printf STDERR " --%-15s %s%s\n", $opt, $_->{DESC}, $def; | |
215 | } | |
216 | else { | |
217 | print STDERR "$_\n"; | |
218 | } | |
219 | } | |
220 | exit(1); | |
221 | } | |
222 | ||
223 | #---------------------------------------------------------------------- |
Binary diff not shown
Binary diff not shown