#!/usr/bin/env perl
use strict;
use warnings;
use List::Util qw(max);
use FindBin;
use File::Temp;
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# global variables
my $VERSION = "0.9";
my $EXE = $FindBin::RealScript;
my $DESC = "rapid ribosomal RNA prediction";
my $AUTHOR = 'Torsten Seemann';
my $URL = 'https://github.com/tseemann/barrnap';
my $DBDIR = "$FindBin::RealBin/../db";
my $OPSYS = $^O;
my $BINDIR = "$FindBin::RealBin/../binaries/$OPSYS";
my %KINGDOM = (map { substr($_,0,1) => $_ } qw(bac arc euk mito));
my %LENG = (
"5S_rRNA" =>119, "16S_rRNA"=>1585, "23S_rRNA"=>3232,
"5_8S_rRNA"=>156, "18S_rRNA"=>1869, "28S_rRNA"=>2912, "12S_rRNA"=>954,
);
my $MAXLEN = int( 1.2 * max(values %LENG) );
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# command line options
my(@Options, $quiet, $kingdom, $threads, $evalue, $lencutoff, $reject, $incseq, $outseq);
setOptions();
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# check all is well
msg("This is $EXE $VERSION");
msg("Written by $AUTHOR");
msg("Obtained from $URL");
msg("Detected operating system: $OPSYS");
msg("Adding $BINDIR to end of PATH");
$ENV{PATH} .= ":$BINDIR";
msg("Checking for dependencies:");
require_exe('nhmmer', 'bedtools');
$threads > 0 or err("Invalid --threads $threads");
msg("Will use $threads threads");
$evalue > 0 or err("Invalid --evalue $evalue");
msg("Setting evalue cutoff to $evalue");
$lencutoff > 0 or err("Invalid --lencutoff $lencutoff");
msg("Will tag genes < $lencutoff of expected length.");
$reject > 0 or err("Invalid --reject cutoff $reject");
msg("Will reject genes < $reject of expected length.");
my $kdom = $KINGDOM{ lc substr($kingdom,0,1) } or
err("I don't recognise --kingdom '$kingdom'. Try:", values(%KINGDOM) );
my $hmmdb = "$DBDIR/$kdom.hmm";
err("Can't find database: $hmmdb") unless -r $hmmdb;
msg("Using database: $hmmdb");
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# check if user is piping to STDIN
# nhmmer needs to fseek() so we make a temp fasta file
my $fasta = shift @ARGV;
my $tmpfh;
if (defined($fasta) && $fasta eq '-' or !defined($fasta) && !-t \*STDIN) {
$tmpfh = File::Temp->new(UNLINK=>1);
msg("Copying STDIN to a temporary file:", $tmpfh->filename);
while (<STDIN>) {
print $tmpfh $_;
}
$fasta = $tmpfh->filename;
}
$fasta && -r $fasta or err("No input file on command line or stdin");
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# run the external command
msg("Scanning $fasta for $kdom rRNA genes... please wait");
my $opts = "--cpu $threads -E $evalue --w_length $MAXLEN -o /dev/null --tblout /dev/stdout";
my $cmd = "nhmmer $opts '$hmmdb' '$fasta'";
msg("Command: $cmd");
my @hits = qx($cmd 2>&1);
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# process the output
my %hitname;
my @feat;
my @bed;
HIT:
foreach (@hits) {
chomp;
err("nhmmer failed to run - $_") if m/fail|error|core dump|bus error/i;
next if m/^#/; # comment line
next if m/^\s*$/; # empty line
my @x = split ' ', $_;
err("bad line in nhmmer output - @x") unless defined $x[6] and $x[6] =~ m/^\d+$/;
# massage to GFF
my($begin,$end,$strand) = $x[6] < $x[7] ? ($x[6],$x[7],'+') : ($x[7],$x[6],'-');
my($seqid, $gene, $prod) = ($x[0], $x[2], $x[2]);
my $score = defined $x[12] ? $x[12] : '.';
# record hits for --outseq retrieval later
$hitname{"$seqid/$begin:$end($strand)"} = $prod;
# check if hit makes sense to us
exists $LENG{$gene} or err("Detected unknown gene '$gene' in scan, aborting.");
# assumes NAME field in .HMM file is of form "16s_rRNA" etc
$prod =~ s/_r/ ribosomal /; # convert the short ID to an English string
$prod =~ s/5_8/5.8/; # correct naming for 5.8S
# check for incomplete alignments
my $note = '';
my $len = $end-$begin+1;
if ( $len < int($reject * $LENG{$gene}) ) {
msg("Rejecting short $len nt predicted $gene. Adjust via --reject option.");
next HIT;
}
elsif ( $len < int($lencutoff * $LENG{$gene}) ) {
$note = sprintf "aligned only %d percent of the $prod", (100*$len/$LENG{$gene});
$prod .= " (partial)";
}
# keep track of good hits for retrievel later
push @bed, [ $seqid, $begin-1, $end, $gene, 100, $strand ];
msg("Found:", $gene, $seqid, "L=$len/$LENG{$gene}", "$begin..$end", $strand, $prod);
my $tags = "Name=$gene;product=$prod";
$tags .= ";note=$note" if $note;
push @feat, [
$seqid, "$EXE:$VERSION", 'rRNA', $begin, $end, $score, $strand, '.', $tags,
];
}
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# output a sorted GFF3
sub gff_sort {
# sort by seqid, then start pos
return ($a->[0] cmp $b->[0]) || ($a->[3] <=> $b->[3]);
}
msg("Found", scalar(@feat), "ribosomal RNA features.");
msg("Sorting features and outputting GFF3...");
print "##gff-version 3\n";
for my $row (sort { gff_sort } @feat) {
print join("\t", @$row),"\n";
}
if ($incseq) {
print "##FASTA\n";
open FASTA, '<', $fasta;
print while (<FASTA>); # `cat $fasta`
}
if ($outseq) {
msg("Writing hit sequences to: $outseq");
my $bed = File::Temp->new();
for my $b (@bed) {
print $bed join("\t", @$b),"\n";
}
$bed->seek(0, SEEK_END); # rewind
my $cmd = "bedtools getfasta -s -name+ -fo '$outseq' -fi '$fasta' -bed '".$bed->filename."'";
msg("Running: $cmd");
system($cmd);
}
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# final cleanup
msg("Done.");
#----------------------------------------------------------------------
sub require_exe {
for my $exe (@_) {
my($which) = qx(which $exe 2> /dev/null);
$which or err("Can not find required '$exe' in PATH");
chomp $which;
msg("Found $exe - $which");
}
}
#----------------------------------------------------------------------
sub revcom {
my($s) = @_;
$s = reverse($s);
$s =~ tr/ATCGatcg/TAGCtagc/;
return $s;
}
#----------------------------------------------------------------------
sub msg {
return if $quiet;
my $line = "[$EXE] @_\n";
print STDERR $line;
}
#----------------------------------------------------------------------
sub err {
$quiet=0;
msg("ERROR:", @_);
exit(2);
}
#----------------------------------------------------------------------
sub version {
print STDERR "$EXE $VERSION\n";
exit;
}
#----------------------------------------------------------------------
sub show_citation {
print STDERR << "EOCITE";
If you use Barrnap in your work, please cite:
Seemann T
$EXE $VERSION : $DESC
$URL
Thank you.
EOCITE
exit;
}
#----------------------------------------------------------------------
# Option setting routines
sub setOptions {
use Getopt::Long;
@Options = (
'Options:',
{OPT=>"help", VAR=>\&usage, DESC=>"This help"},
{OPT=>"version", VAR=>\&version, DESC=>"Print version and exit"},
{OPT=>"citation",VAR=>\&show_citation, DESC=>"Print citation for referencing $EXE"},
{OPT=>"kingdom=s", VAR=>\$kingdom, DEFAULT=>'bac', DESC=>"Kingdom: ".join(' ', values %KINGDOM) },
{OPT=>"quiet!", VAR=>\$quiet, DEFAULT=>0, DESC=>"No screen output"},
{OPT=>"threads=i", VAR=>\$threads, DEFAULT=>1, DESC=>"Number of threads/cores/CPUs to use"},
{OPT=>"lencutoff=f",VAR=>\$lencutoff, DEFAULT=>0.8, DESC=>"Proportional length threshold to label as partial"},
{OPT=>"reject=f",VAR=>\$reject, DEFAULT=>0.25, DESC=>"Proportional length threshold to reject prediction"},
{OPT=>"evalue=f",VAR=>\$evalue, DEFAULT=>1E-6, DESC=>"Similarity e-value cut-off"},
{OPT=>"incseq!", VAR=>\$incseq, DEFAULT=>0, DESC=>"Include FASTA _input_ sequences in GFF3 output"},
{OPT=>"outseq=s", VAR=>\$outseq, DEFAULT=>'', DESC=>"Save rRNA hit seqs to this FASTA file"},
);
# (!@ARGV) && (usage(1));
&GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref } @Options) || usage(1);
# Now setup default values.
foreach (@Options) {
if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
${$_->{VAR}} = $_->{DEFAULT};
}
}
}
#----------------------------------------------------------------------
sub usage {
my($exitcode) = @_;
$exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref
$exitcode ||= 0;
select STDERR if $exitcode; # write to STDERR if exitcode is error
print "Synopsis:\n $EXE $VERSION - $DESC\n";
print "Author:\n $AUTHOR\n";
print "Usage:\n";
print " $EXE [options] chr.fa\n";
print " $EXE [options] < chr.fa\n";
print " $EXE [options] - < chr.fa\n";
foreach (@Options) {
if (ref) {
my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
$def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/;
my $opt = $_->{OPT};
$opt =~ s/!$//;
$opt =~ s/=s$/ [X]/;
$opt =~ s/=i$/ [N]/;
$opt =~ s/=f$/ [n.n]/;
printf " --%-15s %s%s\n", $opt, $_->{DESC}, $def;
}
else {
print "$_\n";
}
}
exit($exitcode);
}
#----------------------------------------------------------------------