Codebase list barrnap / upstream/0.8 bin / barrnap
upstream/0.8

Tree @upstream/0.8 (Download .tar.gz)

barrnap @upstream/0.8raw · history · blame

#!/usr/bin/env perl
use strict;
use warnings;
use Time::Piece;
use List::Util qw(max);
use FindBin;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# global variables

my $EXE = $FindBin::RealScript;
my $VERSION = "0.8";
my $DESC = "rapid ribosomal RNA prediction";
my $AUTHOR = 'Torsten Seemann <torsten.seemann@gmail.com>';
my $URL = 'https://github.com/tseemann/barrnap';
my $DBDIR = "$FindBin::RealBin/../db";
my $OPSYS = $^O;
my $BINDIR = "$FindBin::RealBin/../binaries/$OPSYS";

my %KINGDOM = (map { substr($_,0,1) => $_ } qw(bac arc euk mito));
my %LENG = (
  "5S_rRNA"  =>119, "16S_rRNA"=>1585, "23S_rRNA"=>3232,
  "5_8S_rRNA"=>156, "18S_rRNA"=>1869, "28S_rRNA"=>2912, "12S_rRNA"=>954,
);
my $MAXLEN = int( 1.2 * max(values %LENG) );

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# command line options

my(@Options, $quiet, $kingdom, $threads, $evalue, $lencutoff, $reject, $incseq);
setOptions();

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# check all is well

msg("This is $EXE $VERSION");
msg("Written by $AUTHOR");
msg("Obtained from $URL");
msg("Detected operating system: $OPSYS");
msg("Adding $BINDIR to end of PATH");
$ENV{PATH} .= ":$BINDIR";

my($NHMMER) = qx(which -a nhmmer 2> /dev/null);
$NHMMER or err("Could not find 'nhmmer' executable in PATH");
chomp $NHMMER;
msg("Using HMMER binary: $NHMMER");

$threads > 0 or err("Invalid --threads $threads");
msg("Will use $threads threads");

$evalue > 0 or err("Invalid --evalue $evalue");
msg("Setting evalue cutoff to $evalue");

$lencutoff > 0 or err("Invalid --lencutoff $lencutoff");
msg("Will tag genes < $lencutoff of expected length.");

$reject > 0 or err("Invalid --reject cutoff $reject");
msg("Will reject genes < $reject of expected length.");

my $kdom = $KINGDOM{ lc substr($kingdom,0,1) } or
  err("I don't recognise --kingdom '$kingdom'. Try: bac arc euk mito");

my $hmmdb = "$DBDIR/$kdom.hmm";
err("Can't find database: $hmmdb") unless -r $hmmdb;
msg("Using database: $hmmdb");

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# run the external command

my $fasta = shift @ARGV;
$fasta && -r $fasta or err("Usage: $EXE <file.fasta>");

msg("Scanning $fasta for $kdom rRNA genes... please wait");
my $cmd = "$NHMMER --cpu $threads -E $evalue --w_length $MAXLEN  -o /dev/null --tblout /dev/stdout \Q$hmmdb\E \Q$fasta\E";
msg("Command: $cmd");
my @hits = qx($cmd 2>&1);

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# process the output

my @feat;
HIT: 
foreach (@hits) {
  chomp;
  err("nhmmer failed to run - $_") if m/fail|error|core dump|bus error/i;
  next if m/^#/;    # comment line
  next if m/^\s*$/; # empty line
  my @x = split ' ', $_;
  err("bad line in nhmmer output - @x") unless defined $x[6] and $x[6] =~ m/^\d+$/;

  my($begin,$end,$strand) = $x[6] < $x[7] ? ($x[6],$x[7],'+') : ($x[7],$x[6],'-');
  my($seqid, $gene, $prod) = ($x[0], $x[2], $x[2]);
  my $score = defined $x[12]  ? $x[12] : '.';

  # check if hit makes sense to us
  exists $LENG{$gene} or err("Detected unknown gene '$gene' in scan, aborting.");

  # assumes NAME field in .HMM file is of form "16s_rRNA" etc
  $prod =~ s/_r/ ribosomal /; # convert the short ID to an English string
  $prod =~ s/5_8/5.8/;        # correct naming for 5.8S

  # check for incomplete alignments
  my $note = '';
  my $len = $end-$begin+1;
  
  if ( $len < int($reject * $LENG{$gene}) ) {
    msg("Rejecting short $len nt predicted $gene. Adjust via --reject option.");
    next HIT;
  }
  elsif ( $len < int($lencutoff * $LENG{$gene}) ) {
    $note = sprintf "aligned only %d percent of the $prod", (100*$len/$LENG{$gene});
    $prod .= " (partial)";
  }

  msg("Found:", $gene, $seqid, "L=$len/$LENG{$gene}", "$begin..$end", $strand, $prod);
  my $tags = "Name=$gene;product=$prod";
  $tags .= ";note=$note" if $note;
  push @feat, [
    $seqid, "$EXE:$VERSION", 'rRNA', $begin, $end, $score, $strand, '.', $tags,
  ];
}

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# output a sorted GFF3

sub gff_sort {
  # sort by seqid, then start pos
  return ($a->[0] cmp $b->[0]) || ($a->[3] <=> $b->[3]);
}

msg("Found", scalar(@feat), "ribosomal RNA features.");
msg("Sorting features and outputting GFF3...");
print "##gff-version 3\n";
for my $row (sort { gff_sort } @feat) {
  print join("\t", @$row),"\n";
}
if ($incseq) {
  print "##FASTA\n";
  open FASTA, '<', $fasta;
  print while (<FASTA>);  # `cat $fasta`
}

#----------------------------------------------------------------------

sub msg {
  return if $quiet;
  my $t = localtime;
  my $line = "[".$t->hms."] @_\n";
  print STDERR $line;
}

#----------------------------------------------------------------------

sub err {
  $quiet=0;
  msg(@_);
  exit(2);
}

#----------------------------------------------------------------------

sub version {
  print STDERR "$EXE $VERSION\n";
  exit;
}

#----------------------------------------------------------------------

sub show_citation {
  print STDERR << "EOCITE";
  
If you use Barrnap in your work, please cite:

    Seemann T (2013)
    $EXE $VERSION : $DESC
    $URL
    
Thank you.

EOCITE

  exit;
}

#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    'Options:',
    {OPT=>"help",    VAR=>\&usage,             DESC=>"This help"},
    {OPT=>"version", VAR=>\&version,           DESC=>"Print version and exit"},
    {OPT=>"citation",VAR=>\&show_citation,     DESC=>"Print citation for referencing $EXE"},
    {OPT=>"kingdom=s", VAR=>\$kingdom,  DEFAULT=>'bac', 
       DESC=>"Kingdom: ".join(' ', values %KINGDOM) },
    {OPT=>"quiet!",  VAR=>\$quiet, DEFAULT=>0, DESC=>"No screen output"},
    {OPT=>"threads=i",  VAR=>\$threads, DEFAULT=>8,  DESC=>"Number of threads/cores/CPUs to use"},
    {OPT=>"lencutoff=f",VAR=>\$lencutoff, DEFAULT=>0.8, DESC=>"Proportional length threshold to label as partial"},
    {OPT=>"reject=f",VAR=>\$reject, DEFAULT=>0.5, DESC=>"Proportional length threshold to reject prediction"},
    {OPT=>"evalue=f",VAR=>\$evalue, DEFAULT=>1E-6, DESC=>"Similarity e-value cut-off"},
    {OPT=>"incseq!",  VAR=>\$incseq, DEFAULT=>0, DESC=>"Include FASTA input sequences in GFF3 output"},
  );

  (!@ARGV) && (usage());

  &GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref } @Options) || usage();

  # Now setup default values.
  foreach (@Options) {
    if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

#----------------------------------------------------------------------

sub usage {
  print STDERR "Synopsis:\n  $EXE $VERSION - $DESC\n";
  print STDERR "Author:\n  $AUTHOR\n";
  print STDERR "Usage:\n  $EXE [options] <chromosomes.fasta>\n";
  foreach (@Options) {
    if (ref) {
      my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
      $def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/;
      my $opt = $_->{OPT};
      $opt =~ s/!$//; 
      $opt =~ s/=s$/ [X]/; 
      $opt =~ s/=i$/ [N]/;
      $opt =~ s/=f$/ [n.n]/;
      printf STDERR "  --%-15s %s%s\n", $opt, $_->{DESC}, $def;
    }
    else {
      print STDERR "$_\n";
    }      
  }
  exit(1);
}

#----------------------------------------------------------------------