Codebase list barrnap / f2841301-f059-4bd9-8c12-75fb4174e5a1/main bin / barrnap
f2841301-f059-4bd9-8c12-75fb4174e5a1/main

Tree @f2841301-f059-4bd9-8c12-75fb4174e5a1/main (Download .tar.gz)

barrnap @f2841301-f059-4bd9-8c12-75fb4174e5a1/mainraw · history · blame

#!/usr/bin/env perl
use strict;
use warnings;
use List::Util qw(max);
use FindBin;
use File::Temp;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# global variables

my $VERSION = "0.9";
my $EXE = $FindBin::RealScript;
my $DESC = "rapid ribosomal RNA prediction";
my $AUTHOR = 'Torsten Seemann';
my $URL = 'https://github.com/tseemann/barrnap';
my $DBDIR = "$FindBin::RealBin/../db";
my $OPSYS = $^O;
my $BINDIR = "$FindBin::RealBin/../binaries/$OPSYS";

my %KINGDOM = (map { substr($_,0,1) => $_ } qw(bac arc euk mito));
my %LENG = (
  "5S_rRNA"  =>119, "16S_rRNA"=>1585, "23S_rRNA"=>3232,
  "5_8S_rRNA"=>156, "18S_rRNA"=>1869, "28S_rRNA"=>2912, "12S_rRNA"=>954,
);
my $MAXLEN = int( 1.2 * max(values %LENG) );

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# command line options

my(@Options, $quiet, $kingdom, $threads, $evalue, $lencutoff, $reject, $incseq, $outseq);
setOptions();

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# check all is well

msg("This is $EXE $VERSION");
msg("Written by $AUTHOR");
msg("Obtained from $URL");
msg("Detected operating system: $OPSYS");
msg("Adding $BINDIR to end of PATH");
$ENV{PATH} .= ":$BINDIR";

msg("Checking for dependencies:");
require_exe('nhmmer', 'bedtools');

$threads > 0 or err("Invalid --threads $threads");
msg("Will use $threads threads");

$evalue > 0 or err("Invalid --evalue $evalue");
msg("Setting evalue cutoff to $evalue");

$lencutoff > 0 or err("Invalid --lencutoff $lencutoff");
msg("Will tag genes < $lencutoff of expected length.");

$reject > 0 or err("Invalid --reject cutoff $reject");
msg("Will reject genes < $reject of expected length.");

my $kdom = $KINGDOM{ lc substr($kingdom,0,1) } or
  err("I don't recognise --kingdom '$kingdom'. Try:", values(%KINGDOM) );

my $hmmdb = "$DBDIR/$kdom.hmm";
err("Can't find database: $hmmdb") unless -r $hmmdb;
msg("Using database: $hmmdb");

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# check if user is piping to STDIN
# nhmmer needs to fseek() so we make a temp fasta file 

my $fasta = shift @ARGV;
my $tmpfh;
if (defined($fasta) && $fasta eq '-'  or  !defined($fasta) && !-t \*STDIN) {
  $tmpfh = File::Temp->new(UNLINK=>1);
  msg("Copying STDIN to a temporary file:", $tmpfh->filename);
  while (<STDIN>) {
    print $tmpfh $_;
  }
  $fasta = $tmpfh->filename;
}
$fasta && -r $fasta or err("No input file on command line or stdin");

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# run the external command

msg("Scanning $fasta for $kdom rRNA genes... please wait");
my $opts = "--cpu $threads -E $evalue --w_length $MAXLEN -o /dev/null --tblout /dev/stdout";
my $cmd = "nhmmer $opts '$hmmdb' '$fasta'";
msg("Command: $cmd");
my @hits = qx($cmd 2>&1);

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# process the output

my %hitname;
my @feat;
my @bed;
HIT: 
foreach (@hits) {
  chomp;
  err("nhmmer failed to run - $_") if m/fail|error|core dump|bus error/i;
  next if m/^#/;    # comment line
  next if m/^\s*$/; # empty line
  my @x = split ' ', $_;
  err("bad line in nhmmer output - @x") unless defined $x[6] and $x[6] =~ m/^\d+$/;

  # massage to GFF
  my($begin,$end,$strand) = $x[6] < $x[7] ? ($x[6],$x[7],'+') : ($x[7],$x[6],'-');
  my($seqid, $gene, $prod) = ($x[0], $x[2], $x[2]);
  my $score = defined $x[12]  ? $x[12] : '.';

  # record hits for --outseq retrieval later
  $hitname{"$seqid/$begin:$end($strand)"} = $prod;

  # check if hit makes sense to us
  exists $LENG{$gene} or err("Detected unknown gene '$gene' in scan, aborting.");

  # assumes NAME field in .HMM file is of form "16s_rRNA" etc
  $prod =~ s/_r/ ribosomal /; # convert the short ID to an English string
  $prod =~ s/5_8/5.8/;        # correct naming for 5.8S

  # check for incomplete alignments
  my $note = '';
  my $len = $end-$begin+1;
  
  if ( $len < int($reject * $LENG{$gene}) ) {
    msg("Rejecting short $len nt predicted $gene. Adjust via --reject option.");
    next HIT;
  }
  elsif ( $len < int($lencutoff * $LENG{$gene}) ) {
    $note = sprintf "aligned only %d percent of the $prod", (100*$len/$LENG{$gene});
    $prod .= " (partial)";
  }

  # keep track of good hits for retrievel later
  push @bed, [ $seqid, $begin-1, $end, $gene, 100, $strand ];

  msg("Found:", $gene, $seqid, "L=$len/$LENG{$gene}", "$begin..$end", $strand, $prod);
  my $tags = "Name=$gene;product=$prod";
  $tags .= ";note=$note" if $note;
  push @feat, [
    $seqid, "$EXE:$VERSION", 'rRNA', $begin, $end, $score, $strand, '.', $tags,
  ];
}

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# output a sorted GFF3

sub gff_sort {
  # sort by seqid, then start pos
  return ($a->[0] cmp $b->[0]) || ($a->[3] <=> $b->[3]);
}

msg("Found", scalar(@feat), "ribosomal RNA features.");
msg("Sorting features and outputting GFF3...");
print "##gff-version 3\n";
for my $row (sort { gff_sort } @feat) {
  print join("\t", @$row),"\n";
}
if ($incseq) {
  print "##FASTA\n";
  open FASTA, '<', $fasta;
  print while (<FASTA>);  # `cat $fasta`
}

if ($outseq) {
  msg("Writing hit sequences to: $outseq");
  my $bed = File::Temp->new();
  for my $b (@bed) {
    print $bed join("\t", @$b),"\n";
  }
  $bed->seek(0, SEEK_END); # rewind
  my $cmd = "bedtools getfasta -s -name+ -fo '$outseq' -fi '$fasta' -bed '".$bed->filename."'";
  msg("Running: $cmd");
  system($cmd);
}

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# final cleanup

msg("Done.");

#----------------------------------------------------------------------
sub require_exe {
  for my $exe (@_) {
    my($which) = qx(which $exe 2> /dev/null);
    $which or err("Can not find required '$exe' in PATH");
    chomp $which;
    msg("Found $exe - $which");
  }
}

#----------------------------------------------------------------------
sub revcom {
  my($s) = @_;
  $s = reverse($s);
  $s =~ tr/ATCGatcg/TAGCtagc/;
  return $s;
}

#----------------------------------------------------------------------
sub msg {
  return if $quiet;
  my $line = "[$EXE] @_\n";
  print STDERR $line;
}

#----------------------------------------------------------------------
sub err {
  $quiet=0;
  msg("ERROR:", @_);
  exit(2);
}

#----------------------------------------------------------------------
sub version {
  print STDERR "$EXE $VERSION\n";
  exit;
}

#----------------------------------------------------------------------
sub show_citation {
  print STDERR << "EOCITE";
  
If you use Barrnap in your work, please cite:

    Seemann T
    $EXE $VERSION : $DESC
    $URL
    
Thank you.

EOCITE

  exit;
}

#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    'Options:',
    {OPT=>"help",    VAR=>\&usage,             DESC=>"This help"},
    {OPT=>"version", VAR=>\&version,           DESC=>"Print version and exit"},
    {OPT=>"citation",VAR=>\&show_citation,     DESC=>"Print citation for referencing $EXE"},
    {OPT=>"kingdom=s", VAR=>\$kingdom,  DEFAULT=>'bac', DESC=>"Kingdom: ".join(' ', values %KINGDOM) },
    {OPT=>"quiet!",  VAR=>\$quiet, DEFAULT=>0, DESC=>"No screen output"},
    {OPT=>"threads=i",  VAR=>\$threads, DEFAULT=>1,  DESC=>"Number of threads/cores/CPUs to use"},
    {OPT=>"lencutoff=f",VAR=>\$lencutoff, DEFAULT=>0.8, DESC=>"Proportional length threshold to label as partial"},
    {OPT=>"reject=f",VAR=>\$reject, DEFAULT=>0.25, DESC=>"Proportional length threshold to reject prediction"},
    {OPT=>"evalue=f",VAR=>\$evalue, DEFAULT=>1E-6, DESC=>"Similarity e-value cut-off"},
    {OPT=>"incseq!",  VAR=>\$incseq, DEFAULT=>0, DESC=>"Include FASTA _input_ sequences in GFF3 output"},
    {OPT=>"outseq=s",  VAR=>\$outseq, DEFAULT=>'', DESC=>"Save rRNA hit seqs to this FASTA file"},
  );

#  (!@ARGV) && (usage(1));

  &GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref } @Options) || usage(1);

  # Now setup default values.
  foreach (@Options) {
    if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

#----------------------------------------------------------------------

sub usage {
  my($exitcode) = @_;
  $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref
  $exitcode ||= 0;
  select STDERR if $exitcode; # write to STDERR if exitcode is error

  print "Synopsis:\n  $EXE $VERSION - $DESC\n";
  print "Author:\n  $AUTHOR\n";
  print "Usage:\n";
  print "  $EXE [options] chr.fa\n";
  print "  $EXE [options] < chr.fa\n";
  print "  $EXE [options] - < chr.fa\n";
  foreach (@Options) {
    if (ref) {
      my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
      $def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/;
      my $opt = $_->{OPT};
      $opt =~ s/!$//; 
      $opt =~ s/=s$/ [X]/; 
      $opt =~ s/=i$/ [N]/;
      $opt =~ s/=f$/ [n.n]/;
      printf "  --%-15s %s%s\n", $opt, $_->{DESC}, $def;
    }
    else {
      print "$_\n";
    }      
  }
  exit($exitcode);
}

#----------------------------------------------------------------------