Codebase list barrnap / c652462a-fa00-465d-98e4-ceb80ad06dc1/upstream bin / barrnap
c652462a-fa00-465d-98e4-ceb80ad06dc1/upstream

Tree @c652462a-fa00-465d-98e4-ceb80ad06dc1/upstream (Download .tar.gz)

barrnap @c652462a-fa00-465d-98e4-ceb80ad06dc1/upstream

ce3f2c4
 
 
 
 
6df15a3
ce3f2c4
 
 
 
6df15a3
ce3f2c4
 
6df15a3
cb478cc
ce3f2c4
 
cb478cc
ce3f2c4
 
 
 
 
 
 
 
 
 
 
6df15a3
ce3f2c4
 
 
 
 
 
 
 
 
cb478cc
 
ce3f2c4
6df15a3
 
ce3f2c4
 
 
 
 
 
 
 
cb478cc
ce3f2c4
 
 
 
 
6df15a3
ce3f2c4
 
 
 
 
 
6df15a3
 
ce3f2c4
 
6df15a3
 
 
 
 
 
 
 
 
 
 
 
 
ce3f2c4
 
6df15a3
 
ce3f2c4
 
 
 
 
 
6df15a3
ce3f2c4
6df15a3
ce3f2c4
 
 
 
 
 
 
 
 
6df15a3
ce3f2c4
 
 
 
6df15a3
 
 
ce3f2c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6df15a3
 
 
ce3f2c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6df15a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce3f2c4
6df15a3
 
 
 
 
 
 
 
ce3f2c4
6df15a3
 
 
 
 
 
 
 
 
ce3f2c4
 
6df15a3
ce3f2c4
 
 
 
 
 
6df15a3
ce3f2c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6df15a3
ce3f2c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6df15a3
ce3f2c4
6df15a3
ce3f2c4
6df15a3
ce3f2c4
6df15a3
 
ce3f2c4
 
6df15a3
ce3f2c4
6df15a3
ce3f2c4
 
 
 
 
 
 
 
 
 
 
 
6df15a3
 
 
 
 
 
 
 
 
 
 
ce3f2c4
 
 
 
 
 
 
 
 
6df15a3
ce3f2c4
 
6df15a3
ce3f2c4
 
6df15a3
ce3f2c4
 
 
#!/usr/bin/env perl
use strict;
use warnings;
use List::Util qw(max);
use FindBin;
use File::Temp;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# global variables

my $VERSION = "0.9";
my $EXE = $FindBin::RealScript;
my $DESC = "rapid ribosomal RNA prediction";
my $AUTHOR = 'Torsten Seemann';
my $URL = 'https://github.com/tseemann/barrnap';
my $DBDIR = "$FindBin::RealBin/../db";
my $OPSYS = $^O;
my $BINDIR = "$FindBin::RealBin/../binaries/$OPSYS";

my %KINGDOM = (map { substr($_,0,1) => $_ } qw(bac arc euk mito));
my %LENG = (
  "5S_rRNA"  =>119, "16S_rRNA"=>1585, "23S_rRNA"=>3232,
  "5_8S_rRNA"=>156, "18S_rRNA"=>1869, "28S_rRNA"=>2912, "12S_rRNA"=>954,
);
my $MAXLEN = int( 1.2 * max(values %LENG) );

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# command line options

my(@Options, $quiet, $kingdom, $threads, $evalue, $lencutoff, $reject, $incseq, $outseq);
setOptions();

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# check all is well

msg("This is $EXE $VERSION");
msg("Written by $AUTHOR");
msg("Obtained from $URL");
msg("Detected operating system: $OPSYS");
msg("Adding $BINDIR to end of PATH");
$ENV{PATH} .= ":$BINDIR";

msg("Checking for dependencies:");
require_exe('nhmmer', 'bedtools');

$threads > 0 or err("Invalid --threads $threads");
msg("Will use $threads threads");

$evalue > 0 or err("Invalid --evalue $evalue");
msg("Setting evalue cutoff to $evalue");

$lencutoff > 0 or err("Invalid --lencutoff $lencutoff");
msg("Will tag genes < $lencutoff of expected length.");

$reject > 0 or err("Invalid --reject cutoff $reject");
msg("Will reject genes < $reject of expected length.");

my $kdom = $KINGDOM{ lc substr($kingdom,0,1) } or
  err("I don't recognise --kingdom '$kingdom'. Try:", values(%KINGDOM) );

my $hmmdb = "$DBDIR/$kdom.hmm";
err("Can't find database: $hmmdb") unless -r $hmmdb;
msg("Using database: $hmmdb");

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# check if user is piping to STDIN
# nhmmer needs to fseek() so we make a temp fasta file 

my $fasta = shift @ARGV;
my $tmpfh;
if (defined($fasta) && $fasta eq '-'  or  !defined($fasta) && !-t \*STDIN) {
  $tmpfh = File::Temp->new(UNLINK=>1);
  msg("Copying STDIN to a temporary file:", $tmpfh->filename);
  while (<STDIN>) {
    print $tmpfh $_;
  }
  $fasta = $tmpfh->filename;
}
$fasta && -r $fasta or err("No input file on command line or stdin");

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# run the external command

msg("Scanning $fasta for $kdom rRNA genes... please wait");
my $opts = "--cpu $threads -E $evalue --w_length $MAXLEN -o /dev/null --tblout /dev/stdout";
my $cmd = "nhmmer $opts '$hmmdb' '$fasta'";
msg("Command: $cmd");
my @hits = qx($cmd 2>&1);

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# process the output

my %hitname;
my @feat;
my @bed;
HIT: 
foreach (@hits) {
  chomp;
  err("nhmmer failed to run - $_") if m/fail|error|core dump|bus error/i;
  next if m/^#/;    # comment line
  next if m/^\s*$/; # empty line
  my @x = split ' ', $_;
  err("bad line in nhmmer output - @x") unless defined $x[6] and $x[6] =~ m/^\d+$/;

  # massage to GFF
  my($begin,$end,$strand) = $x[6] < $x[7] ? ($x[6],$x[7],'+') : ($x[7],$x[6],'-');
  my($seqid, $gene, $prod) = ($x[0], $x[2], $x[2]);
  my $score = defined $x[12]  ? $x[12] : '.';

  # record hits for --outseq retrieval later
  $hitname{"$seqid/$begin:$end($strand)"} = $prod;

  # check if hit makes sense to us
  exists $LENG{$gene} or err("Detected unknown gene '$gene' in scan, aborting.");

  # assumes NAME field in .HMM file is of form "16s_rRNA" etc
  $prod =~ s/_r/ ribosomal /; # convert the short ID to an English string
  $prod =~ s/5_8/5.8/;        # correct naming for 5.8S

  # check for incomplete alignments
  my $note = '';
  my $len = $end-$begin+1;
  
  if ( $len < int($reject * $LENG{$gene}) ) {
    msg("Rejecting short $len nt predicted $gene. Adjust via --reject option.");
    next HIT;
  }
  elsif ( $len < int($lencutoff * $LENG{$gene}) ) {
    $note = sprintf "aligned only %d percent of the $prod", (100*$len/$LENG{$gene});
    $prod .= " (partial)";
  }

  # keep track of good hits for retrievel later
  push @bed, [ $seqid, $begin-1, $end, $gene, 100, $strand ];

  msg("Found:", $gene, $seqid, "L=$len/$LENG{$gene}", "$begin..$end", $strand, $prod);
  my $tags = "Name=$gene;product=$prod";
  $tags .= ";note=$note" if $note;
  push @feat, [
    $seqid, "$EXE:$VERSION", 'rRNA', $begin, $end, $score, $strand, '.', $tags,
  ];
}

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# output a sorted GFF3

sub gff_sort {
  # sort by seqid, then start pos
  return ($a->[0] cmp $b->[0]) || ($a->[3] <=> $b->[3]);
}

msg("Found", scalar(@feat), "ribosomal RNA features.");
msg("Sorting features and outputting GFF3...");
print "##gff-version 3\n";
for my $row (sort { gff_sort } @feat) {
  print join("\t", @$row),"\n";
}
if ($incseq) {
  print "##FASTA\n";
  open FASTA, '<', $fasta;
  print while (<FASTA>);  # `cat $fasta`
}

if ($outseq) {
  msg("Writing hit sequences to: $outseq");
  my $bed = File::Temp->new();
  for my $b (@bed) {
    print $bed join("\t", @$b),"\n";
  }
  $bed->seek(0, SEEK_END); # rewind
  my $cmd = "bedtools getfasta -s -name+ -fo '$outseq' -fi '$fasta' -bed '".$bed->filename."'";
  msg("Running: $cmd");
  system($cmd);
}

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# final cleanup

msg("Done.");

#----------------------------------------------------------------------
sub require_exe {
  for my $exe (@_) {
    my($which) = qx(which $exe 2> /dev/null);
    $which or err("Can not find required '$exe' in PATH");
    chomp $which;
    msg("Found $exe - $which");
  }
}

#----------------------------------------------------------------------
sub revcom {
  my($s) = @_;
  $s = reverse($s);
  $s =~ tr/ATCGatcg/TAGCtagc/;
  return $s;
}

#----------------------------------------------------------------------
sub msg {
  return if $quiet;
  my $line = "[$EXE] @_\n";
  print STDERR $line;
}

#----------------------------------------------------------------------
sub err {
  $quiet=0;
  msg("ERROR:", @_);
  exit(2);
}

#----------------------------------------------------------------------
sub version {
  print STDERR "$EXE $VERSION\n";
  exit;
}

#----------------------------------------------------------------------
sub show_citation {
  print STDERR << "EOCITE";
  
If you use Barrnap in your work, please cite:

    Seemann T
    $EXE $VERSION : $DESC
    $URL
    
Thank you.

EOCITE

  exit;
}

#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    'Options:',
    {OPT=>"help",    VAR=>\&usage,             DESC=>"This help"},
    {OPT=>"version", VAR=>\&version,           DESC=>"Print version and exit"},
    {OPT=>"citation",VAR=>\&show_citation,     DESC=>"Print citation for referencing $EXE"},
    {OPT=>"kingdom=s", VAR=>\$kingdom,  DEFAULT=>'bac', DESC=>"Kingdom: ".join(' ', values %KINGDOM) },
    {OPT=>"quiet!",  VAR=>\$quiet, DEFAULT=>0, DESC=>"No screen output"},
    {OPT=>"threads=i",  VAR=>\$threads, DEFAULT=>1,  DESC=>"Number of threads/cores/CPUs to use"},
    {OPT=>"lencutoff=f",VAR=>\$lencutoff, DEFAULT=>0.8, DESC=>"Proportional length threshold to label as partial"},
    {OPT=>"reject=f",VAR=>\$reject, DEFAULT=>0.25, DESC=>"Proportional length threshold to reject prediction"},
    {OPT=>"evalue=f",VAR=>\$evalue, DEFAULT=>1E-6, DESC=>"Similarity e-value cut-off"},
    {OPT=>"incseq!",  VAR=>\$incseq, DEFAULT=>0, DESC=>"Include FASTA _input_ sequences in GFF3 output"},
    {OPT=>"outseq=s",  VAR=>\$outseq, DEFAULT=>'', DESC=>"Save rRNA hit seqs to this FASTA file"},
  );

#  (!@ARGV) && (usage(1));

  &GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref } @Options) || usage(1);

  # Now setup default values.
  foreach (@Options) {
    if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

#----------------------------------------------------------------------

sub usage {
  my($exitcode) = @_;
  $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref
  $exitcode ||= 0;
  select STDERR if $exitcode; # write to STDERR if exitcode is error

  print "Synopsis:\n  $EXE $VERSION - $DESC\n";
  print "Author:\n  $AUTHOR\n";
  print "Usage:\n";
  print "  $EXE [options] chr.fa\n";
  print "  $EXE [options] < chr.fa\n";
  print "  $EXE [options] - < chr.fa\n";
  foreach (@Options) {
    if (ref) {
      my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
      $def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/;
      my $opt = $_->{OPT};
      $opt =~ s/!$//; 
      $opt =~ s/=s$/ [X]/; 
      $opt =~ s/=i$/ [N]/;
      $opt =~ s/=f$/ [n.n]/;
      printf "  --%-15s %s%s\n", $opt, $_->{DESC}, $def;
    }
    else {
      print "$_\n";
    }      
  }
  exit($exitcode);
}

#----------------------------------------------------------------------