centrifuge-kreport - centrifuge (debian/1.0.3-2)

Tree @debian/1.0.3-2 (Download .tar.gz)

centrifuge-kreport @debian/1.0.3-2 — raw · history · blame

#!/usr/bin/env perl

# Give a Kraken-style report from a Centrifuge output
#
# Based on kraken-report by Derrick Wood
# Copyright 2013-2016, Derrick Wood <dwood@cs.jhu.edu>
#

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Cwd;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;

my ($centrifuge_index, $min_score, $min_length);
my $only_unique = 0;
my $show_zeros = 0;
my $is_cnts_table = 0;
my $PROG = "centrifuge-kreport";
my $CWD = dirname( abs_path( $0 ) ) ;

GetOptions(
  "help" => \&display_help,
  "x=s" => \$centrifuge_index,
  "show-zeros" => \$show_zeros,
  "is-count-table" => \$is_cnts_table,
  "min-score=i" => \$min_score,
  "min-length=i"=> \$min_length,
  "only-unique" => \$only_unique
) or usage();

usage() unless defined $centrifuge_index;
if (!defined $ARGV[0]) {
    print STDERR "Reading centrifuge out file from STDIN ... \n";
}

sub usage {
  my $exit_code = @_ ? shift : 64;
  print STDERR "
Usage: centrifuge-kreport -x <index name> OPTIONS <centrifuge output file(s)>

centrifuge-kreport creates Kraken-style reports from centrifuge out files.

Options:
    -x INDEX            (REQUIRED) Centrifuge index

    --only-unique        Only count reads that were uniquely assigned to one taxon
    --show-zeros         Show clades that have zero reads, too
    --is-count-table     The format of the file is 'TAXID<tab>COUNT' instead of the standard
                         Centrifuge output format

    --min-score SCORE    Require a minimum score for reads to be counted
    --min-length LENGTH  Require a minimum alignment length to the read
  
  ";
  exit $exit_code;
}

sub display_help {
  usage(0);
}

my (%child_lists, %name_map, %rank_map);
print STDERR "Loading taxonomy ...\n";
load_taxonomy();

my %taxo_counts;
my $seq_count = 0;
$taxo_counts{0} = 0;
if ($is_cnts_table) {
  while (<>) {
    my ($taxid,$count) = split;
    $taxo_counts{$taxid} = $count;
    $seq_count += $count;
  }
} else {
  my $header = <>;
  my @cols = split /\s+/, $header ;
  my %headerMap ;
  for ( my $i = 0 ; $i < scalar( @cols ) ; ++$i )
  {
  	$headerMap{ $cols[$i] } = $i ;
  }
  while (<>) {
    #my (undef,$seqID,$taxid,$score, undef, $hitLength, $queryLength, $numMatches) = split /\t/;
    my @cols = split /\s+/ ;
    my $seqID = $cols[ $headerMap{ "seqID" } ] ; 
    my $taxid = $cols[ $headerMap{ "taxID" } ] ; 
    my $score = $cols[ $headerMap{ "score" } ] ; 
    my $hitLength = $cols[ $headerMap{ "hitLength" } ] ; 
    my $queryLength = $cols[ $headerMap{ "queryLength" } ] ; 
    my $numMatches = $cols[ $headerMap{ "numMatches" } ] ; 

    next if $only_unique && $numMatches > 1;
    next if defined $min_length && $hitLength < $min_length;
    next if defined $min_score && $score < $min_score;
    $taxo_counts{$taxid} += 1/$numMatches;
    $seq_count++;
  }
}
my $classified_count = $seq_count - $taxo_counts{0};

my %clade_counts = %taxo_counts;
dfs_summation(1);

for (keys %name_map) {
  $clade_counts{$_} ||= 0;
}

die "No sequence matches with given settings" unless $seq_count > 0;

printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n",
  $clade_counts{0} * 100 / $seq_count,
  $clade_counts{0}, $taxo_counts{0}, "U",
  0, "", "unclassified";
dfs_report(1, 0);

sub dfs_report {
  my $node = shift;
  my $depth = shift;
  if (! $clade_counts{$node} && ! $show_zeros) {
    return;
  }
  printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n",
    ($clade_counts{$node} || 0) * 100 / $seq_count,
    ($clade_counts{$node} || 0),
    ($taxo_counts{$node} || 0),
    rank_code($rank_map{$node}),
    $node,
    "  " x $depth,
    $name_map{$node};
  my $children = $child_lists{$node};
  if ($children) {
    my @sorted_children = sort { $clade_counts{$b} <=> $clade_counts{$a} } @$children;
    for my $child (@sorted_children) {
      dfs_report($child, $depth + 1);
    }
  }
}

sub rank_code {
  my $rank = shift;
  for ($rank) {
    $_ eq "species" and return "S";
    $_ eq "genus" and return "G";
    $_ eq "family" and return "F";
    $_ eq "order" and return "O";
    $_ eq "class" and return "C";
    $_ eq "phylum" and return "P";
    $_ eq "kingdom" and return "K";
    $_ eq "superkingdom" and return "D";
  }
  return "-";
}

sub dfs_summation {
  my $node = shift;
  my $children = $child_lists{$node};
  if ($children) {
    for my $child (@$children) {
      dfs_summation($child);
      $clade_counts{$node} += ($clade_counts{$child} || 0);
    }
  }
}

sub load_taxonomy {

  print STDERR "Loading names file ...\n";
  open NAMES, "-|", "$CWD/centrifuge-inspect --name-table $centrifuge_index"
    or die "$PROG: can't open names file: $!\n";
  while (<NAMES>) {
    chomp;
    s/\t\|$//;
    my @fields = split /\t/;
    my ($node_id, $name) = @fields[0,1];
    $name_map{$node_id} = $name;
  }
  close NAMES;

  print STDERR "Loading nodes file ...\n";
  open NODES, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index"
    or die "$PROG: can't open nodes file: $!\n";
  while (<NODES>) {
    chomp;
    my @fields = split /\t\|\t/;
    my ($node_id, $parent_id, $rank) = @fields[0,1,2];
    if ($node_id == 1) {
      $parent_id = 0;
    }
    $child_lists{$parent_id} ||= [];
    push @{ $child_lists{$parent_id} }, $node_id;
    $rank_map{$node_id} = $rank;
  }
  close NODES;
}