centrifuge-compress.pl - centrifuge (lintian-fixes/main)

Tree @lintian-fixes/main (Download .tar.gz)

centrifuge-compress.pl @lintian-fixes/main — raw · history · blame

#!/usr/bin/env perl

# Read and merge the sequence for the chosen level

use strict ;
use warnings ;

use threads ;
use threads::shared ;
use FindBin qw($Bin);
use File::Basename;
use File::Find;
use Getopt::Long;
use Cwd;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;

my $CWD = dirname( abs_path( $0 ) ) ;
my $PWD = abs_path( "./" ) ;

my $usage = "USAGE: perl ".basename($0)." path_to_download_files path_to_taxnonomy [-map header_to_taxid_map -o compressed -noCompress -t 1 -maxG 50000000 -noDustmasker]\n" ;

my $level = "species" ;
my $output = "compressed" ;
my $bssPath = $CWD ; # take path of binary as script directory
my $numOfThreads = 1 ;
my $noCompress = 0 ;
my $noDustmasker = 0 ;
my $verbose = 0;
my $maxGenomeSizeForCompression = 50000000 ;
my $mapFile = "" ;

GetOptions ("level|l=s" => \$level,
			"output|o=s" => \$output,
			"bss=s" => \$bssPath,
			"threads|t=i" => \$numOfThreads,
			"maxG=i" => \$maxGenomeSizeForCompression, 
			"map=s" => \$mapFile, 
            "verbose|v" => \$verbose,
			"noCompress" => \$noCompress,
			"noDustmasker" => \$noDustmasker)
or die("Error in command line arguments. \n\n$usage");

die $usage unless @ARGV == 2;

my $path = $ARGV[0] ;
my $taxPath = $ARGV[1] ;

my $i ;

my %gidToFile ;
my %gidUsed ;
my %tidToGid ; # each tid can corresponds several gid
my %gidToTid ;
my %speciesList ; # hold the tid in the species
my %species ; # hold the species tid
my %genus ; # hold the genus tid
my %speciesToGenus ;
my %fileUsed : shared ;
my $fileUsedLock : shared ;
my %taxTree ;

my @speciesListKey ;
my $speciesUsed : shared ;
my $debug: shared ;
my %speciesIdToName : shared ;

my %idToTaxId : shared ; 
my %newIdToTaxId : shared ;
my %idToGenomeSize : shared ;
my $idMapLock : shared ;

my $step = 1 ;

# check the depended softwares
if ( `which jellyfish` eq "" )
{
	die "Could not find Jellyfish.\n" ;
}
else
{
	my $version = `jellyfish --version` ;
	chomp $version ;
	if ( ( split /\s+/, $version )[1] lt 2 )
	{
		die "Jellyfish on your system is ", $version, " , centrifuge-compress.pl requires at least 2.0.0\n" ;
	}
}

if ( `which nucmer` eq "" )
{
	die "Could not find Nucmer.\n"
}

if ( `which dustmasker` eq "" ) 
{
	print STDERR "Could not find dustmasker. And will turn on -noDustmasker option.\n" ;
	$noDustmasker = 1 ;
}

#Extract the gid we met in the file
if ( $mapFile ne "" )
{
	print STDERR "Step $step: Reading in the provided mapping list of ids to taxionomy ids.\n";
	++$step ;

	open FP1, $mapFile ;
	while ( <FP1> )
	{
		chomp ;
		my @cols = split ;
		$idToTaxId{ $cols[0] } = $cols[1] ;

		if ( $noCompress == 1 )
		{
			$newIdToTaxId{ $cols[0] } = $cols[1] ;
		}
	}
}

print STDERR "Step $step: Collecting all .fna files in $path and getting gids\n";
++$step ;
if ( $noCompress == 1 )
{
	`rm tmp_output.fa` ;
}

find ( { wanted=>sub {
    return unless -f  ;        # Must be a file
    return unless -s;        # Must be non-zero
    if ( !( /\.f[nf]?a$/ || /\.ffn$/ || /\.fasta$/ ) )
    {
    	return ;
    }

    my $fullfile = $File::Find::name; ## file with full path, but the CWD is actually the file's path
    my $file = $_; ## file name
	open FP2, $file or die "Error opening $file: $!";
	my $head = <FP2> ;
	close FP2 ;

	chomp $head ;
	my $headId = substr( ( split /\s+/, $head )[0], 1 ) ;  

	if ( $noCompress == 1 )
	{
		# it seems the find will change the working directory
		system_call( "cat $PWD/$fullfile >> $PWD/tmp_output.fa" ) ;
		if ( defined $idToTaxId{ $headId } )
		{
			$newIdToTaxId{ $headId } = $idToTaxId{ $headId } ;
		}
		else
		{
			$newIdToTaxId{ $headId } = -1 ;
			my @cols = split /\|/, $headId ;
			my $subHeader = $cols[0]."\|".$cols[1] ; 
			if ( $headId =~ /gi\|([0-9]+)?\|/ )
			{
				$newIdToTaxId{ $subHeader } = -1 ;
				#print STDERR "$headId $subHeader\n" if $verbose ;
			}
			elsif ( $headId =~ /taxid\|([0-9]+)?[\|\s]/ )
			{
				$newIdToTaxId{ $headId } = $1 ;
				$newIdToTaxId{ $subHeader } = $1 ;
			}
			elsif ( scalar( @cols ) > 2 )
			{
				$newIdToTaxId{ $subHeader } = -1 ;
			}
		}
	}


	if ( defined $idToTaxId{ $headId } )
	{
		my $tid = $idToTaxId{ $headId } ;
		my $dummyGid = "centrifuge_gid_".$fullfile."_$tid" ;
		$gidUsed{ $dummyGid } = 1 ;
		$gidToFile{ $dummyGid } = $fullfile ;
		$fileUsed{ $fullfile } = 0 ;
		push @{ $tidToGid{ $tid } }, $dummyGid ;	

		print STDERR "tid=$tid $file\n" if $verbose;
	}
	elsif ( $head =~ /^>gi\|([0-9]+)?\|/ ) {
		my $gid = $1 ;
		print STDERR "gid=$gid $file\n" if $verbose;
		if ( defined $gidUsed{ $gid } )
		{
			print "Repeated gid $gid\n" if $verbose ;
			$fileUsed{ $fullfile } = 1 ;
		}
		else
		{
			$fileUsed{ $fullfile } = 0 ;
			$gidToFile{ $gid } = $fullfile ;
		}

		$gidUsed{ $gid } = 1 ;
	} elsif ( $head =~ /taxid\|([0-9]+)?[\|\s]/ ) {
		my $tid = $1 ;
		my $dummyGid = "centrifuge_gid_".$fullfile."_$1" ;
		$gidUsed{ $dummyGid } = 1 ;
		$gidToFile{ $dummyGid } = $fullfile ;
		$fileUsed{ $fullfile } = 0 ;
		push @{ $tidToGid{ $tid } }, $dummyGid ;	
		
		print STDERR "tid=$tid $file\n" if $verbose;
	} else {
		print STDERR "Excluding $fullfile: Wrong header.\n";
	}

}, follow=>1 }, $path );

if ( $noCompress == 1 ) 
{
# Remove the Ns from the file
	if ( $noDustmasker == 1 )
	{
		system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ;
	}
	else
	{
		system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa > tmp_output_fmt.fa") ;
		system_call( "dustmasker -infmt fasta -in tmp_output_fmt.fa -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]//g' > tmp_output_dustmasker.fa" ) ;
		system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output_dustmasker.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ;
	}
}

# Extract the tid that are associated with the gids
print STDERR "Step $step: Extract the taxonomy ids that are associated with the gids\n";
++$step ;
open FP1, "$taxPath/gi_taxid_nucl.dmp" ;
while ( <FP1> )
{
	chomp ;
	my @cols = split ;		
#print $cols[0], "\n" ;	
	next if ( @ARGV < 2 ) ;
	if ( defined( $gidUsed{ $cols[0] } ) )
	{
		push @{ $tidToGid{ $cols[1] } }, $cols[0] ;
		$gidToTid{ $cols[0] } = $cols[1] ;
		print STDERR "gid=", $cols[0], " tid=", $cols[1], " ", $gidToFile{ $cols[0] }, "\n" if $verbose;
	}
}
close FP1 ;

if ( $noCompress == 1 )
{
	open FP1, ">tmp_$output.map" ;
	foreach my $key (keys %newIdToTaxId )
	{
		if ( $newIdToTaxId{$key} != -1 )
		{
			print FP1 "$key\t", $newIdToTaxId{$key}, "\n" ; 
		}
		elsif ( $key =~ /gi\|([0-9]+)?/ )
		{
			#if ( defined $gidToTid{ $1 } )
			#{
			#	$newIdToTaxId{ $key } = $gidToTid{ $1 } ;
			#}
			my $taxId = 1 ;
			if (defined $gidToTid{ $1 } )
			{
				$taxId = $gidToTid{ $1 } ;
			}
			print FP1 "$key\t", $taxId, "\n" ; 
		}
		else
		{
			print FP1 "$key\t1\n" ;
		}
	}
	close FP1 ;
	system_call( "sort tmp_$output.map | uniq > $output.map" ) ;
	exit ;
}

# Organize the tree
print STDERR "Step $step: Organizing the taxonomical tree\n";
++$step ;
open FP1, "$taxPath/nodes.dmp" or die "Couldn't open $taxPath/nodes.dmp: $!";
while ( <FP1> ) {
	chomp ;

	my @cols = split ;
#next if ( !defined $tidToGid{ $cols[0] } ) ;

	my $tid = $cols[0] ;
	my $parentTid = $cols[2] ;
	my $rank = $cols[4] ;
#print "subspecies: $tid $parentTid\n" ;
#push @{ $species{ $parentTid } }, $tid ;
#$tidToSpecies{ $tid } = $parentTid ;

	$taxTree{ $cols[0] } = $cols[2] ;
#print $cols[0], "=>", $cols[2], "\n" ;
	if ( $rank eq "species" )	
	{
		$species{ $cols[0] } = 1 ;
	}
	elsif ( $rank eq "genus" )
	{
		$genus{ $cols[0] } = 1 ;
	}
}
close FP1 ;

# Put the sub-species taxonomy id into the corresponding species.
print STDERR "Step $step: Putting the sub-species taxonomy id into the corresponding species\n";
++$step ;
for $i ( keys %tidToGid )
{
	my $p = $i ;
	my $found = 0 ;
	while ( 1 )
	{
		last if ( $p <= 1 ) ;
		if ( defined $species{ $p } ) 
		{
			$found = 1 ;
			last ;
		}
		if ( defined $taxTree{ $p } )
		{
			$p = $taxTree{ $p } ;
		}
		else
		{
			last ;
		}
	}

	if ( $found == 1 )
	{
		push @{ $speciesList{ $p } }, $i ;
	}
}

print STDERR "Step $step: Reading the name of the species\n";
++$step ;
open FP1, "$taxPath/names.dmp" or die "Could not open $taxPath/names.dmp: $!";
while ( <FP1> )
{
	next if (!/scientific name/ ) ;
	my @cols = split /\t/ ;

	if ( defined $species{ $cols[0] } )
	{
		$speciesIdToName{ $cols[0] } = $cols[2] ;
	}
}
close FP1 ;
#exit ;

sub GetGenomeSize
{
	open FPfa, $_[0] ;
	my $size = 0 ;
	while ( <FPfa> )
	{
		next if ( /^>/ ) ;
		$size += length( $_ ) - 1 ;
	}
	close FPfa ;
	return $size ;
}

# Compress one species.
sub solve
{
# Extracts the files
#print "find $pwd -name *.fna > tmp.list\n" ;
	my $tid = threads->tid() - 1 ;
#system_call("find $pwd -name *.fna > tmp_$tid.list") ;

#system_call("find $pwd -name *.fa >> tmp.list") ; # Just in case
# Build the header
	my $genusId ;
	my $speciesId = $_[0] ;
	my $speciesName ;
	my $i ;
	my $file ;
	my @subspeciesList = @{ $speciesList{ $speciesId } } ;
	$genusId = $taxTree{ $speciesId } ;
	while ( 1 )
	{
		if ( !defined $genusId || $genusId <= 1 )
		{
			$genusId = 0 ;
			last ;
		}

		if ( defined $genus{ $genusId } )
		{
			last ;
		}
		else
		{
			$genusId = $taxTree{ $genusId } ;
		}
	}

	my $FP1 ;
	open FP1, ">tmp_$tid.list" ;
	my $genomeSize = 0 ;
	my $avgGenomeSize = 0 ;
	foreach $i ( @subspeciesList )
	{
		foreach my $j ( @{$tidToGid{ $i } } )
		{
#{
#	lock( $debug ) ;
#	print "Merge ", $gidToFile{ $j }, "\n" ;
#}
			$file =  $gidToFile{ $j } ;
			{
				lock( $fileUsedLock ) ;
				$fileUsed{ $file } = 1 ;
			}
			print FP1 $file, "\n" ;

			my $tmp = GetGenomeSize( $file ) ;
			if ( $tmp > $genomeSize )
			{
				$genomeSize = $tmp ;
			}
			$avgGenomeSize += $tmp ;
		}
	}
	close FP1 ;

#$genomeSize = int( $genomeSize / scalar( @subspeciesList ) ) ;
	$avgGenomeSize = int( $avgGenomeSize / scalar( @subspeciesList ) ) ;

#print $file, "\n" ;	
#if ( $file =~ /\/(\w*?)uid/ )
#{
#	$speciesName = ( split /_/, $1 )[0]."_". ( split /_/, $1 )[1] ;
#}
	if ( defined $speciesIdToName{ $speciesId } )
	{
		$speciesName = $speciesIdToName{ $speciesId } ;
		$speciesName =~ s/ /_/g ;
	}
	else
	{
		$speciesName = "Unknown_species_name" ;
	}
	my $id = $speciesId ;#( $speciesId << 32 ) | $genusId ;
	my $header = ">cid|$id $speciesName $avgGenomeSize ".scalar( @subspeciesList ) ;
	print STDERR "$header\n" ;
	{
		lock( $idMapLock ) ;
		$newIdToTaxId{ "cid|$id" } = $speciesId ;
		$idToGenomeSize{ "cid|$id" } = $avgGenomeSize ;
	}

#return ;
# Build the sequence
	my $seq = "" ;
	if ( $noCompress == 0 &&  ( $maxGenomeSizeForCompression < 0 || $genomeSize <= $maxGenomeSizeForCompression ) ) #$genomeSize < 50000000 )
	{
		system_call("perl $bssPath/centrifuge-BuildSharedSequence.pl tmp_$tid.list -prefix tmp_${tid}_$id" ) ;

# Merge all the fragmented sequence into one big chunk.
		system_call("cat tmp_${tid}_${id}_*.fa > tmp_${tid}_$id.fa");

		open FP1, "tmp_${tid}_$id.fa" ;
		while ( <FP1> )
		{
			chomp ;
			next if ( /^>/ ) ;
			$seq .= $_ ;
		}
		close FP1 ;
	}
	else
	{
		foreach $i ( @subspeciesList )
		{
			foreach my $j ( @{$tidToGid{ $i } } )
			{
				$file =  $gidToFile{ $j } ;
				open FP1, $file ;
				while ( <FP1> )
				{
					#chomp ;
					next if ( /^>/ ) ;
					$seq .= $_ ;
				}
				close FP1 ;
			}
		}
	}
	open fpOut, ">>${output}_${tid}" ;
	print fpOut "$header\n$seq\n" ;
	close fpOut ;

	unlink glob("tmp_${tid}_*");	
}

sub system_call {
	print STDERR "SYSTEM CALL: ".join(" ",@_);
	system(@_) == 0
		or die "system @_ failed: $?";
	print STDERR " finished\n";
}

sub threadWrapper
{
	my $tid = threads->tid() - 1 ;
	unlink("${output}_${tid}");

	while ( 1 )
	{
		my $u ;
		{
			lock $speciesUsed ;
			$u = $speciesUsed ;
			++$speciesUsed ;
		}
		last if ( $u >= scalar( @speciesListKey ) ) ;
		solve( $speciesListKey[ $u ] ) ;
	}
}


print STDERR "Step $step: Merging sub-species\n";
++$step ;
my @threads ;
@speciesListKey = keys %speciesList ; 
$speciesUsed = 0 ;
for ( $i = 0 ; $i < $numOfThreads ; ++$i )
{
	push @threads, $i ;
}

foreach (@threads)
{
	$_ = threads->create( \&threadWrapper ) ;
}

foreach (@threads)
{
	$_->join() ;
}

# merge the files generated from each threads
system_call("cat ${output}_* > tmp_output.fa");
unlink glob("${output}_*");

#print unused files
foreach $i ( keys %fileUsed )
{
	if ( $fileUsed{ $i } == 0 )
	{
#print $i, "\n" ;
#`cat $i >> tmp_output.fa` ;
		print "Unused file: $i\n" ;
	}
}

# Remove the Ns from the file
if ( $noDustmasker == 1 )
{
	system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ;
}
else
{
	system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa > tmp_output_fmt.fa") ;
	system_call( "dustmasker -infmt fasta -in tmp_output_fmt.fa -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]//g' > tmp_output_dustmasker.fa" ) ;
	system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output_dustmasker.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ;
}

# Output the mapping of the ids to species
open FP1, ">$output.map" ;
foreach my $key (keys %newIdToTaxId )
{
	print FP1 "$key\t", $newIdToTaxId{ $key }, "\n" ;
}
close FP1 ;

# Output the genome sizem map
open FP1, ">$output.size" ;
foreach my $key ( keys %newIdToTaxId )
{
	print FP1 $newIdToTaxId{ $key }, "\t", $idToGenomeSize{ $key }, "\n" ;
}
close FP1 ;
unlink glob("tmp_*") ;