Codebase list centrifuge / lintian-fixes/main centrifuge_report.cpp
lintian-fixes/main

Tree @lintian-fixes/main (Download .tar.gz)

centrifuge_report.cpp @lintian-fixes/mainraw · history · blame

/*
 * centrifuge-build.cpp
 *
 *  Created on: Apr 8, 2015
 *      Author: fbreitwieser
 */

#include<iostream>
#include<fstream>
#include<sstream>
#include<map>
#include<vector>
#include "assert_helpers.h"
#include "sstring.h"
#include "ds.h"      // EList
#include "bt2_idx.h" // Ebwt
#include "bt2_io.h"
#include "util.h"

using namespace std;
typedef TIndexOffU index_t;

static bool startVerbose = true; // be talkative at startup
int gVerbose = 1; // be talkative always
static const char *argv0 = NULL;
static string adjIdxBase;
static bool useShmem				= false; // use shared memory to hold the index
static bool useMm					= false; // use memory-mapped files to hold the index
static bool mmSweep					= false; // sweep through memory-mapped files immediately after mapping
static int offRate					= -1;    // keep default offRate
static bool noRefNames				= false; // true -> print reference indexes; not names
static int sanityCheck				= 0;  // enable expensive sanity checks
/**
 * Print a summary usage message to the provided output stream.
 */
static void printUsage(ostream& out) {
	out << "Centrifuge version " << string(CENTRIFUGE_VERSION).c_str() << " by Daehwan Kim (infphilo@gmail.com, www.ccb.jhu.edu/people/infphilo)" << endl;
	string tool_name = "centrifuge-class";

	out << "Usage: " << endl
	    << "  " << tool_name.c_str() << " <bt2-idx> <centrifuge-out>" << endl
	    << endl
		<<     "  <bt2-idx>  Index filename prefix (minus trailing .X." << gEbwt_ext << ")." << endl
		<<     "  <centrifuge-out>  Centrifuge result file." << endl;
}

template <typename T>
class Pair2ndComparator{
public:
     bool operator()(const pair<T,T> &left, const pair<T,T> &right){
    	 return left.second < right.second;
     }
};


template<typename TStr>
static void driver(
	const char * type,
	const string& bt2indexBase,
	const string& cf_out)
{
	if(gVerbose || startVerbose)  {
		cerr << "Entered driver(): "; logTime(cerr, true);
	}

    //initializeCntLut();  // FB: test commenting

	// Vector of the reference sequences; used for sanity-checking
	EList<SString<char> > names, os;
	EList<size_t> nameLens, seqLens;

	// Initialize Ebwt object and read in header
	if(gVerbose || startVerbose) {
		cerr << "About to initialize fw Ebwt: "; logTime(cerr, true);
	}
	adjIdxBase = adjustEbwtBase(argv0, bt2indexBase, gVerbose);
	Ebwt<index_t> ebwt(
		adjIdxBase,
	    0,        // index is colorspace
		-1,       // fw index
	    true,     // index is for the forward direction
	    /* overriding: */ offRate,
		0, // amount to add to index offrate or <= 0 to do nothing
	    useMm,    // whether to use memory-mapped files
	    useShmem, // whether to use shared memory
	    mmSweep,  // sweep memory-mapped files
	    !noRefNames, // load names?
		true,        // load SA sample?
		true,        // load ftab?
		true,        // load rstarts?
	    gVerbose, // whether to be talkative
	    startVerbose, // talkative during initialization
	    false /*passMemExc*/,
	    sanityCheck);
	//Ebwt<index_t>* ebwtBw = NULL;


	EList<size_t> reflens;
	EList<string> refnames;
	readEbwtRefnames<index_t>(adjIdxBase, refnames);
	map<uint32_t,pair<string,uint64_t> > speciesID_to_name_len;
	for(size_t i = 0; i < ebwt.nPat(); i++) {
		// cerr << "Push back to reflens: "<<  refnames[i] << " is so long: " << ebwt.plen()[i] << endl;
		reflens.push_back(ebwt.plen()[i]);

		// extract numeric id from refName
		const string& refName = refnames[i];
		uint64_t id = extractIDFromRefName(refName);
		uint32_t speciesID = (uint32_t)(id >> 32);

		// extract name from refName
		const string& name_part = refName.substr(refName.find_first_of(' '));

		//uint32_t genusID = (uint32_t)(id & 0xffffffff);
		speciesID_to_name_len[speciesID] = pair<string,uint64_t>(name_part,ebwt.plen()[i]);

	}
//	EList<string> refnames;
//	readEbwtRefnames<index_t>(adjIdxBase, refnames);

	// Read Centrifuge output file
	ifstream infile(cf_out.c_str());

	string line;
	map<uint32_t,uint32_t> species_to_score;

	while (getline(infile,line)) {
		string rd_name;
		uint32_t genusID;
		uint32_t speciesID;
		uint32_t score;
		uint32_t secbest_score;

		istringstream iss(line);
		iss >> rd_name >> genusID >> speciesID >> score >> secbest_score;
		// cerr << rd_name << " -> " << genusID << " -> " << speciesID << " -> " << score << " -> " << secbest_score << "\n";
		species_to_score[speciesID] += score;
	}

	// Sort the species by their score
	vector<pair<uint32_t,uint32_t> > species_to_score_v(species_to_score.begin(), species_to_score.end());

	sort(species_to_score_v.begin(),species_to_score_v.end(),Pair2ndComparator<uint32_t>());

	cout << "Name\tTaxonID\tLength\tSummed Score\tNormalized Score\n";
	// Output the summed species scores
	for (vector<pair<uint32_t,uint32_t> >::iterator species_score = species_to_score_v.begin();
			species_score != species_to_score_v.end();
			++species_score) {
		uint32_t speciesID = species_score->first;
		pair<string,uint64_t> name_len = speciesID_to_name_len[speciesID];
		uint64_t slength = name_len.second;
		uint64_t sumscore = species_score->second;

		cout << name_len.first << "\t" <<
				speciesID << "\t" <<
				slength << "\t" <<
				sumscore << "\t" <<
				(float)sumscore/slength << "\n";
	}



}

//int centrifuge_report(int argc, const char **argv) {
int main(int argc, const char **argv) {

	if (argc < 3) {
		cerr << "Number of arguments is " << argc << endl;
		printUsage(cerr);
		exit(1);
	}

	argv0 = argv[0];
	const string bt2index = argv[1];
	const string cf_out = argv[2];
	//static string outfile;        // write SAM output to this file

	cout << "Input bt2 file: \"" << bt2index.c_str() << "\"" << endl;
	cout << "Centrifuge results file: \"" << cf_out.c_str() << "\"" << endl;

	driver<SString<char> >("DNA", bt2index, cf_out);
	return 0;
}