Codebase list centrifuge / 8800a3f2-a136-4d87-beb1-01572fd07197/main aligner_metrics.h
8800a3f2-a136-4d87-beb1-01572fd07197/main

Tree @8800a3f2-a136-4d87-beb1-01572fd07197/main (Download .tar.gz)

aligner_metrics.h @8800a3f2-a136-4d87-beb1-01572fd07197/mainraw · history · blame

/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef ALIGNER_METRICS_H_
#define ALIGNER_METRICS_H_

#include <math.h>
#include <iostream>
#include "alphabet.h"
#include "timer.h"
#include "sstring.h"

using namespace std;

/**
 * Borrowed from http://www.johndcook.com/standard_deviation.html,
 * which in turn is borrowed from Knuth.
 */
class RunningStat {
public:
	RunningStat() : m_n(0), m_tot(0.0) { }

	void clear() {
		m_n = 0;
		m_tot = 0.0;
	}

	void push(float x) {
		m_n++;
		m_tot += x;
		// See Knuth TAOCP vol 2, 3rd edition, page 232
		if (m_n == 1) {
			m_oldM = m_newM = x;
			m_oldS = 0.0;
		} else {
			m_newM = m_oldM + (x - m_oldM)/m_n;
			m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
			// set up for next iteration
			m_oldM = m_newM;
			m_oldS = m_newS;
		}
	}

	int num() const {
		return m_n;
	}

	double tot() const {
		return m_tot;
	}

	double mean() const {
		return (m_n > 0) ? m_newM : 0.0;
	}

	double variance() const {
		return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
	}

	double stddev() const {
		return sqrt(variance());
	}

private:
	int m_n;
	double m_tot;
	double m_oldM, m_newM, m_oldS, m_newS;
};

/**
 * Encapsulates a set of metrics that we would like an aligner to keep
 * track of, so that we can possibly use it to diagnose performance
 * issues.
 */
class AlignerMetrics {

public:

	AlignerMetrics() :
		curBacktracks_(0),
		curBwtOps_(0),
		first_(true),
		curIsLowEntropy_(false),
		curIsHomoPoly_(false),
		curHadRanges_(false),
		curNumNs_(0),
		reads_(0),
		homoReads_(0),
		lowEntReads_(0),
		hiEntReads_(0),
		alignedReads_(0),
		unalignedReads_(0),
		threeOrMoreNReads_(0),
		lessThanThreeNRreads_(0),
		bwtOpsPerRead_(),
		backtracksPerRead_(),
		bwtOpsPerHomoRead_(),
		backtracksPerHomoRead_(),
		bwtOpsPerLoEntRead_(),
		backtracksPerLoEntRead_(),
		bwtOpsPerHiEntRead_(),
		backtracksPerHiEntRead_(),
		bwtOpsPerAlignedRead_(),
		backtracksPerAlignedRead_(),
		bwtOpsPerUnalignedRead_(),
		backtracksPerUnalignedRead_(),
		bwtOpsPer0nRead_(),
		backtracksPer0nRead_(),
		bwtOpsPer1nRead_(),
		backtracksPer1nRead_(),
		bwtOpsPer2nRead_(),
		backtracksPer2nRead_(),
		bwtOpsPer3orMoreNRead_(),
		backtracksPer3orMoreNRead_(),
		timer_(cout, "", false)
		{ }

	void printSummary() {
		if(!first_) {
			finishRead();
		}
		cout << "AlignerMetrics:" << endl;
		cout << "  # Reads:             " << reads_ << endl;
		float hopct = (reads_ > 0) ? (((float)homoReads_)/((float)reads_)) : (0.0f);
		hopct *= 100.0f;
		cout << "  % homo-polymeric:    " << (hopct) << endl;
		float lopct = (reads_ > 0) ? ((float)lowEntReads_/(float)(reads_)) : (0.0f);
		lopct *= 100.0f;
		cout << "  % low-entropy:       " << (lopct) << endl;
		float unpct = (reads_ > 0) ? ((float)unalignedReads_/(float)(reads_)) : (0.0f);
		unpct *= 100.0f;
		cout << "  % unaligned:         " << (unpct) << endl;
		float npct = (reads_ > 0) ? ((float)threeOrMoreNReads_/(float)(reads_)) : (0.0f);
		npct *= 100.0f;
		cout << "  % with 3 or more Ns: " << (npct) << endl;
		cout << endl;
		cout << "  Total BWT ops:    avg: " << bwtOpsPerRead_.mean() << ", stddev: " << bwtOpsPerRead_.stddev() << endl;
		cout << "  Total Backtracks: avg: " << backtracksPerRead_.mean() << ", stddev: " << backtracksPerRead_.stddev() << endl;
		time_t elapsed = timer_.elapsed();
		cout << "  BWT ops per second:    " << (bwtOpsPerRead_.tot()/elapsed) << endl;
		cout << "  Backtracks per second: " << (backtracksPerRead_.tot()/elapsed) << endl;
		cout << endl;
		cout << "  Homo-poly:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPerHomoRead_.mean() << ", stddev: " << bwtOpsPerHomoRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPerHomoRead_.mean() << ", stddev: " << backtracksPerHomoRead_.stddev() << endl;
		cout << "  Low-entropy:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPerLoEntRead_.mean() << ", stddev: " << bwtOpsPerLoEntRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPerLoEntRead_.mean() << ", stddev: " << backtracksPerLoEntRead_.stddev() << endl;
		cout << "  High-entropy:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPerHiEntRead_.mean() << ", stddev: " << bwtOpsPerHiEntRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPerHiEntRead_.mean() << ", stddev: " << backtracksPerHiEntRead_.stddev() << endl;
		cout << endl;
		cout << "  Unaligned:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPerUnalignedRead_.mean() << ", stddev: " << bwtOpsPerUnalignedRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPerUnalignedRead_.mean() << ", stddev: " << backtracksPerUnalignedRead_.stddev() << endl;
		cout << "  Aligned:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPerAlignedRead_.mean() << ", stddev: " << bwtOpsPerAlignedRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPerAlignedRead_.mean() << ", stddev: " << backtracksPerAlignedRead_.stddev() << endl;
		cout << endl;
		cout << "  0 Ns:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPer0nRead_.mean() << ", stddev: " << bwtOpsPer0nRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPer0nRead_.mean() << ", stddev: " << backtracksPer0nRead_.stddev() << endl;
		cout << "  1 N:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPer1nRead_.mean() << ", stddev: " << bwtOpsPer1nRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPer1nRead_.mean() << ", stddev: " << backtracksPer1nRead_.stddev() << endl;
		cout << "  2 Ns:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPer2nRead_.mean() << ", stddev: " << bwtOpsPer2nRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPer2nRead_.mean() << ", stddev: " << backtracksPer2nRead_.stddev() << endl;
		cout << "  >2 Ns:" << endl;
		cout << "    BWT ops:    avg: " << bwtOpsPer3orMoreNRead_.mean() << ", stddev: " << bwtOpsPer3orMoreNRead_.stddev() << endl;
		cout << "    Backtracks: avg: " << backtracksPer3orMoreNRead_.mean() << ", stddev: " << backtracksPer3orMoreNRead_.stddev() << endl;
		cout << endl;
	}

	/**
	 *
	 */
	void nextRead(const BTDnaString& read) {
		if(!first_) {
			finishRead();
		}
		first_ = false;
		//float ent = entropyDna5(read);
		float ent = 0.0f;
		curIsLowEntropy_ = (ent < 0.75f);
		curIsHomoPoly_ = (ent < 0.001f);
		curHadRanges_ = false;
		curBwtOps_ = 0;
		curBacktracks_ = 0;
		// Count Ns
		curNumNs_ = 0;
		const size_t len = read.length();
		for(size_t i = 0; i < len; i++) {
			if((int)read[i] == 4) curNumNs_++;
		}
	}

	/**
	 *
	 */
	void setReadHasRange() {
		curHadRanges_ = true;
	}

	/**
	 * Commit the running statistics for this read to
	 */
	void finishRead() {
		reads_++;
		if(curIsHomoPoly_) homoReads_++;
		else if(curIsLowEntropy_) lowEntReads_++;
		else hiEntReads_++;
		if(curHadRanges_) alignedReads_++;
		else unalignedReads_++;
		bwtOpsPerRead_.push((float)curBwtOps_);
		backtracksPerRead_.push((float)curBacktracks_);
		// Drill down by entropy
		if(curIsHomoPoly_) {
			bwtOpsPerHomoRead_.push((float)curBwtOps_);
			backtracksPerHomoRead_.push((float)curBacktracks_);
		} else if(curIsLowEntropy_) {
			bwtOpsPerLoEntRead_.push((float)curBwtOps_);
			backtracksPerLoEntRead_.push((float)curBacktracks_);
		} else {
			bwtOpsPerHiEntRead_.push((float)curBwtOps_);
			backtracksPerHiEntRead_.push((float)curBacktracks_);
		}
		// Drill down by whether it aligned
		if(curHadRanges_) {
			bwtOpsPerAlignedRead_.push((float)curBwtOps_);
			backtracksPerAlignedRead_.push((float)curBacktracks_);
		} else {
			bwtOpsPerUnalignedRead_.push((float)curBwtOps_);
			backtracksPerUnalignedRead_.push((float)curBacktracks_);
		}
		if(curNumNs_ == 0) {
			lessThanThreeNRreads_++;
			bwtOpsPer0nRead_.push((float)curBwtOps_);
			backtracksPer0nRead_.push((float)curBacktracks_);
		} else if(curNumNs_ == 1) {
			lessThanThreeNRreads_++;
			bwtOpsPer1nRead_.push((float)curBwtOps_);
			backtracksPer1nRead_.push((float)curBacktracks_);
		} else if(curNumNs_ == 2) {
			lessThanThreeNRreads_++;
			bwtOpsPer2nRead_.push((float)curBwtOps_);
			backtracksPer2nRead_.push((float)curBacktracks_);
		} else {
			threeOrMoreNReads_++;
			bwtOpsPer3orMoreNRead_.push((float)curBwtOps_);
			backtracksPer3orMoreNRead_.push((float)curBacktracks_);
		}
	}

	// Running-total of the number of backtracks and BWT ops for the
	// current read
	uint32_t curBacktracks_;
	uint32_t curBwtOps_;

protected:

	bool first_;

	// true iff the current read is low entropy
	bool curIsLowEntropy_;
	// true if current read is all 1 char (or very close)
	bool curIsHomoPoly_;
	// true iff the current read has had one or more ranges reported
	bool curHadRanges_;
	// number of Ns in current read
	int curNumNs_;

	// # reads
	uint32_t reads_;
	// # homo-poly reads
	uint32_t homoReads_;
	// # low-entropy reads
	uint32_t lowEntReads_;
	// # high-entropy reads
	uint32_t hiEntReads_;
	// # reads with alignments
	uint32_t alignedReads_;
	// # reads without alignments
	uint32_t unalignedReads_;
	// # reads with 3 or more Ns
	uint32_t threeOrMoreNReads_;
	// # reads with < 3 Ns
	uint32_t lessThanThreeNRreads_;

	// Distribution of BWT operations per read
	RunningStat bwtOpsPerRead_;
	RunningStat backtracksPerRead_;

	// Distribution of BWT operations per homo-poly read
	RunningStat bwtOpsPerHomoRead_;
	RunningStat backtracksPerHomoRead_;

	// Distribution of BWT operations per low-entropy read
	RunningStat bwtOpsPerLoEntRead_;
	RunningStat backtracksPerLoEntRead_;

	// Distribution of BWT operations per high-entropy read
	RunningStat bwtOpsPerHiEntRead_;
	RunningStat backtracksPerHiEntRead_;

	// Distribution of BWT operations per read that "aligned" (for
	// which a range was arrived at - range may not have necessarily
	// lead to an alignment)
	RunningStat bwtOpsPerAlignedRead_;
	RunningStat backtracksPerAlignedRead_;

	// Distribution of BWT operations per read that didn't align
	RunningStat bwtOpsPerUnalignedRead_;
	RunningStat backtracksPerUnalignedRead_;

	// Distribution of BWT operations/backtracks per read with no Ns
	RunningStat bwtOpsPer0nRead_;
	RunningStat backtracksPer0nRead_;

	// Distribution of BWT operations/backtracks per read with one N
	RunningStat bwtOpsPer1nRead_;
	RunningStat backtracksPer1nRead_;

	// Distribution of BWT operations/backtracks per read with two Ns
	RunningStat bwtOpsPer2nRead_;
	RunningStat backtracksPer2nRead_;

	// Distribution of BWT operations/backtracks per read with three or
	// more Ns
	RunningStat bwtOpsPer3orMoreNRead_;
	RunningStat backtracksPer3orMoreNRead_;

	Timer timer_;
};

#endif /* ALIGNER_METRICS_H_ */