group_walk.h - centrifuge (debian/1.0.3-8)

Tree @debian/1.0.3-8 (Download .tar.gz)

group_walk.h @debian/1.0.3-8 — raw · history · blame

/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * group_walk.h
 *
 * Classes and routines for walking a set of BW ranges backwards from the edge
 * of a seed hit with the goal of resolving the offset of each row in each
 * range.  Here "offset" means offset into the concatenated string of all
 * references.  The main class is 'GroupWalk' and an important helper is
 * 'GWState'.
 *
 * For each combination of seed offset and orientation, there is an associated
 * QVal.  Each QVal describes a (possibly empty) set of suffix array ranges.
 * Call these "seed range sets."  Each range in the set is "backed" by a range
 * of the salist, represented as a PListSlice. Such a range is the origin of a
 * walk.
 *
 * When an offset is resolved, it is entered into the salist via the
 * PListSlice.  Note that other routines in this same thread might also be
 * setting elements of the salist, so routines here should expect that elements
 * can go from unresolved to resolved at any time.
 *
 * What bookkeeping do we have to do as we walk?  Before the first step, we
 * convert the initial QVal into a list of SATuples; the SATuples are our link
 * to the correpsonding ranges in the suffix array.  The list of SATuples is
 * then converted to a list of GWState objects; these keep track of where we
 * are in our walk (e.g. what 'top' and 'bot' are, how many steps have we gone,
 * etc) as well as how the elements in the current range correspond to elements
 * from the original range.
 *
 * The user asks the GroupWalk to resolve another offset by calling advance().
 * advance() can be called in various ways:
 *
 * (a) The user can request that the GroupWalk proceed until a
 *     *particular* element is resolved, then return that resolved
 *     element.  Other elements may be resolved along the way, but
 *     those results are buffered and may be dispensed in future calls
 *     to advance().
 *
 * (b) The user can request that the GroupWalk select an as-yet-
 *     unreported element at random and and proceed until that element
 *     is resolved and report it.  Again, other elements may be
 *     resolved along the way but they are buffered.
 *
 * (c) The user can request that the GroupWalk resolve elements in a
 *     particular BW range (with a particular offset and orientation)
 *     in an order of its choosing.  The GroupWalk in this case
 *     attempts to resolve as many offsets as possible as quickly as
 *     possible, and returns them as soon as they're found.  The res_
 *     buffer is used in this case.
 *
 * (d) Like (c) but resolving elements at a paritcular offset and
 *     orientation instead of at a specific BW range.  The res_ buffer
 *     is used in this case, since there's a chance that the 
 *
 * There are simple ways to heuristically reduce the problem size while
 * maintaining randomness.  For instance, the user put a ceiling on the
 * number of elements that we walk from any given seed offset or range.
 * We can then trim away random subranges to reduce the size of the
 * problem.  There is no need for the caller to do this for us.
 */

#ifndef GROUP_WALK_H_
#define GROUP_WALK_H_

#include <stdint.h>
#include <limits>
#include "ds.h"
#include "bt2_idx.h"
#include "read.h"
#include "reference.h"
#include "mem_ids.h"

/**
 * Encapsulate an SA range and an associated list of slots where the resolved
 * offsets can be placed.
 */
template<typename T>
class SARangeWithOffs {

public:

	SARangeWithOffs() { reset(); };

	SARangeWithOffs(TIndexOffU tf, size_t len, const T& o) {
		init(tf, len, o);
	}
	
	void init(TIndexOffU tf, size_t len_, const T& o) {
		topf = tf; len = len_, offs = o;
	}

	/**
	 * Reset to uninitialized state.
	 */
	void reset() { topf = std::numeric_limits<TIndexOffU>::max(); }
	
	/**
	 * Return true if this is initialized.
	 */
	bool inited() const {
		return topf != std::numeric_limits<TIndexOffU>::max();
	}
	
	/**
	 * Return the number of times this reference substring occurs in the
	 * reference, which is also the size of the 'offs' TSlice.
	 */
	size_t size() const { return offs.size(); }

	TIndexOffU topf; // top in BWT index
	size_t    len;  // length of the reference sequence involved
	T         offs; // offsets
};

/**
 * A group of per-thread state that can be shared between all the GroupWalks
 * used in that thread.
 */
template <typename index_t>
struct GroupWalkState {

	GroupWalkState(int cat) : map(cat) {
		masks[0].setCat(cat);
		masks[1].setCat(cat);
		masks[2].setCat(cat);
		masks[3].setCat(cat);
	}

	EList<bool> masks[4];      // temporary list for masks; used in GWState
	EList<index_t, 16> map;   // temporary list of GWState maps
};

/**
 * Encapsulates counters that encode how much work the walk-left logic
 * has done.
 */
struct WalkMetrics {

	WalkMetrics() {
	    reset();
	}

	/**
	 * Sum each across this object and 'm'.  This is the only safe way
	 * to update a WalkMetrics shared by many threads.
	 */
	void merge(const WalkMetrics& m, bool getLock = false) {
		ThreadSafe ts(&mutex_m, getLock);
		bwops += m.bwops;
		branches += m.branches;
		resolves += m.resolves;
		refresolves += m.refresolves;
		reports += m.reports;
	}
	
	/**
	 * Set all to 0.
	 */
	void reset() {
		bwops = branches = resolves = refresolves = reports = 0;
	}

	uint64_t bwops;       // Burrows-Wheeler operations
	uint64_t branches;    // BW range branch-offs
	uint64_t resolves;    // # offs resolved with BW walk-left
	uint64_t refresolves; // # resolutions caused by reference scanning
	uint64_t reports;     // # offs reported (1 can be reported many times)
	MUTEX_T mutex_m;
};

/**
 * Coordinates for a BW element that the GroupWalk might resolve.
 */
template <typename index_t>
struct GWElt {

	GWElt() { reset(); }
	
	/**
	 * Reset GWElt to uninitialized state.
	 */
	void reset() {
		offidx = range = elt = len = (index_t)0xffffffff;
		fw = false;
	}

	/**
	 * Initialize this WalkResult.
	 */
	void init(
		index_t oi,
		bool f,
		index_t r,
		index_t e,
		index_t l)
	{
		offidx = oi;
		fw = f;
		range = r;
		elt = e;
		len = l;
	}

	/**
	 * Return true iff this GWElt and the given GWElt refer to the same
	 * element.
	 */
	bool operator==(const GWElt& o) const {
		return offidx == o.offidx &&
		       fw == o.fw &&
		       range == o.range &&
		       elt == o.elt &&
		       len == o.len;
	}
	
	/**
	 * Return true iff this GWElt and the given GWElt refer to
	 * different elements.
	 */
	bool operator!=(const GWElt& o) const {
		return !(*this == o);
	}

	index_t offidx; // seed offset index
	bool    fw;     // strand
	index_t range;  // range
	index_t elt;    // element
	index_t len;    // length
};

/**
 * A record encapsulating the result of looking up one BW element in
 * the Bowtie index.
 */
template <typename index_t>
struct WalkResult {

	WalkResult() { reset(); }
	
	/**
	 * Reset GWElt to uninitialized state.
	 */
	void reset() {
		elt.reset();
		bwrow = toff = (index_t)OFF_MASK;
	}

	/**
	 * Initialize this WalkResult.
	 */
	void init(
		index_t oi,  // seed offset index
		bool f,       // strand
		index_t r,   // range
		index_t e,   // element
		index_t bwr, // BW row
		index_t len, // length
		index_t to)  // text offset
	{
		elt.init(oi, f, r, e, len);
		bwrow = bwr;
		toff = to;
	}

	GWElt<index_t> elt;   // element resolved
	index_t        bwrow; // SA row resolved
	index_t        toff;  // resolved offset from SA sample
};

/**
 * A GW hit encapsulates an SATuple describing a reference substring
 * in the cache, along with a bool indicating whether each element of
 * the hit has been reported yet.
 */
template<typename index_t, typename T>
class GWHit {

public:
	GWHit() :
		fmap(0, GW_CAT),
		offidx((index_t)OFF_MASK),
		fw(false),
		range((index_t)OFF_MASK),
		len((index_t)OFF_MASK),
		reported_(0, GW_CAT),
		nrep_(0)
	{
		assert(repOkBasic());
	}

	/**
	 * Initialize with a new SA range.  Resolve the done vector so that
	 * there's one bool per suffix array element.
	 */
	void init(
		SARangeWithOffs<T>& sa,
		index_t oi,
		bool f,
		index_t r)
	{
		nrep_ = 0;
		offidx = oi;
		fw = f;
		range = r;
		len = (index_t)sa.len;
		reported_.resize(sa.offs.size());
		reported_.fill(false);
		fmap.resize(sa.offs.size());
		fmap.fill(make_pair((index_t)OFF_MASK, (index_t)OFF_MASK));
	}
	
	/**
	 * Clear contents of sat and done.
	 */
	void reset() {
		reported_.clear();
		fmap.clear();
		nrep_ = 0;
		offidx = (index_t)OFF_MASK;
		fw = false;
		range = (index_t)OFF_MASK;
		len = (index_t)OFF_MASK;
	}
	
#ifndef NDEBUG
	/**
	 * Check that GWHit is internally consistent.  If a pointer to an
	 * EList of GWStates is given, we assume that it is the EList
	 * corresponding to this GWHit and check whether the forward and
	 * reverse mappings match up for the as-yet-unresolved elements.
	 */
	bool repOk(const SARangeWithOffs<T>& sa) const {
		assert_eq(reported_.size(), sa.offs.size());
		assert_eq(fmap.size(), sa.offs.size());
		// Shouldn't be any repeats among as-yet-unresolveds
		size_t nrep = 0;
		for(size_t i = 0; i < fmap.size(); i++) {
			if(reported_[i]) nrep++;
			if(sa.offs[i] != (index_t)OFF_MASK) {
				continue;
			}
			for(size_t j = i+1; j < fmap.size(); j++) {
				if(sa.offs[j] != (index_t)OFF_MASK) {
					continue;
				}
				assert(fmap[i] != fmap[j]);
			}
		}
		assert_eq(nrep_, nrep);
		return true;
	}

	/**
	 * Return true iff this GWHit is not obviously corrupt.
	 */
	bool repOkBasic() {
		return true;
	}
#endif
	
	/**
	 * Set the ith element to be reported.
	 */
	void setReported(index_t i) {
		assert(!reported_[i]);
		assert_lt(i, reported_.size());
		reported_[i] = true;
		nrep_++;
	}
	
	/**
	 * Return true iff element i has been reported.
	 */
	bool reported(index_t i) const {
		assert_lt(i, reported_.size());
		return reported_[i];
	}
	
	/**
	 * Return true iff all elements have been reported.
	 */
	bool done() const {
		assert_leq(nrep_, reported_.size());
		return nrep_ == reported_.size();
	}

	EList<std::pair<index_t, index_t>, 16> fmap; // forward map; to GWState & elt
	index_t offidx; // offset idx
	bool fw;         // orientation
	index_t range;  // original range index
	index_t len;    // length of hit

protected:

	EList<bool, 16> reported_; // per-elt bool indicating whether it's been reported
	index_t nrep_;
};

/**
 * Encapsulates the progress made along a particular path from the original
 * range.
 */
template<typename index_t, typename T>
class GWState {
	
public:

	GWState() : map_(0, GW_CAT) {
		reset(); assert(repOkBasic());
	}
	
	/**
	 * Initialize this GWState with new ebwt, top, bot, step, and sat.
	 *
	 * We assume map is already set up.
	 *
	 * Returns true iff at least one elt was resolved.
	 */
	template<int S>
	pair<int, int> init(
		const Ebwt<index_t>& ebwt,    // index to walk left in
		const BitPairReference& ref,  // bitpair-encoded reference
		SARangeWithOffs<T>& sa,       // SA range with offsets
		EList<GWState, S>& sts,       // EList of GWStates for range being advanced
		GWHit<index_t, T>& hit,       // Corresponding hit structure
		index_t range,                // which range is this?
		bool reportList,              // if true, "report" resolved offsets immediately by adding them to 'res' list
		EList<WalkResult<index_t>, 16>* res,   // EList where resolved offsets should be appended
		index_t tp,                   // top of range at this step
		index_t bt,                   // bot of range at this step
		index_t st,                   // # steps taken to get to this step
		WalkMetrics& met)
	{
		assert_gt(bt, tp);
		assert_lt(range, sts.size());
		top = tp;
		bot = bt;
		step = (int)st;
		assert(!inited_);
		ASSERT_ONLY(inited_ = true);
		ASSERT_ONLY(lastStep_ = step-1);
		return init(ebwt, ref, sa, sts, hit, range, reportList, res, met);
	}

	/**
	 * Initialize this GWState.
	 *
	 * We assume map is already set up, and that 'step' is equal to the
	 * number of steps taken to get to the new top/bot pair *currently*
	 * in the top and bot fields.
	 *
	 * Returns a pair of numbers, the first being the number of
	 * resolved but unreported offsets found during this advance, the
	 * second being the number of as-yet-unresolved offsets.
	 */
	template<int S>
	pair<int, int> init(
		const Ebwt<index_t>& ebwt,    // forward Bowtie index
		const BitPairReference& ref,  // bitpair-encoded reference
		SARangeWithOffs<T>& sa,       // SA range with offsets
		EList<GWState, S>& st,        // EList of GWStates for advancing range
		GWHit<index_t, T>& hit,       // Corresponding hit structure
		index_t range,                // range being inited
		bool reportList,              // report resolutions, adding to 'res' list?
		EList<WalkResult<index_t>, 16>* res,   // EList to append resolutions
		WalkMetrics& met)             // update these metrics
	{
		assert(inited_);
		assert_eq(step, lastStep_+1);
		ASSERT_ONLY(lastStep_++);
		assert_leq((index_t)step, ebwt.eh().len());
		assert_lt(range, st.size());
		pair<int, int> ret = make_pair(0, 0);
		index_t trimBegin = 0, trimEnd = 0;
		bool empty = true; // assume all resolved until proven otherwise
		// Commit new information, if any, to the PListSlide.  Also,
		// trim and check if we're done.
		for(size_t i = mapi_; i < map_.size(); i++) {
			bool resolved = (off(i, sa) != (index_t)OFF_MASK);
			if(!resolved) {
				// Elt not resolved yet; try to resolve it now
				index_t bwrow = (index_t)(top - mapi_ + i);
				index_t toff = ebwt.tryOffset(bwrow);
				ASSERT_ONLY(index_t origBwRow = sa.topf + map(i));
				assert_eq(bwrow, ebwt.walkLeft(origBwRow, step));
				if(toff != (index_t)OFF_MASK) {
					// Yes, toff was resolvable
					assert_eq(toff, ebwt.getOffset(bwrow));
					met.resolves++;
#ifdef CENTRIFUGE
#else
					toff += step;
                    assert_eq(toff, ebwt.getOffset(origBwRow));
#endif
					setOff(i, toff, sa, met);
					if(!reportList) ret.first++;
#if 0
// used to be #ifndef NDEBUG, but since we no longer require that the reference
// string info be included, this is no longer relevant.

					// Sanity check that the reference characters under this
					// hit match the seed characters in hit.satup->key.seq.
					// This is NOT a check that we associated the exact right
					// text offset with the BW row.  This is an important
					// distinction because when resolved offsets are filled in
					// via refernce scanning, they are not necessarily the
					// exact right text offsets to associate with the
					// respective BW rows but they WILL all be correct w/r/t
					// the reference sequence underneath, which is what really
					// matters here.
					index_t tidx = (index_t)OFF_MASK, tof, tlen;
					bool straddled = false;
					ebwt.joinedToTextOff(
						hit.len, // length of seed
						toff,    // offset in joined reference string
						tidx,    // reference sequence id
						tof,     // offset in reference coordinates
						tlen,    // length of reference sequence
						true,    // don't reject straddlers
						straddled);
					if(tidx != (index_t)OFF_MASK &&
					   hit.satup->key.seq != std::numeric_limits<uint64_t>::max())
					{
						// key: 2-bit characters packed into a 64-bit word with
						// the least significant bitpair corresponding to the
						// rightmost character on the Watson reference strand.
						uint64_t key = hit.satup->key.seq;
						for(int64_t j = tof + hit.len-1; j >= tof; j--) {
							// Get next reference base to the left
							int c = ref.getBase(tidx, j);
							assert_range(0, 3, c);
							// Must equal least significant bitpair of key
							if(c != (int)(key & 3)) {
								// Oops; when we jump to the piece of the
								// reference where the seed hit is, it doesn't
								// match the seed hit.  Before dying, check
								// whether we have the right spot in the joined
								// reference string
								SString<char> jref;
								ebwt.restore(jref);
								uint64_t key2 = hit.satup->key.seq;
								for(int64_t k = toff + hit.len-1; k >= toff; k--) {
									int c = jref[k];
									assert_range(0, 3, c);
									assert_eq(c, (int)(key2 & 3));
									key2 >>= 2;
								}
								assert(false);
							}
							key >>= 2;
						}
					}
#endif
				}
			}
			// Is the element resolved?  We ask this regardless of how it was
			// resolved (whether this function did it just now, whether it did
			// it a while ago, or whether some other function outside GroupWalk
			// did it).
			if(off(i, sa) != (index_t)OFF_MASK) {
				if(reportList && !hit.reported(map(i))) {
					// Report it
					index_t toff = off(i, sa);
					assert(res != NULL);
					res->expand();
					index_t origBwRow = sa.topf + map(i);
					res->back().init(
						hit.offidx, // offset idx
						hit.fw,     // orientation
						hit.range,  // original range index
						map(i),     // original element offset
						origBwRow,  // BW row resolved
						hit.len,    // hit length
						toff);      // text offset
					hit.setReported(map(i));
					met.reports++;
				}
				// Offset resolved
				if(empty) {
					// Haven't seen a non-empty entry yet, so we
					// can trim this from the beginning.
					trimBegin++;
				} else {
					trimEnd++;
				}
			} else {
				// Offset not yet resolved
				ret.second++;
				trimEnd = 0;
				empty = false;
				// Set the forward map in the corresponding GWHit
				// object to point to the appropriate element of our
				// range
				assert_geq(i, mapi_);
				index_t bmap = map(i);
				hit.fmap[bmap].first = range;
				hit.fmap[bmap].second = (index_t)i;
#ifndef NDEBUG
				for(size_t j = 0; j < bmap; j++) {
					if(sa.offs[j] == (index_t)OFF_MASK &&
					   hit.fmap[j].first == range)
					{
						assert_neq(i, hit.fmap[j].second);
					}
				}
#endif
			}
		}
		// Trim from beginning
		assert_geq(trimBegin, 0);
		mapi_ += trimBegin;
		top += trimBegin;
		if(trimEnd > 0) {
			// Trim from end
			map_.resize(map_.size() - trimEnd);
			bot -= trimEnd;
		}
		if(empty) {
			assert(done());
#ifndef NDEBUG
			// If range is done, all elements from map should be
			// resolved
			for(size_t i = mapi_; i < map_.size(); i++) {
				assert_neq((index_t)OFF_MASK, off(i, sa));
			}
			// If this range is done, then it should be the case that
			// all elements in the corresponding GWHit that point to
			// this range are resolved.
			for(size_t i = 0; i < hit.fmap.size(); i++) {
				if(sa.offs[i] == (index_t)OFF_MASK) {
					assert_neq(range, hit.fmap[i].first);
				}
			}
#endif
			return ret;
		} else {
			assert(!done());
		}
		// Is there a dollar sign in the middle of the range?
		assert_neq(top, ebwt._zOff);
		assert_neq(bot-1, ebwt._zOff);
		if(ebwt._zOff > top && ebwt._zOff < bot-1) {
			// Yes, the dollar sign is in the middle of this range.  We
			// must split it into the two ranges on either side of the
			// dollar.  Let 'bot' and 'top' delimit the portion of the
			// range prior to the dollar.
			index_t oldbot = bot;
			bot = ebwt._zOff;
			// Note: might be able to do additional trimming off the
			// end.
			// Create a new range for the portion after the dollar.
			st.expand();
			st.back().reset();
			index_t ztop = ebwt._zOff+1;
			st.back().initMap(oldbot - ztop);
			assert_eq((index_t)map_.size(), oldbot-top+mapi_);
			for(index_t i = ztop; i < oldbot; i++) {
				st.back().map_[i - ztop] = map(i-top+mapi_);
			}
			map_.resize(bot - top + mapi_);
			st.back().init(
				ebwt,
				ref,
				sa,
				st,
				hit,
				(index_t)st.size()-1,
				reportList,
				res,
				ztop,
				oldbot,
				step,
				met);
		}
		assert_gt(bot, top);
		// Prepare SideLocus's for next step
		if(bot-top > 1) {
			SideLocus<index_t>::initFromTopBot(top, bot, ebwt.eh(), ebwt.ebwt(), tloc, bloc);
			assert(tloc.valid()); assert(tloc.repOk(ebwt.eh()));
			assert(bloc.valid()); assert(bloc.repOk(ebwt.eh()));
		} else {
			tloc.initFromRow(top, ebwt.eh(), ebwt.ebwt());
			assert(tloc.valid()); assert(tloc.repOk(ebwt.eh()));
			bloc.invalidate();
		}
		return ret;
	}
	
#ifndef NDEBUG
	/**
	 * Check if this GWP is internally consistent.
	 */
	bool repOk(
		const Ebwt<index_t>& ebwt,
		GWHit<index_t, T>& hit,
		index_t range) const
	{
		assert(done() || bot > top);
		assert(doneResolving(hit) || (tloc.valid() && tloc.repOk(ebwt.eh())));
		assert(doneResolving(hit) || bot == top+1 || (bloc.valid() && bloc.repOk(ebwt.eh())));
		assert_eq(map_.size()-mapi_, bot-top);
		// Make sure that 'done' is compatible with whether we have >=
		// 1 elements left to resolve.
		int left = 0;
		for(size_t i = mapi_; i < map_.size(); i++) {
			ASSERT_ONLY(index_t row = (index_t)(top + i - mapi_));
			ASSERT_ONLY(index_t origRow = hit.satup->topf + map(i));
			assert(step == 0 || row != origRow);
			assert_eq(row, ebwt.walkLeft(origRow, step));
			assert_lt(map_[i], hit.satup->offs.size());
			if(off(i, hit) == (index_t)OFF_MASK) left++;
		}
		assert(repOkMapRepeats());
		assert(repOkMapInclusive(hit, range));
		return true;
	}
	
	/**
	 * Return true iff this GWState is not obviously corrupt.
	 */
	bool repOkBasic() {
		assert_geq(bot, top);
		return true;
	}

	/**
	 * Check that the fmap elements pointed to by our map_ include all
	 * of the fmap elements that point to this range.
	 */
	bool repOkMapInclusive(GWHit<index_t, T>& hit, index_t range) const {
		for(size_t i = 0; i < hit.fmap.size(); i++) {
			if(hit.satup->offs[i] == (index_t)OFF_MASK) {
				if(range == hit.fmap[i].first) {
					ASSERT_ONLY(bool found = false);
					for(size_t j = mapi_; j < map_.size(); j++) {
						if(map(j) == i) {
							ASSERT_ONLY(found = true);
							break;
						}
					}
					assert(found);
				}
			}
		}
		return true;
	}
	
	/**
	 * Check that no two elements in map_ are the same.
	 */
	bool repOkMapRepeats() const {
		for(size_t i = mapi_; i < map_.size(); i++) {
			for(size_t j = i+1; j < map_.size(); j++) {
				assert_neq(map_[i], map_[j]);
			}
		}
		return true;
	}
#endif
	
	/**
	 * Return the offset currently assigned to the ith element.  If it
	 * has not yet been resolved, return 0xffffffff.
	 */
	index_t off(
				index_t i,
				const SARangeWithOffs<T>& sa)
	{
		assert_geq(i, mapi_);
		assert_lt(i, map_.size());
		assert_lt(map_[i], sa.offs.size());
		return sa.offs.get(map_[i]);
	}

	/**
	 * Return the offset of the element within the original range's
	 * PListSlice that the ith element of this range corresponds to.
	 */
	index_t map(index_t i) const {
		assert_geq(i, mapi_);
		assert_lt(i, map_.size());
		return map_[i];
	}

	/**
	 * Return the offset of the first untrimmed offset in the map.
	 */
	index_t mapi() const {
		return mapi_;
	}

	/**
	 * Return number of active elements in the range being tracked by
	 * this GWState.
	 */
	index_t size() const {
		return map_.size() - mapi_;
	}
	
	/**
	 * Return true iff all elements in this leaf range have been
	 * resolved.
	 */
	bool done() const {
		return size() == 0;
	}

	/**
	 * Set the PListSlice element that corresponds to the ith element
	 * of 'map' to the specified offset.
	 */
	void setOff(
		index_t i,
		index_t off,
		SARangeWithOffs<T>& sa,
		WalkMetrics& met)
	{
		assert_lt(i + mapi_, map_.size());
		assert_lt(map_[i + mapi_], sa.offs.size());
		size_t saoff = map_[i + mapi_];
		sa.offs[saoff] = off;
		assert_eq(off, sa.offs[saoff]);
	}

	/**
	 * Advance this GWState by one step (i.e. one BW operation).  In
	 * the event of a "split", more elements are added to the EList
	 * 'st', which must have room for at least 3 more elements without
	 * needing another expansion.  If an expansion of 'st' is
	 * triggered, this GWState object becomes invalid.
	 *
	 * Returns a pair of numbers, the first being the number of
	 * resolved but unreported offsets found during this advance, the
	 * second being the number of as-yet-unresolved offsets.
	 */
	template <int S>
	pair<int, int> advance(
		const Ebwt<index_t>& ebwt,   // the forward Bowtie index, for stepping left
		const BitPairReference& ref, // bitpair-encoded reference
		SARangeWithOffs<T>& sa,      // SA range with offsets
		GWHit<index_t, T>& hit,      // the associated GWHit object
		index_t range,               // which range is this?
		bool reportList,             // if true, "report" resolved offsets immediately by adding them to 'res' list
		EList<WalkResult<index_t>, 16>* res,  // EList where resolved offsets should be appended
		EList<GWState, S>& st,       // EList of GWStates for range being advanced
		GroupWalkState<index_t>& gws,         // temporary storage for masks
		WalkMetrics& met,
		PerReadMetrics& prm)
	{
		ASSERT_ONLY(index_t origTop = top);
		ASSERT_ONLY(index_t origBot = bot);
		assert_geq(step, 0);
		assert_eq(step, lastStep_);
		assert_geq(st.capacity(), st.size() + 4);
		assert(tloc.valid()); assert(tloc.repOk(ebwt.eh()));
		assert_eq(bot-top, (index_t)(map_.size()-mapi_));
		pair<int, int> ret = make_pair(0, 0);
		assert_eq(top, tloc.toBWRow());
		if(bloc.valid()) {
			// Still multiple elements being tracked
			assert_lt(top+1, bot);
			index_t upto[4], in[4];
			upto[0] = in[0] = upto[1] = in[1] =
			upto[2] = in[2] = upto[3] = in[3] = 0;
			assert_eq(bot, bloc.toBWRow());
			met.bwops++;
			prm.nExFmops++;
			// Assert that there's not a dollar sign in the middle of
			// this range
			assert(bot <= ebwt._zOff || top > ebwt._zOff);
			ebwt.mapLFRange(tloc, bloc, bot-top, upto, in, gws.masks);
#ifndef NDEBUG
			for(int i = 0; i < 4; i++) {
			  assert_eq(bot-top, (index_t)(gws.masks[i].size()));
			}
#endif
			bool first = true;
			ASSERT_ONLY(index_t sum = 0);
			index_t newtop = 0, newbot = 0;
			gws.map.clear();
			for(int i = 0; i < 4; i++) {
				if(in[i] > 0) {
					// Non-empty range resulted
					if(first) {
						// For the first one, 
						first = false;
						newtop = upto[i];
						newbot = newtop + in[i];
						assert_leq(newbot-newtop, bot-top);
						// Range narrowed so we have to look at the masks
						for(size_t j = 0; j < gws.masks[i].size(); j++) {
							assert_lt(j+mapi_, map_.size());
							if(gws.masks[i][j]) {
								gws.map.push_back(map_[j+mapi_]);
								assert(gws.map.size() <= 1 || gws.map.back() != gws.map[gws.map.size()-2]);
#ifndef NDEBUG
								// If this element is not yet resolved,
								// then check that it really is the
								// expected number of steps to the left
								// of the corresponding element in the
								// root range
								assert_lt(gws.map.back(), sa.size());
								if(sa.offs[gws.map.back()] == (index_t)OFF_MASK) {
									assert_eq(newtop + gws.map.size() - 1,
											  ebwt.walkLeft(sa.topf + gws.map.back(), step+1));
								}
#endif
							}
						}
 						assert_eq(newbot-newtop, (index_t)(gws.map.size()));
					} else {
						// For each beyond the first, create a new
						// GWState and add it to the GWState list. 
						// NOTE: this can cause the underlying list to
						// be expanded which in turn might leave 'st'
						// pointing to bad memory.
						st.expand();
						st.back().reset();
						index_t ntop = upto[i];
						index_t nbot = ntop + in[i];
						assert_lt(nbot-ntop, bot-top);
						st.back().mapi_ = 0;
						st.back().map_.clear();
						met.branches++;
						// Range narrowed so we have to look at the masks
						for(size_t j = 0; j < gws.masks[i].size(); j++) {
							if(gws.masks[i][j]) st.back().map_.push_back(map_[j+mapi_]);
						}
						pair<int, int> rret =
						st.back().init(
							ebwt,        // forward Bowtie index
							ref,         // bitpair-encodede reference
							sa,          // SA range with offsets
							st,          // EList of all GWStates associated with original range
							hit,         // associated GWHit object
							(index_t)st.size()-1, // range offset
							reportList,  // if true, report hits to 'res' list
							res,         // report hits here if reportList is true
							ntop,        // BW top of new range
							nbot,        // BW bot of new range
							step+1,      // # steps taken to get to this new range
							met);        // update these metrics
						ret.first += rret.first;
						ret.second += rret.second;
					}
					ASSERT_ONLY(sum += in[i]);
				}
			}
			mapi_ = 0;
			assert_eq(bot-top, sum);
			assert_gt(newbot, newtop);
			assert_leq(newbot-newtop, bot-top);
			assert(top != newtop || bot != newbot);
			//assert(!(newtop < top && newbot > top));
			top = newtop;
			bot = newbot;
			if(!gws.map.empty()) {
				map_ = gws.map;
			}
			//assert(repOkMapRepeats());
			//assert(repOkMapInclusive(hit, range));
			assert_eq(bot-top, (index_t)map_.size());
		} else {
			// Down to one element
			assert_eq(bot, top+1);
			assert_eq(1, map_.size()-mapi_);
			// Sets top, returns char walked through (which we ignore)
			ASSERT_ONLY(index_t oldtop = top);
			met.bwops++;
			prm.nExFmops++;
			ebwt.mapLF1(top, tloc);
			assert_neq(top, oldtop);
			bot = top+1;
			if(mapi_ > 0) {
				map_[0] = map_[mapi_];
				mapi_ = 0;
			}
			map_.resize(1);
		}
		assert(top != origTop || bot != origBot);
		step++;
		assert_gt(step, 0);
		assert_leq((index_t)step, ebwt.eh().len());
		pair<int, int> rret =
		init<S>(
			ebwt,       // forward Bowtie index
			ref,        // bitpair-encodede reference
			sa,         // SA range with offsets
			st,         // EList of all GWStates associated with original range
			hit,        // associated GWHit object
			range,      // range offset
			reportList, // if true, report hits to 'res' list
			res,        // report hits here if reportList is true
			met);       // update these metrics
		ret.first += rret.first;
		ret.second += rret.second;
		return ret;
	}

	/**
	 * Clear all state in preparation for the next walk.
	 */
	void reset() {
		top = bot = step = mapi_ = 0;
		ASSERT_ONLY(lastStep_ = -1);
		ASSERT_ONLY(inited_ = false);
		tloc.invalidate();
		bloc.invalidate();
		map_.clear();
	}
	
	/**
	 * Resize the map_ field to the given size.
	 */
	void initMap(size_t newsz) {
		mapi_ = 0;
		map_.resize(newsz);
		for(size_t i = 0; i < newsz; i++) {
			map_[i] = (index_t)i;
		}
	}

	/**
	 * Return true iff all rows corresponding to this GWState have been
	 * resolved and reported.
	 */
	bool doneReporting(const GWHit<index_t, T>& hit) const {
		for(size_t i = mapi_; i < map_.size(); i++) {
			if(!hit.reported(map(i))) return false;
		}
		return true;
	}

	/**
	 * Return true iff all rows corresponding to this GWState have been
	 * resolved (but not necessarily reported).
	 */
	bool doneResolving(const SARangeWithOffs<T>& sa) const {
		for(size_t i = mapi_; i < map_.size(); i++) {
			if(sa.offs[map(i)] == (index_t)OFF_MASK) return false;
		}
		return true;
	}

	SideLocus<index_t> tloc;      // SideLocus for top
	SideLocus<index_t> bloc;      // SideLocus for bottom
	index_t            top;       // top elt of range in BWT
	index_t            bot;       // bot elt of range in BWT
	int                step;      // how many steps have we walked to the left so far

protected:
	
	ASSERT_ONLY(bool inited_);
	ASSERT_ONLY(int lastStep_);
	EList<index_t, 16> map_; // which elts in range 'range' we're tracking
	index_t mapi_;           // first untrimmed element of map
};

template<typename index_t, typename T, int S>
class GroupWalk2S {
public:
	typedef EList<GWState<index_t, T>, S> TStateV;

	GroupWalk2S() : st_(8, GW_CAT) {
		reset();
	}
	
	/**
	 * Reset the GroupWalk in preparation for the next SeedResults.
	 */
	void reset() {
		elt_ = rep_ = 0;
		ASSERT_ONLY(inited_ = false);
	}

	/**
	 * Initialize a new group walk w/r/t a QVal object.
	 */
	void init(
		const Ebwt<index_t>& ebwtFw, // forward Bowtie index for walking left
		const BitPairReference& ref, // bitpair-encoded reference
		SARangeWithOffs<T>& sa,      // SA range with offsets
		RandomSource& rnd,           // pseudo-random generator for sampling rows
		WalkMetrics& met)            // update metrics here
	{
		reset();
#ifndef NDEBUG
		inited_ = true;
#endif
		// Init GWHit
		hit_.init(sa, 0, false, 0);
		// Init corresponding GWState
		st_.resize(1);
		st_.back().reset();
		assert(st_.back().repOkBasic());
		index_t top = sa.topf;
		index_t bot = (index_t)(top + sa.size());
		st_.back().initMap(bot-top);
		st_.ensure(4);
		st_.back().init(
			ebwtFw,             // Bowtie index
			ref,                // bitpair-encoded reference
			sa,                 // SA range with offsets
			st_,                // EList<GWState>
			hit_,               // GWHit
			0,                  // range 0
			false,              // put resolved elements into res_?
			NULL,               // put resolved elements here
			top,                // BW row at top
			bot,                // BW row at bot
			0,                  // # steps taken
			met);               // update metrics here
		elt_ += sa.size();
		assert(hit_.repOk(sa));
	}

	//
	// ELEMENT-BASED
	//

	/**
	 * Advance the GroupWalk until all elements have been resolved.
	 * FIXME FB: Commented as the types of advanceElements do not correlate with the types of the function definition.
	 */
//	void resolveAll(WalkMetrics& met, PerReadMetrics& prm) {
//		WalkResult<index_t> res; // ignore results for now
//		for(size_t i = 0; i < elt_; i++) {
//			advanceElement((index_t)i, res, met, prm);
//		}
//	}

	/**
	 * Advance the GroupWalk until the specified element has been
	 * resolved.
	 */
	bool advanceElement(
		index_t elt,                  // element within the range
		const Ebwt<index_t>& ebwtFw,  // forward Bowtie index for walking left
		const BitPairReference& ref,  // bitpair-encoded reference
		SARangeWithOffs<T>& sa,       // SA range with offsets
		GroupWalkState<index_t>& gws, // GroupWalk state; scratch space
		WalkResult<index_t>& res,     // put the result here
		WalkMetrics& met,             // metrics
		PerReadMetrics& prm)          // per-read metrics
	{
		assert(inited_);
		assert(!done());
		assert(hit_.repOk(sa));
		assert_lt(elt, sa.size()); // elt must fall within range
		// Until we've resolved our element of interest...
		while(sa.offs[elt] == (index_t)OFF_MASK) {
			// Get the GWState that contains our element of interest
			size_t range = hit_.fmap[elt].first;
			st_.ensure(4);
			GWState<index_t, T>& st = st_[range];
			assert(!st.doneResolving(sa));
			// Returns a pair of numbers, the first being the number of
			// resolved but unreported offsets found during this advance, the
			// second being the number of as-yet-unresolved offsets.
			st.advance(
				ebwtFw,
				ref,
				sa,
				hit_,
				(index_t)range,
				false,
				NULL,
				st_,
				gws,
				met,
				prm);
			assert(sa.offs[elt] != (index_t)OFF_MASK ||
			       !st_[hit_.fmap[elt].first].doneResolving(sa));
		}
		assert_neq((index_t)OFF_MASK, sa.offs[elt]);
		// Report it!
		if(!hit_.reported(elt)) {
			hit_.setReported(elt);
		}
		met.reports++;
		res.init(
			0,              // seed offset
			false,          // orientation
			0,              // range
			elt,            // element
			sa.topf + elt,  // bw row
			(index_t)sa.len, // length of hit
			sa.offs[elt]);  // resolved text offset
		rep_++;
		return true;
	}

	/**
	 * Return true iff all elements have been resolved and reported.
	 */
	bool done() const { return rep_ == elt_; }
	
#ifndef NDEBUG
	/**
	 * Check that GroupWalk is internally consistent.
	 */
	bool repOk(const SARangeWithOffs<T>& sa) const {
		assert(hit_.repOk(sa));
		assert_leq(rep_, elt_);
		// This is a lot of work
		size_t resolved = 0, reported = 0;
		// For each element
		const size_t sz = sa.size();
		for(size_t m = 0; m < sz; m++) {
			// Is it resolved?
			if(sa.offs[m] != (index_t)OFF_MASK) {
				resolved++;
			} else {
				assert(!hit_.reported(m));
			}
			// Is it reported?
			if(hit_.reported(m)) {
				reported++;
			}
			assert_geq(resolved, reported);
		}
		assert_geq(resolved, reported);
		assert_eq(rep_, reported);
		assert_eq(elt_, sz);
		return true;
	}
#endif

	/**
	 * Return the number of BW elements that we can resolve.
	 */
	index_t numElts() const { return elt_; }
	
	/**
	 * Return the size occupied by this GroupWalk and all its constituent
	 * objects.
	 */
	size_t totalSizeBytes() const {
		return 2 * sizeof(size_t) + st_.totalSizeBytes() + sizeof(GWHit<index_t, T>);
	}
	/**
	 * Return the capacity of this GroupWalk and all its constituent objects.
	 */
	size_t totalCapacityBytes() const {
		return 2 * sizeof(size_t) + st_.totalCapacityBytes() + sizeof(GWHit<index_t, T>);
	}
	
#ifndef NDEBUG
	bool initialized() const { return inited_; }
#endif
	
protected:

	ASSERT_ONLY(bool inited_);    // initialized?
	
	index_t elt_;    // # BW elements under the control of the GropuWalk
	index_t rep_;    // # BW elements reported

	// For each orientation and seed offset, keep a GWState object that
	// holds the state of the walk so far.
	TStateV st_;

	// For each orientation and seed offset, keep an EList of GWHit.
	GWHit<index_t, T> hit_;
};

#endif /*GROUP_WALK_H_*/