src/namgene.cc - augustus (upstream/3.3.1+dfsg)

Tree @upstream/3.3.1+dfsg (Download .tar.gz)

namgene.cc @upstream/3.3.1+dfsg — raw · history · blame

/*******************************************************************************************************
 * file:    namgene.cc
 * licence: Artistic Licence, see file LICENCE.TXT or 
 *          http://www.opensource.org/licenses/artistic-license.php
 * descr.:  
 * authors: Mario Stanke (mario@gobics.de), Stafilarakis
 *
 * date    |   author      |  changes 
 * --------|---------------|----------------------------------------------------------------------------- 
 * 26.09.01| Stafilarakis  | creation of the class
 * ??.11.02| Stanke        | getPathEmiProb
 * 09.03.04| Stefanie Dahms| getNextCutEndPoint
 *         | Stanke        | forward algorithm, sampling, sfc, ...
 * 09.09.05| Mario Stanke  | alternative transcripts and posterior probs from sampling
 * 13.07.06| Mario Stanke  | fixed minor bug: mixed up geneBegin and codingstart for sorting transcripts
 * 31.08.06| Mario Stanke  | rewrote getNextCutEndPoint
 * 21.09.06| Mario Stanke  | getStepGenes, etc.
 * 17.01.07| Mario Stanke  | computeReachableStates: save time by not going into irrelevant states
 *******************************************************************************************************/

#include "namgene.hh"

// project includes
#include "pp_scoring.hh" 
#include "statemodel.hh"
#include "motif.hh"
#include "gene.hh"
#include "projectio.hh"  // for comment, goto_line_after
#include "mea.hh"
#include "exoncand.hh"

// standard C/C++ includes
#include <iomanip>  // for setprecision
#include <iostream>

int synchstate = 0;

NAMGene::NAMGene() {
  /*
   * Prepare the following:
   *  - determine number of states
   *  - create states
   *  - read in transition probabilities
   */
  statecount = Properties::getIntProperty( "/NAMGene/statecount" );
  try {
      noInFrameStop = Properties::getBoolProperty("noInFrameStop");
  } catch (...) {
      noInFrameStop = false;
  }
  try {
    minmeanexonintronprob = Properties::getdoubleProperty("minmeanexonintronprob");
  } catch (...) {
    minmeanexonintronprob = 0.0;
  }
  try {
    minexonintronprob = Properties::getdoubleProperty("minexonintronprob");
  } catch (...) {
    minexonintronprob = 0.0;
  }
  try {
    maxtracks = Properties::getIntProperty("maxtracks");
  } catch (...) {
    maxtracks = -1; // default, no maximum
  }
  try {
    sampleiterations = Properties::getIntProperty("sample");
  } catch (...) {
    sampleiterations = 0;
  }
  if (sampleiterations > 0) {
      if (sampleiterations < 10){
	  cerr << "Error: Number of sample iterations is too low. (sample=" << sampleiterations << ")" << endl
	       << "I will not sample (sample=0) and will not estimate posterior probabilities." << endl;
	  sampleiterations = 0;
      } else if (sampleiterations < 20){
	  cerr << "Warning: Number of sample iterations 'sample'=" << sampleiterations << " is low." << endl
	       << "Posterior probabilities will be only rough estimates." << endl;
      }
  }
  try {
      alternatives_from_sampling = Properties::getBoolProperty("alternatives-from-sampling");
  } catch (...) {
      alternatives_from_sampling = false;
  }
  try {
      show_progress = Properties::getBoolProperty("progress");
  } catch (...) {
    show_progress = false;
  }

  try {
      mea_prediction = Properties::getBoolProperty("mea");
  } catch (...) {
    mea_prediction = false;
  }

  if(mea_prediction || Constant::MultSpeciesMode){
    sampleiterations = 100;
    alternatives_from_sampling = true;
    minexonintronprob = 0;
    minmeanexonintronprob = 0;
  }

  try {
    mea_eval = Properties::getBoolProperty("mea_evaluation");
  } catch (...) {
    mea_eval = false;
  }

  // Read in protein profile
  try {
      PP::Profile* prfl;
      try {
	  prfl = new PP::Profile(Properties::getProperty("proteinprofile"));
      } catch (PP::ProfileNotFoundError e) {
	  try {
	      prfl = new PP::Profile(Properties::getConfigFilename("proteinprofile"));
	  } catch (PP::ProfileNotFoundError) {
	      throw e;
	  }
      }
      profileModel = prfl ? new PP::SubstateModel(*prfl) : NULL;
      if (prfl) {
	  cout << "# Using protein profile " << prfl->getName() << "\n";
	  cout << "# " << prfl->getInfoLine() << "\n";
      }
  } catch (KeyNotFoundError) {
      profileModel = NULL;
  } catch (PP::ProfileInsigError e) {
      cerr << e.getMessage() << ". Not using protein profile.\n";
      cerr << "Choose different block thresholds to enforce using this profile.\n";
      cout << "# Not using protein profile.";
      profileModel = NULL;
  }

  needForwardTable = (sampleiterations > 0);
  readTransAndInitProbs();
  readOvlpLenDist();
  checkProbsConsistency();
  computeReachableStates();
  createStateModels();
  
  /*
   * Create map of state types
   */
  stateMap.resize(statecount);
  for (int i=0; i< statecount; i++) {
    stateMap[i] = getStateType(i);
  }

  if (Constant::temperature && Constant::augustus_verbosity)
      cout << "# setting temperature to " << Constant::temperature << " (for sampling)" << endl;
  sampledTxs = NULL;
}

void NAMGene::setStatesInitialProbs(){  
    for( int i = 0; i < statecount; i++ ) {
	viterbi[0][i] = initProbs[i];
	if (needForwardTable)
	    forward[0][i] = initProbs[i];
    }
}

StateType NAMGene::getStateType(int i){
    if (i >= 0 && i < statecount)
	return states[i]->getStateType();
    else
	return TYPE_UNKNOWN;
}

int NAMGene::getStateIndex(StateType type){
    for( int i = 0; i < statecount; i++ ) 
	if (getStateType(i) == type)
	    return i;
    string errmsg = "Unknown state type ";
    errmsg += type;
    throw NAMGeneError( errmsg);
}

void NAMGene::viterbiAndForward( const char* dna, bool useProfile){
 
  StateModel::setPP(useProfile ? profileModel : NULL);  
  int progress, oldprogress=0;

  /*
   * Determine the length of the DNA sequence under consideration
   * and form the forward matrix
   */
  int dnalen = strlen( dna );
  viterbi.assign(dnalen, statecount);
  if (needForwardTable) 
      forward.assign(dnalen, statecount);

  /*
   * Initialize the first Viterbi column with InitialProbs
   */
  setStatesInitialProbs();

  /*
   * The heart of the algorithm.
   * The actual work is done in the states.
   */
  OptionListItem oli;
  int deletionEnd = 0;
  cs.computeStairs(dna);
  StateModel::setContentStairs(&cs);

  curGCIdx = -1; // initialize with invalid GC content class
  prepareModels(dna, dnalen);
  

  /*
   * Check if all the sequence contains of non-nucleotides (e.g. N)
   * If yes, do not execute Viterbi/Forward: everything is considered intergenic
   */

  if (containsJustNonNucs(dna, dnalen)) {
#ifdef DEBUG
    cout << "#warning: sequence contains just letters different from a,c,g,t: " << dnalen << " of them." << endl;
#endif
    curGCIdx = cs.idx[0];
    initAlgorithms(); // this indirectly calls initPredecessors required for backtracking later
    updateToLocalGCEach(curGCIdx);
    for( int j = 1; j < dnalen; j++ ) {
      for( int i = 0; i < statecount; i++ ){
	if (i == synchstate) {
	    viterbi[j][i] = viterbi[j-1][i]/4;
	  if (needForwardTable)
	      forward[j][i] = forward[j-1][i]/4;
	} else {
	    viterbi[j].erase(i);
	    if (needForwardTable)
		forward[j].erase(i);
	}
      }
    }
    return;
  }

  if (show_progress){
      cerr << "viterbi algorithm progress:\n[%]: ";
      oldprogress = 0;
  }
#ifdef DEBUG
  double capacity=0;
  double fcapacity=0;
  cerr << "debug activated!" << endl;
  vector<string> statevec;
  for (int i=0; i<statecount; i++)
      statevec.push_back(stateTypeNames[getStateType(i)]);
  double beforeCount=0.0, afterCount=0.0;
  int mincount = 1000;
#endif

  initAlgorithms(); // update GC content dependent parameters
  for( int j = 1; j < dnalen; j++ ) { // TODO: this ignores the first nucleotide
      if (cs.idx[j] != curGCIdx) {// check whether GC content has changed, this is in particular the case at the very start
          curGCIdx = cs.idx[j];
	  updateToLocalGCEach(curGCIdx, j, cs.getNextStep(j) - 1); // update GC content dependent parameters
      }
      if (show_progress) {
	  progress = 1+100*j/dnalen;
	  if (progress > oldprogress){
	      if (progress%10<9 && progress%10>0)
		  cerr << ".";
	      else if (progress%10==0)
		  cerr << progress;
	  }
	  oldprogress = progress;
      }
      if (useProfile) 
	  profileModel->advanceScores(j);
      for( int i = 0; i < statecount; i++ ){
	  if (stateReachable[i]) {
	      states[i]->viterbiForwardAndSampling(viterbi, forward, i, j, doViterbi(needForwardTable), oli);
	  }
      }
      if (j % 1000 == 0) {
#ifdef DEBUG 
	  cerr << "[" << j;
#endif
	  int deletedSubmaps=0;
	  int newDeletionEnd = StateModel::getActiveWindowStart(j);
	  while (deletionEnd < newDeletionEnd) {
	      deletedSubmaps += viterbi[++deletionEnd].eraseUnneededSubstates();
#ifdef DEBUG
	      // viterbi[deletionEnd].checkAfterWindow();
#endif
	  }

#ifdef DEBUG
	  cerr << "]";
#endif 
      }
      
#ifdef DEBUG
      try {
	  viterbi[j].testIntegrity();
      } catch (...) {
	  cerr << "j=" << j << "\n";
	  throw;
      }
      capacity += viterbi[j].capacity();
      if (needForwardTable) 
	  fcapacity += forward[j].capacity();
#endif
      if (useProfile) {
#ifdef DEBUG
	  beforeCount += viterbi[j].totalSubstateCount();
#endif
	  profileModel->clearLowScoringSubstates(viterbi[j]);
#ifdef DEBUG
	  int newcount = viterbi[j].totalSubstateCount();
	  afterCount += newcount;
	  if (newcount > mincount)  {
//	      viterbi[j].showSubstateInfo(cerr);
	      mincount = newcount;
	  }
#endif
      }
#ifdef DEBUG
# ifdef DEBUG_BACK
      if (useProfile) 
	  for ( int i=0; i<statecount; i++) 
	      if (viterbi[j].hasSubstates(i)) { 
		  if (i==8 || i==25 || i==1 || i==24) {
		      cerr << "hit at " << j << "\n";
		      states[i]->viterbiForwardAndSampling(viterbi, forward, i, j,
							   doBacktracking, oli);
		  } else {
		      const ViterbiSubmapType& submap = viterbi[j].getSubstates(i);
		      for (ViterbiSubmapType::const_iterator it = submap.begin();
			   it != submap.end();
			   ++it) {
			  states[i]->viterbiForwardAndSampling(viterbi, forward, getFullStateId(i, it->first), j,
							       doBacktracking, oli);
		      }
		  }
		  try {
		      viterbi[j].getSubstates(i).checkCounts();
		  } catch (ProjectError) {
		      throw ProjectError("checkCounts() failed at base=" + itoa(j) + ", state=" + itoa(i) + " (" + stateTypeIdentifiers[stateMap[i]] + ")");
		  }
	      }
# endif
#endif
  }
#ifdef DEBUG
  if (useProfile) {
# ifdef DEBUG_DELALL
      while (deletionEnd < dnalen-1) 
      viterbi[++deletionEnd].eraseUnneededSubstates();
      // for (int j=dnalen-1; j>0; j--) 
      //     viterbi[j].eraseUnneededSubstates();
      viterbi.showSubstateMemoryUsage();
      exit(0);
# endif
      fprintf(stderr, "Number of substates reduced to %2.0f%%, average number was %.0f, is now %.0f.\n", afterCount/beforeCount*100, beforeCount/(dnalen-1), afterCount/(dnalen-1));
      profileModel->currentResult.clear();
      profileModel->outputFullBlocks();
        cerr << "\nMemory usage of Viterbi matrix (without substates): " << 
	  (dnalen*(sizeof(ViterbiColumnType) + statecount) + capacity*sizeof(Double))/1024 << "KB.\n";
      viterbi.showSubstateMemoryUsage(0, 2*deletionEnd - dnalen);
      viterbi.showSubstateMemoryUsage(2*deletionEnd - dnalen, deletionEnd);
      viterbi.showSubstateMemoryUsage(deletionEnd);
  }
#endif
  if (show_progress)
      cerr << endl;

  /*
   * The multiplication of the terminal probabilities is not done here, because some algorithms
   * call viterbiForwardAndSampling and thus change the viterbi and forward table again.
   * This probability needs to be multiplied at the appropriate places.
   */

} // end ViterbiAndForward

StatePath* NAMGene::getSampledPath(const char *dna, const char* seqname){
  StatePath *sampledPath = new StatePath();
  if (!needForwardTable)
      return sampledPath;
  if (seqname)
      sampledPath->seqname = seqname;

  /* determine the starting state of the sampling, i.e. the last state 
   * of the sampled state path
   */
  int dnalen = forward.size(); 
  int base, state;
  Double pathEmiProb = 1.0;
  if (containsJustNonNucs(dna, dnalen)) {
      sampledPath->push(new State(0, dnalen-1, getStateType(synchstate)));
    for (int i=0; i<dnalen; i++) 
	pathEmiProb *= (float) .25;
  } else { // normal case
      const ViterbiColumnType& lastCol = forward[dnalen-1];
      OptionsList ol;
      OptionListItem oli;
      for (int i=0; i < statecount; i++) 
	  if (lastCol[i] * termProbs[i]>0.0)
	      ol.add(i, dnalen-1, lastCol[i] * termProbs[i]);
      ol.prepareSampling();
      oli = ol.sample();
    
      base = oli.base;
      state = oli.state;
      pathEmiProb *= oli.probability;
      oli.reset();

      while (base>0) {
	  if (cs.idx[base] != curGCIdx) { // check whether GC content has changed
	    curGCIdx = cs.idx[base];
	    // update GC content dependent parameters,
	    // no new computation of SegProbs (empty interval [2,1]
	    updateToLocalGCEach(curGCIdx, 2,1);
	  }
	  states[state]->viterbiForwardAndSampling( viterbi, forward, state, base, 
						    doSampling, oli);
	  State *temp = new State(oli.base+1, base, getStateType(state));
	  temp->setTruncFlag(base, oli.base, dnalen);
	  sampledPath->push(temp);
	  if (Constant::overlapmode && oli.predEnd > -INT_MAX)
	      base = oli.predEnd;
	  else
	      base = oli.base;
	  state = oli.state;
	  pathEmiProb *= oli.probability;
	  oli.reset();
      }
  }

  /*
   * this pathEmiProb is (should be equal) to what getPathEmiProb computes on the sampledPath
   */
  sampledPath->pathemiProb = pathEmiProb;
  return sampledPath;
}

/*
 * NAMGene::getViterbiPath
 * Create the Viterbi-Path
 */
StatePath* NAMGene::getViterbiPath(const char *dna, const char* seqname){
  OptionListItem oli;
  int dnalen = viterbi.size();
  StatePath *viterbiPath = new StatePath();
  int state=-1;
  Double maxValue(0.0), value;
  if (seqname != NULL) 
    viterbiPath->seqname = seqname;

  // allow viterbi path to end in any state
  const ViterbiColumnType& lastcol = viterbi[dnalen-1];
  for (int i=0; i < statecount; i++) {
      SubstateId substate;
      if (profileModel && profileModel->allow_truncated) 
	  lastcol.getMaxSubstate(i, substate, value); // allow viterbi path to end in substates too
      else 
	  value = lastcol[i];
      value *= termProbs[i];
      if (value > maxValue) {
	  maxValue = value;
	  state = getFullStateId(i, substate);
      }
  }
  if (maxValue <= 0) {
      throw ProjectError("No feasible path found in HMM");
  }

  /*
   * Set the common probability of the viterbi path and the emission
   */
  viterbiPath->pathemiProb = maxValue;
#ifdef DEBUG
  cerr << "Viterbi Score: " << maxValue << "\n";
  cerr << "ln ""        : " << maxValue.log() << "\n";
#endif
    int base = dnalen-1;
    oli.reset();
    
    while (base>0) {
	int stateidx;
	SubstateId substate;  
	getStatePair(state, stateidx, substate);
#ifdef DEBUG_STATES
	if (stateMap[stateidx] != igenic && !isGeometricIntron(stateMap[stateidx]) &&
	    !isRGeometricIntron(stateMap[stateidx])) 
	{
	    cerr << "state=" << stateidx << ", stateType=" << stateTypeIdentifiers[stateMap[stateidx]]
		 << ", base=" << base << ", ln vit=" <<  viterbi[base].get(stateidx, substate).log() << endl;
	    if (isCodingExon(stateMap[stateidx])) 
		cerr << endl;
	}
#endif
	// need to test on long example  seqs with index changes (bug fixed on Mar 25th, 2015)
	if (cs.idx[base] != curGCIdx) { // check whether GC content has changed
	    curGCIdx = cs.idx[base];
	    // update GC content dependent parameters,
	    // no new computation of SegProbs (empty interval [2,1]
	    updateToLocalGCEach(curGCIdx, 2, 1);
	}
	states[stateidx]->viterbiForwardAndSampling( viterbi, forward, state, base, 
						     doBacktracking, oli );
	if (!Constant::overlapmode && ((oli.base >= base && oli.state == state) || (oli.base > base + 10))) {
	  throw ProjectError("Viterbi got stuck at state " + itoa(state) + "and base " + itoa(base) +
			     ".\n(oli.state =" + itoa(oli.state) + ", oli.base="+ itoa(oli.base) +")");
	}
	State *temp = new State(oli.base+1, base, getStateType(stateidx));
	temp->setTruncFlag(base, oli.base, dnalen);
	viterbiPath->push(temp);
	if (Constant::overlapmode && oli.predEnd > -INT_MAX)
	  base = oli.predEnd;
	else 
	  base = oli.base;
	state = oli.state;
	oli.reset();
    }
    if (profileModel)
	profileModel->appendMatchesTo(viterbiPath->proteinMatches);
    return viterbiPath;
}

/*
 * NAMGene::doViterbiPiecewise
 * Iterate the viterbi algorithm on pieces of DNA small enough so that the DP matrices fit into memory.
 */
Transcript *NAMGene::doViterbiPiecewise(SequenceFeatureCollection& sfc, AnnoSequence *annoseq, Strand strand){
  list<AltGene> *geneList = new list<AltGene>;
  char *dna = annoseq->sequence;
  char *curdna;
  AnnoSequence *curAnnoSeq;
  char *seqname = annoseq->seqname;
  vector<Double> *origInitProbs;
  vector<Double> *origTermProbs;
  vector<Double> *synchProbs;
//  SequenceFeatureCollection *partSFC;
  static int geneid = 1; // made this static so gene numbering goes across sequences and is unique
  int transcriptid;
  int curdnalen;
  
  bool singlestrand = false; // singlestrand = no shadow states
  try {
    singlestrand = Properties::getBoolProperty("singlestrand");
  } catch (...) {}
  
  int maxstep = 1000000;
  int endPos, beginPos;
  int seqlen = strlen(dna);
  try {
    maxstep = Properties::getIntProperty( "maxDNAPieceSize" );
  } catch (...) {}
  if (maxstep < 1000) {
      cerr << "maxDNAPieceSize is too small: " << maxstep << endl;
      exit(1);
  }

  beginPos = 0;
  try {
    synchstate = Properties::getIntProperty( "/NAMGene/SynchState" );  
  } catch (...) {
    cerr << "No synch state specified!" << endl;
  }

  /*
   * If we can't do it in one piece - we must do multiple pieces.
   * At the cutting points only the synchstate is allowed. So
   * change the initial and terminal probabilities for the cutting points.
   */
  origInitProbs = new vector<Double>(statecount, 0.0);
  origTermProbs = new vector<Double>(statecount, 0.0);
  synchProbs = new vector<Double>(statecount, 0.0);
    
  for (int i=0; i<statecount; i++) {
    (*origInitProbs)[i] = initProbs[i];
    (*origTermProbs)[i] = termProbs[i];
    if (i == synchstate)
      (*synchProbs)[i] = 1.0;
    else
      (*synchProbs)[i] = 0.0;
  }
 
  /*
   * loop over the pieces
   */
  do {
     endPos = getNextCutEndPoint(dna, beginPos, maxstep, sfc);
     if (show_progress) 
	 cerr << "examining piece " << beginPos + annoseq->offset + 1 << ".." 
	      << endPos + annoseq->offset + 1<< " (" << (endPos-beginPos+1)
	      << " bp)" << endl;
#ifdef DEBUG
      cout << "# examining piece " << beginPos + annoseq->offset + 1<< ".." << endPos + annoseq->offset + 1 << " (" << (endPos-beginPos+1) << " bp)" << endl;
#endif
    curAnnoSeq = new AnnoSequence();
    curdnalen = endPos-beginPos+1;
    curAnnoSeq->length = curdnalen;
    curdna = newstrcpy(dna+beginPos, curdnalen);
    curAnnoSeq->sequence = curdna;
    curAnnoSeq->offset = annoseq->offset + beginPos;

    /*
     * Set the initial and terminal probabilities.
     */
    for (int i=0; i<statecount; i++) {
      if (beginPos == 0)
	initProbs[i] = (*origInitProbs)[i];
      else 
	initProbs[i] = (*synchProbs)[i];
      if (endPos == seqlen-1)
	termProbs[i] = (*origTermProbs)[i];
      else 
	termProbs[i] = (*synchProbs)[i];
    }

    list<AltGene> *pieceGenes = new list<AltGene>;

    if (!singlestrand) {
	SequenceFeatureCollection partSFC(sfc, beginPos, endPos);
	pieceGenes = getStepGenes(curAnnoSeq, partSFC, strand);
    } else {
	if (strand == plusstrand || strand == bothstrands){
	    SequenceFeatureCollection partSFC(sfc, beginPos, endPos);
	    pieceGenes = getStepGenes(curAnnoSeq, partSFC, plusstrand);
	}
	if (strand == minusstrand || strand == bothstrands) {
	    list<AltGene> *pieceForwardGenes, *pieceReverseGenes;
	    char *reverseDNA = reverseComplement(curdna);
	    delete [] curAnnoSeq->sequence;
	    curAnnoSeq->sequence = reverseDNA;
	    SequenceFeatureCollection partSFC(sfc, beginPos, endPos, true);
	    pieceForwardGenes = getStepGenes(curAnnoSeq, partSFC, plusstrand);
	    pieceReverseGenes = reverseGeneList(pieceForwardGenes, endPos-beginPos);
	    pieceGenes->splice(pieceGenes->end(), *pieceReverseGenes);
	}
    }
    pieceGenes->sort();
    // shift gene coordinates, set sequence name, gene and transcript names
    for (list<AltGene>::iterator agit = pieceGenes->begin(); agit != pieceGenes->end(); ++agit){
	agit->shiftCoordinates(beginPos + annoseq->offset);
	agit->seqname = seqname;
     
	if (Constant::uniqueGeneId){
	    char buf[38];
	    sprintf(buf, "%.30s.g%d", seqname, geneid);
	    agit->id = buf;
	} else {
	    agit->id = "g"+itoa(geneid);
	} // TEMPorarily uncommented
      
	agit->sortTranscripts();

	transcriptid = 1;
	for(list<Transcript*>::iterator it = agit->transcripts.begin();it != agit->transcripts.end(); ++it ) {
	    (*it)->seqname = seqname;
	    (*it)->id = "t" + itoa(transcriptid);
	    (*it)->geneid = agit->id;
	    transcriptid++;
	}
	geneid++;
    }
    // print the genes
    printGeneList(pieceGenes, annoseq, Constant::codSeqOutput, Constant::proteinOutput, sfc.collection->hasHintsFile);

    // append pieceGenes to geneList
    geneList->splice(geneList->end(), *pieceGenes);

    
	
    delete curAnnoSeq;
    beginPos = endPos + 1;
  } while (beginPos < seqlen);

  /*
   * Reset the original initial and terminial probabilities.
   */
  for (int i=0; i<statecount; i++) {
    initProbs[i] = (*origInitProbs)[i];
    termProbs[i] = (*origTermProbs)[i];
  }
  delete origInitProbs;
  delete origTermProbs;
  delete synchProbs;
  if (geneList->empty() && !Constant::MultSpeciesMode)
      cout << "# (none)" << endl;
  return getPtr(geneList);
}

/*
 * NAMGene::getStepGenes
 */
list<AltGene> *NAMGene::getStepGenes(AnnoSequence *annoseq, SequenceFeatureCollection& sfc, Strand strand, bool onlyViterbi){
    list<AltGene> *genesAllHints, *genesPartialHints, *genes;
    list<list<AltGene> *> *results;
    const char *dna = annoseq->sequence;
    char *curdna;
    int curdnalen;
    int n = strlen(dna);
    bool uniqueCDS; 
    try {
	uniqueCDS = Properties::getBoolProperty("uniqueCDS");
    } catch(...) {
	uniqueCDS = false;
    }

    sfc.setSeqLen(n);
    sfc.computeHintedSites(dna);
    sfc.makeGroups();
    sfc.prepareLocalMalus(dna);
    //sfc->printGroups(); //AUSGABE
    StateModel::setSFC(&sfc);
    /*
     * First make a prediction run on the whole sequence using all hints at the same time.
     */
    genesAllHints = findGenes(dna, strand, onlyViterbi);
#ifdef DEBUG
    // printGeneList(genesAllHints, annoseq, false, false, false);
#endif
    results = new list<list<AltGene> *>;
    results->push_back(genesAllHints);
    sfc.createPredictionScheme(genesAllHints);
    PredictionScheme *scheme  = sfc.predictionScheme;
#ifdef DEBUG
    //scheme->print(annoseq->offset);
#endif
    /*
     * Then make further runs with certain hintgroups deactivated.
     */
#ifdef DEBUG
    int numruns = scheme->predictionRuns.size();
    cout << "Make " << numruns << " additional prediction runs." << endl; 
#endif

    for (list<PredictionRun>::iterator rit = scheme->predictionRuns.begin(); rit != scheme->predictionRuns.end(); ++rit){
#ifdef DEBUG
	cout << "Make run ";
	rit->print();
#endif
	curdnalen = rit->end - rit->begin + 1;
	curdna = newstrcpy(dna + rit->begin, curdnalen);
	sfc.setActiveFlag(rit->omittedGroups, false);
	sfc.shift(-rit->begin);
	sfc.setSeqLen(curdnalen);
	sfc.computeHintedSites(curdna);
	sfc.prepareLocalMalus(curdna);
        // TODO: set initial and terminal probs to allow partial genes again.
	genesPartialHints = findGenes(curdna, strand, onlyViterbi); 
	for (list<AltGene>::iterator agit = genesPartialHints->begin(); agit != genesPartialHints->end(); ++agit)
	    agit->shiftCoordinates(rit->begin);
#ifdef DEBUG
	printGeneList(genesPartialHints, annoseq, false, false, false);
#endif
	results->push_back(genesPartialHints);
	sfc.setSeqLen(n);
	sfc.shift(rit->begin); // shift back
	sfc.setActiveFlag(rit->omittedGroups, true);
	delete [] curdna;
    }
    genes = sfc.joinGenesFromPredRuns(results, maxtracks, uniqueCDS);
    // TODO: delete results 
    postProcessGenes(genes, annoseq); // truncate masked UTRs
    return genes;
}


/*
 * NAMGene::findGenes
 * onlyViterbi: if true, only the Viterbi transcripts are output
 * numSample: number of sample iterations, if onlyViterbi is true, these are used
 * to compute the a posteriori probabilities of the Viterbi transcripts, exons, introns.
 *
 */

list<AltGene> *NAMGene::findGenes(const char *dna, Strand strand, bool onlyViterbi){
  list<AltGene> *agl;
  list<Transcript*> alltranscripts;
  list<Transcript*> *filteredTranscripts = new list<Transcript*>;
  list<Transcript*>::iterator geneit1, geneit2;
  if (sampleiterations < 1)
      sampleiterations = 1;
  list<AltGene>::iterator agit;
  Transcript *genes = NULL, *g;
  StatePath *viterbiPath;
  StatePath *condensedViterbiPath;
  
  // compute the viterbi and forward table, main work done here
  viterbiAndForward(dna, profileModel);
#ifdef DEBUG
  cerr << "After viterbi: average load is " << viterbi.load() << ",\n"
       << "                      used are " << viterbi.used_load() << " of " << statecount << " states.\n";
  // cerr << "      Forward: average load is " << forward.load() << ",\n"
  //      << "                      used are " << forward.used_load() << " of " << statecount << " states.\n";
  cerr << "globalThreshUsedCount=" << globalThreshUsedCount << endl;
  globalThreshUsedCount=0;
#endif
 

  /*
   * add the viterbi transcripts to the list of genes
   */
  viterbiPath = getViterbiPath(dna, "");
  //getPathEmiProb(viterbiPath, dna); // for testing
  condensedViterbiPath = StatePath::condenseStatePath(viterbiPath);
  //cout << "Viterbi path:" << endl;
  //viterbiPath->print(); // for testing
  //cout << "Condensed Viterbi path:" << endl;
  //condensedViterbiPath->print(); // for testing
  //printDPMatrix();
  genes = condensedViterbiPath->projectOntoGeneSequence("g");
  delete viterbiPath;

  // clear viterbi matrix (not needed anymore) to save heap space;
  // the length will stay unchanged, but the entries at each base
  // position are deleted (this way we allow viterbiForwardAndSampling
  // to access the values so it won't cause errors; since in sampling
  // mode the values are ignored, we can assume from now on that they
  // are all 0).
  int dnalen = viterbi.size();
  viterbi.assign(dnalen, statecount); 
#ifdef DEBUG
  cerr << "Now we deleted the viterbi matrix.\n";
#endif

  for (g = genes; g != NULL; g = g->next) {
      g->apostprob = 1.0;        // add the viterbi transcripts with a very light weight
      g->setStatePostProbs(1.0); // just to ensure that they are present when sampling
      g->setSampleCount(1);
      g->hasProbs = true;
      g->throwaway = false;
      g->viterbi = true;
      alltranscripts.push_back(g);
  }

  int progress, oldprogress=0;
  if (sampleiterations > 1) {
      if (show_progress) {
	  cerr << "sampling algorithm progress:\n[%]: ";
	  oldprogress = 0;
      }
    /*
     * Sample and add the sampled genes to the list of genes
     */
    StatePath *sampledPath, *condensedsampledPath;
    for (int i=0; i < sampleiterations-1; i++) {
#ifdef DEBUG
	cerr << "Sample iteration " << i << endl;
#endif
      if (show_progress) {
	  progress = 100*i/(sampleiterations-2);
	  while (progress > oldprogress){
	      oldprogress++;
	      if (oldprogress%10<9 && oldprogress%10>0)
		  cerr << ".";
	      else if (oldprogress%10==0)
		  cerr << oldprogress;
	  }
      }
      // sample the transcripts 
      sampledPath = getSampledPath(dna, "");
      condensedsampledPath = StatePath::condenseStatePath(sampledPath);
      // condensedsampledPath->print(); // for testing
      char gr[9];
      sprintf(gr, "s%d-", (i+1));
      genes = condensedsampledPath->projectOntoGeneSequence(gr);
      delete sampledPath;
      delete condensedsampledPath;
      
      //cout << "i=" << i << endl;
      // store all sampled transcripts
      for (g = genes; g != NULL; g = g->next) {
	  //  g->printGFF();
        // initialize, all apostprobs and counts must be set to 1 
	g->apostprob = 1.0;
	g->setStatePostProbs(1.0);
	g->setSampleCount(1);
	g->hasProbs = true;
	g->viterbi = false;
	if (!alternatives_from_sampling)
	  g->throwaway = true; // no alternatives requested, throw sampled gene away later
	alltranscripts.push_back(g);
      }
    } // for i<sampleiterations
    if (show_progress)
	cerr << endl;
        
    alltranscripts.sort(ptr_comparison<Transcript>());
    // now remove multiple copies and increase apostprob instead
    for (geneit1 = alltranscripts.begin(); geneit1 != alltranscripts.end();){
	geneit2 = geneit1;
	for (geneit2++; geneit2 != alltranscripts.end() && (*geneit2)->geneBegin() == (*geneit1)->geneBegin();)
	    if (**geneit1 == **geneit2){
		(*geneit1)->throwaway &= (*geneit2)->throwaway; // throw away only if all versions should be thrown away.
		(*geneit1)->viterbi |= (*geneit2)->viterbi;
		(*geneit1)->addSampleCount(1);
		(*geneit1)->apostprob += 1.0;
		(*geneit1)->addStatePostProbs(1.0);
		delete *geneit2; // free space for duplicated transcript
		geneit2 = alltranscripts.erase(geneit2); // remove deleted pointer from the list as well
	    } else
		geneit2++;
	geneit1++;
    }
   
    /*
     * compute the posterior probabilities of the states and transcripts
     */
    for (geneit1 = alltranscripts.begin(); geneit1 != alltranscripts.end(); geneit1++){
	geneit2 = geneit1;
	geneit2++;
	for (; geneit2 != alltranscripts.end() && (*geneit2)->geneBegin() <= (*geneit1)->geneEnd(); geneit2++)
	    (*geneit1)->updatePostProb(*geneit2);
    }
    for (geneit1 = alltranscripts.begin(); geneit1 != alltranscripts.end(); geneit1++){
	(*geneit1)->normPostProb(sampleiterations); // +1 wegen Viterbipfad
    }
  }
  /*
   * filter transcripts by probabilities, strand
   */
  bool hasInFrameStop = false;
  filterGenePrediction(alltranscripts, *filteredTranscripts, dna, strand, noInFrameStop, hasInFrameStop,
                       minmeanexonintronprob, minexonintronprob);

#ifdef DEBUG
  if (hasInFrameStop)
      cerr << "have at least one gene with in-frame stop codon" << endl;
  else
      cerr << "have no gene with in-frame stop codon" << endl;
#endif
  
  // determine transcripts with maximum expected accuracy criterion
  if (Constant::MultSpeciesMode){
      // store alltranscripts in member variable of NAMGene
      setAllTranscripts(filteredTranscripts);
      agl = new list<AltGene>;
  } else if (mea_prediction){
      list<Transcript*> mt = getMEAtranscripts(*filteredTranscripts, dna);
      agl = groupTranscriptsToGenes(mt);
  } else { //filter transcripts by maximum track number 
      agl = groupTranscriptsToGenes(*filteredTranscripts);
  }
  
  if (sampleiterations>1) {
    /*
     * compute apostprob of genes. The posterior 'probability' of a gene is the 
     * expected number of transcripts overlaping the region of the gene on 
     * the given strand. This may now be larger than 1 (changed March 15, Mario).
     */
    for (agit = agl->begin(); agit != agl->end(); ++agit) {
	for (list<Transcript*>::iterator tit = alltranscripts.begin(); tit != alltranscripts.end() && (*tit)->geneBegin() <= agit->maxcodend; ++tit){
	    if (agit->overlaps(*tit))
		agit->apostprob += (*tit)->apostprob;
	}
    }
  } else { // no sampling, set hasProbs to false
    for (agit = agl->begin(); agit != agl->end(); ++agit) {
      agit->hasProbs = false;
      for (list<Transcript*>::iterator git = agit->transcripts.begin(); git != agit->transcripts.end(); ++git){
	(*git)->hasProbs = false;
	(*git)->setStateHasScore(false);
      }
    }
  }

  delete condensedViterbiPath;
 
#ifdef DEBUG
  cerr << "At the end of findGenes: average load is " << viterbi.load() << ",\n"
       << "                                used are " << viterbi.used_load() << " of " << statecount << " states.\n";
  cerr << "                Forward: average load is " << forward.load() << ",\n"
       << "                                used are " << forward.used_load() << " of " << statecount << " states.\n";
#endif
  return agl;
  // TODO: the *filteredTranscripts has to be deleted eventually after cmpgenepred has called .getAllTranscripts()
}


/* 
 * NAMGene::getNextCutEndPoint 
 * Find a suitable point to cut dna into pieces for separate analysis. 
 * Cutting point should be at most maxstep from dna + beginPos, as large as possible  
 * but also likely to be in the intergenic region (synch state). 
 */ 
int NAMGene::getNextCutEndPoint(const char *dna, int beginPos, int maxstep, SequenceFeatureCollection& sfc){
  int restlen = strlen(dna+beginPos);
  char *curdna; 
  int cutendpoint=0;
  int examChunkSize=50000;
  int examIntervalStart, examIntervalEnd;
  SequenceFeatureCollection *partSFC;
  list<Feature> *groupGaps;
  list<Feature>::iterator lastirit;
  
  if (examChunkSize < 0.2*maxstep)
    examChunkSize = (int) (0.2*maxstep);
  if (examChunkSize > 150000)
    examChunkSize = 150000;

  if (restlen <= maxstep){
    return beginPos + restlen-1;
  } else {
      /*
       * try to find a good breakpoint
       * 1st try: window of size examChunkSize around last groupgap in range or at end of range if groupgap does not exist.
       *          If have two genes with intergenic region in groupgap: take center of largest such ir.    
       * 2nd try: Double the size of the window. Try second to last groupgap.
       *          If have two genes with intergenic region in groupgap: take center of largest such ir.
       *          Take internal predicted intergenic region with largest overlap to a groupgap if exists.
       *          Take the larger predicted marginal intergenic region in groupgap.
       *          Give up: Take right boundary of range.
       */
      
      groupGaps = new list<Feature>;
      if (sfc.groupGaps) {
	  for (list<Feature>::iterator it = sfc.groupGaps->begin(); it != sfc.groupGaps->end() && it->start <= beginPos+maxstep; ++it)
	      if (it->end > beginPos && it->start <= beginPos+maxstep)
		  groupGaps->push_back(*it);
      } else {
	  groupGaps->push_back(Feature(1, strlen(dna)-1, irpartF, bothstrands, -1, "groups"));
      }

      int centerLastGap;
      if (!groupGaps->empty()) {
	  lastirit = groupGaps->begin();
	  while (lastirit != groupGaps->end() && lastirit->start < beginPos + maxstep)
	      ++lastirit;
	  if (lastirit != groupGaps->begin())
	      --lastirit;
	  if (lastirit->end - lastirit->start < examChunkSize)
	      centerLastGap = (lastirit->end + lastirit->start) / 2;
	  else 
	      centerLastGap = lastirit->end - examChunkSize/2; // for huge gaps, take the right subwindow of size examChunkSize
      } else {
	  centerLastGap = beginPos + maxstep - 1;
      }
      if (examChunkSize > maxstep) {
	  examIntervalStart = beginPos;
	  examIntervalEnd = beginPos + maxstep - 1;
      } else {
	  examIntervalStart = centerLastGap - examChunkSize/2;
	  examIntervalEnd   = centerLastGap + examChunkSize/2;
      
	  if (examIntervalEnd >= beginPos + maxstep){ // too far right: shift left
	      examIntervalStart -= (examIntervalEnd - (beginPos + maxstep-1));
	      examIntervalEnd   = beginPos + maxstep - 1;
	  }
	  if (examIntervalStart < beginPos) {// too far left: shift right
	      examIntervalEnd += beginPos - examIntervalStart;
	      examIntervalStart = beginPos;
	  }
      }

      /*
       * 1st try: window of size examChunkSize around last groupgap in range or at end of range if groupgap does not exist.
       *          If have two genes with intergenic region in groupgap: take center of largest such ir.
       */
      StatePath *viterbiPath; 
      StatePath *condensedViterbiPath;

      curdna = newstrcpy(dna + examIntervalStart, examIntervalEnd-examIntervalStart+1);
      partSFC = new SequenceFeatureCollection(sfc, examIntervalStart, examIntervalEnd + 10000);
      partSFC->setSeqLen(examIntervalEnd-examIntervalStart+1);
      partSFC->computeHintedSites(curdna);
      partSFC->prepareLocalMalus(curdna);
      StateModel::setSFC(partSFC);
      viterbiAndForward(curdna);      // do not use protein profile here
      viterbiPath = getViterbiPath(curdna, "temp");
      delete partSFC; //achtung, das muß nach getViterbiPath stehen
      delete [] curdna;
      condensedViterbiPath = StatePath::condenseStatePath(viterbiPath);
      //cout << "getNextCutEndPoint viterbi path:" << endl;
      //condensedViterbiPath->print();
      cutendpoint = tryFindCutEndPoint(condensedViterbiPath, examIntervalStart, examIntervalEnd, groupGaps, true);
      delete viterbiPath;
      delete condensedViterbiPath;
      if (cutendpoint != -1) { // found cutendpoint in first try
	  //cout << "# found cutendpoint in first try : " << cutendpoint << endl;
      } else {
	  /* 
	   * 2nd try: Double the size of the window. Try second to last groupgap.
	   *          Take internal predicted intergenic region with largest overlap to a groupgap if exists.
	   *          Take the larger predicted marginal intergenic region in groupgap.
	   *          Take the largest intergenic region (don't care about groupgaps).
	   *          Give up: Take right boundary of range.
	   */
	  examChunkSize *=2;
	  if (examChunkSize > maxstep)
	      examChunkSize = maxstep;
	  if (!groupGaps->empty()) {
	      // take the last but one group gap if existent and if the last group gap is not larger than the previous exam interval
	      if (lastirit->start >= examIntervalStart && lastirit != groupGaps->begin())
		  lastirit--;

	      if (lastirit->end - lastirit->start < examChunkSize)
		  centerLastGap = (lastirit->end + lastirit->start) / 2;
	      else 
		  centerLastGap = lastirit->end - examChunkSize/2; // for huge gaps, take the right subwindow of size examChunkSize
	  } else {
	      centerLastGap = beginPos + maxstep - 1;
	  }
	  examIntervalStart = centerLastGap - examChunkSize/2;
	  examIntervalEnd   = centerLastGap + examChunkSize/2;
	  
	  if (examIntervalEnd >= beginPos + maxstep){ // too far right: shift left
	      examIntervalStart -= (examIntervalEnd - (beginPos + maxstep-1));
	      examIntervalEnd   = beginPos + maxstep - 1;
	  }
	  if (examIntervalStart < beginPos){ // too far left: shift right
	      examIntervalEnd += beginPos - examIntervalStart;
	      examIntervalStart = beginPos;
	  }

	  curdna = newstrcpy(dna + examIntervalStart, examIntervalEnd-examIntervalStart+1);
	  partSFC = new SequenceFeatureCollection(sfc, examIntervalStart, examIntervalEnd + 10000);
	  partSFC->setSeqLen(examIntervalEnd-examIntervalStart+1);
	  partSFC->computeHintedSites(curdna);
	  partSFC->prepareLocalMalus(curdna);
	  StateModel::setSFC(partSFC);
	  viterbiAndForward(curdna); // do not use protein profile here
	  viterbiPath = getViterbiPath(curdna, "temp");
	  delete partSFC; // note: this has to be called AFTER getViterbiPath
	  delete [] curdna;
	  condensedViterbiPath = StatePath::condenseStatePath(viterbiPath);
	  cutendpoint = tryFindCutEndPoint(condensedViterbiPath, examIntervalStart, examIntervalEnd, groupGaps, true);
	  if (cutendpoint == -1) {
	      cutendpoint = tryFindCutEndPoint(condensedViterbiPath, examIntervalStart, examIntervalEnd, groupGaps, false);
	      if (cutendpoint == -1) {
		  cutendpoint = tryFindCutEndPoint(condensedViterbiPath, examIntervalStart, examIntervalEnd, NULL, false);
		  if (cutendpoint == -1) {
		      cutendpoint = beginPos + maxstep - 1;
		  }
	      }
	  }
	  delete viterbiPath; 
	  delete condensedViterbiPath; 
      }
      
      delete groupGaps;
    
  } 
  if (cutendpoint <= beginPos + 0.05 * maxstep || cutendpoint <= beginPos + 5000) // move by at least 5% and by at least 5000bp
      cutendpoint = beginPos + maxstep - 1;
  return cutendpoint; 
} 

/*
 * NAMGene::tryFindCutEndPoint
 * 
 * returns the center of the largest intersection of a predicted intergenic region with a groupgap
 * only considers internal intergenic regions if onlyInternalIR=true, except if a marginal IR is at least half the size of the range 
 * If groupGaps is NULL, don't intersect with groupgaps. Simply find the largest IR.
 * return -1. if no such IR found.
 * irsize is the size of the likely intergenic region
 */ 

int NAMGene::tryFindCutEndPoint(StatePath *condensedExamPath, int examIntervalStart, int examIntervalEnd, list<Feature> *groupGaps, bool onlyInternalIR){
    // determine the internal intergenic region that has largest overlap with a groupGap (if any)
    int irbegin, irend;
    int maxirbegin = -1, maxirend = -1;
    State *st = condensedExamPath->first;
    bool groupGapsNULL = (groupGaps == NULL);
    if (groupGapsNULL){
	groupGaps = new list<Feature>;
	groupGaps->push_back(Feature(0, INT_MAX, irpartF, bothstrands, -1, "groups"));
    }
    if (st) {
	while (st) {
	    if(st->type == igenic){ // interior intergenic region
		irbegin = examIntervalStart + st->begin;
		irend = examIntervalStart + st->end;
		// intersect intergenic region with groupgaps
		int lgbegin=-1, lgend=-1;
		for (list<Feature>::iterator it = groupGaps->begin(); it != groupGaps->end(); ++it){
		    if (it->start<irbegin && it->end<=irend && it->end>=irbegin && it->end-irbegin> lgend-lgbegin) { 
			// predicted ir          -----------------
			// groupgap          ------------
			// intersection          --------
			lgbegin = irbegin;
			lgend = it->end;
		    } else if (it->start < irbegin && it->end>irend && irend-irbegin > lgend-lgbegin) {
			// predicted ir          -----------------
			// groupgap          -----------------------
			// intersection          -----------------
			lgbegin = irbegin;
			lgend = irend;
		    } else if (it->start > irbegin && it->end < irend && it->end-it->start > lgend-lgbegin) {
			// predicted ir          -----------------
			// groupgap                 ---------       
			// intersection             ---------     
			lgbegin = it->start;
			lgend = it->end;
		    } else if (it->start >= irbegin && it->start <= irend && it->end >= irend && irend-it->start > lgend-lgbegin) {
			// predicted ir          -----------------
			// groupgap                           ---------       
			// intersection                       ----
			lgbegin = it->start;
			lgend = irend;
		    } 
		}
		
		if (lgend-lgbegin > maxirend-maxirbegin &&
		    ((st != condensedExamPath->first && st->next != NULL) || // internal intergenic region
		     !onlyInternalIR || // allow marginal ir when onlyInternalIR is false
		     // or when the size is at least half the size of the examined range
		     lgend - lgbegin > (examIntervalEnd - examIntervalStart)/2)){
		    maxirbegin = lgbegin;
		    maxirend = lgend;
		}
	    }
	    st = st->next;
	} 
    }
    if (groupGapsNULL)
	delete groupGaps;

    if (maxirend-maxirbegin>0) {
	return (maxirend + maxirbegin)/2;
    } else {
	return -1;
    }
}

/*
 * NAMGene::getTrainViterbiPath
 * 
 * Does the whole job of finding the most likely path for training.
 * No splitting into chunks. No alternatives. Not using protein profile.
 */
StatePath* NAMGene::getTrainViterbiPath(const char *dna, SequenceFeatureCollection *sfc){
    bool crf = inCRFTraining;
    inCRFTraining = false; // this way Viterbi will be much more efficient because of use of static variables
    sfc->computeHintedSites(dna);
    StateModel::setSFC(sfc);
        viterbiAndForward(dna); // do not use protein profile here
    inCRFTraining = crf;
    return getViterbiPath(dna, "");
}


/*
 * NAMGene::getEmissionProbability()
 * 
 * returns the probability of the emission in the last run of the forward algorithm
 */ 

Double NAMGene::getEmissionProbability(){
    if (!needForwardTable)
	return 0.0;
    Double alpha = 0.0;
    const ViterbiColumnType& lastCol = forward[forward.size()-1];
    for (int i=0; i < statecount; i++) {
	alpha += lastCol[i] * termProbs[i];
    }
    return alpha;
}

/*
 * NAMGene::getPathEmiProb
 * compute the common probability of the path and the emission
 *
 * Only count the emissionprobabilities in the interval [countStart, coundEnd]
 * | state1 | | state3 | | | | |        state8                   | | | | state 12   | | |
 *                                        |                                           |  
 *                                      countStart                                countEnd
 *                                        <--      partProb   --->
 *                             <---        emi                --->
 */

Double NAMGene::getPathEmiProb(StatePath *path, const char *dna, SequenceFeatureCollection& sfc, int countStart, int countEnd){
    State *curstate, *laststate = NULL;
    Double pathemi = 1.0, emi;
    int stateidx;
    StateModel::setSFC(&sfc);
    if (!cs.dna || strcmp(dna,cs.dna))
      cs.computeStairs(dna); //(re)initialize GC content classes
    curGCIdx = -1; // initialize with invalid GC content class
    if (countEnd == -1) { // count all emission probabilities
	countStart = -100;
	countEnd = strlen( dna ) + 100;
    }
    for (curstate = path->first; curstate != NULL; curstate = curstate->next){
	stateidx = getStateIndex(curstate->type);
	int pos = curstate->end;
	if (pos > cs.n-1)
	    pos = cs.n-1; // some states, e.g. exons, may extend past the sequence end
	if (pos < 0)
	    pos = 0; // can happen with a start codon at the very beginning of the sequence
	if (cs.idx[pos] != curGCIdx) {
          curGCIdx = cs.idx[pos];
	  updateToLocalGCEach(curGCIdx, 2, 1); // update GC content dependent parameters
	}
	emi = states[stateidx]->emiProbUnderModel(curstate->begin, curstate->end);
	curstate->prob = emi;
	if (curstate->begin >= countStart && curstate->end <= countEnd){ // fragment lies completely in count region
	    pathemi *= emi;
	    //cout << "emi " << curstate->begin << " " <<  curstate->end << " " << emi << endl;
	} else if (!(curstate->end < countStart || curstate->begin > countEnd)){
	    // nontrivial overlap, only count a fraction corresponding to the overlap
	    int ovlp = ((curstate->end > countEnd)? countEnd : curstate->end) -
		((curstate->begin > countStart)? curstate->begin : countStart) + 1;
	    Double partProb = exp((emi.log() * ovlp / (curstate->end - curstate->begin + 1)));
	    pathemi *= partProb;
	} // otherwise ignore this emission probability
	
	if (laststate && curstate->begin >= countStart && curstate->end <= countEnd) {
	    // multiply also by the transition probability of the transition "laststate->curstate"
	    pathemi *= transitions[getStateIndex(laststate->type)][stateidx];
	} else { 
	    // first state: multiply also by the initial probability
	    /*
	     * HACK: if the first state is an exon with a negative start position use a transition from
	     * igenic region (=0) to this exon also.
	     */
	    if (initProbs[stateidx] > 0.0 && curstate->begin >= countStart && curstate->end <= countEnd)
		pathemi *= initProbs[stateidx];
	    //else // assume viterbi was correct, not correct, TODO 
	    //pathemi *= initProbs[0] * transitions[0][curstate->type]; //igenic region -> curstate
	}
	laststate = curstate;
    }
    // multiply with terminal probability
    if (laststate->begin >= countStart && laststate->end <= countEnd){
	pathemi *= termProbs[getStateIndex(laststate->type)];
    }
    path->pathemiProb = pathemi;
    return pathemi;
}

void NAMGene::readTransAndInitProbs( ){
  ifstream istrm;
  string speciesValue = Properties::getProperty(SPECIES_KEY);
  if (!inCRFTraining)
      cout << "# " << speciesValue << " version.";
  try {
      // look if there is a species specific transition file and take it
      string fname = Constant::fullSpeciesPath() + speciesValue + "_" + Properties::getProperty(TRANSFILE_KEY);
      istrm.open(fname.c_str());
      if (istrm.peek() == EOF) // file doesn't exist
	  throw ProjectError(fname + " doesn't exist");
      if (!inCRFTraining)
	  cout << " Using species specific transition matrix: " << fname;
  } catch (...){
      istrm.clear();
      // use default transition parameter file
      string fname = Constant::modelPath() + Properties::getProperty(TRANSFILE_KEY);
      istrm.open(fname.c_str());
      if (!inCRFTraining)
	  cout << " Using default transition matrix.";
  }
  cout << endl;

  if( istrm ){
    int i, j, k;
    int count;
    int numStartPossible=0, numEndPossible=0;
    transitions.assign(statecount, statecount);
    initProbs.assign(statecount, 0.0);
    termProbs.assign(statecount, 0.0);
    stateReachable.assign(statecount, true);

    /* First determine the number of states specified in the file.
     * Throw an exception if this value isn't equal to statecount from the 
     * properties.
     */
    istrm >> comment >> count;
    if( count != statecount ){
      istrm.close();
      throw NAMGeneError( "Incorrect state count in transition file!!!" );
    }

    /*
     * read in the initial probabilities
     * THIS IS A HACK. Put this into a configuration file.
     */
    istrm >> goto_line_after( "[Initial]" ) >> comment >> numStartPossible;
    for (k=0; k<numStartPossible; k++ ) {
      istrm >> comment >> i;
      istrm >> initProbs[i]; 
    }
    istrm >> goto_line_after( "[Terminal]" ) >> comment >> numEndPossible;
    for (k=0; k < numEndPossible; k++ ) {
      istrm >> comment >> i;
      istrm >> termProbs[i];
    }
	
    /*
     *  read in the transition probabilities
     */
    istrm >> goto_line_after( "[Transition]" );
    while( istrm ){
      istrm >> comment >> i >> j ;
      if (i >= statecount || j >= statecount){
	cerr << "transition:" << i << "->" << j << endl;
	throw NAMGeneError( "State number in transition file too large!");
      }
      istrm >> transitions[i][j];
    }

    istrm.close();
  } else {
    throw NAMGeneError( "Could't open the file with transition probabilities." );
  }
}

void NAMGene::readOvlpLenDist( ){
  if (!Constant::overlapmode)
    return;
  // this function is only relevant for prokaryotes (--genemodel=bacterium)
  ifstream istrm;
  string speciesValue = Properties::getProperty(SPECIES_KEY);
  try {
      // look if there is a species specific ovlp_len.pbl file and take it
      string fname = Constant::fullSpeciesPath() + speciesValue + "_" + OVLPLENFILE;
      istrm.open(fname.c_str());
      if (istrm.peek() == EOF) // file doesn't exist
	  throw ProjectError(fname + " doesn't exist");
      cout << "# Using species specific overlap length distribution: " << fname << endl;
  } catch (...){
      istrm.clear();
      // use default overlap length distribution file
      string fname = Constant::modelPath() + OVLPLENFILE;
      istrm.open(fname.c_str());
      cout << "# Using default overlap length distribution file." << endl;
  }
  
  if (Constant::maxOvlp >= Constant::min_coding_len){
      cerr << string("WARNING: maxOvlp(") +  itoa(Constant::maxOvlp) +
	  string(") is not smaller than /Constant/min_coding_len(")
	  + itoa(Constant::min_coding_len) + ").\n" +
	  "This may result in nested genes, either properly nested, or several genes with the same stop codon.\n" + 
	  "You may want to change this by decreasing maxOvlp or increasing /Constant/min_coding_len." << endl;
  }
  if (istrm) {
    Constant::head2tail_ovlp.assign(Constant::maxOvlp+1, 0.0);
    istrm >>  goto_line_after( "[HEAD2TAIL]" );
    istrm >> comment;
    for (int i=0; i <= Constant::maxOvlp; i++)
      istrm >> Constant::head2tail_ovlp[i];
    Constant::head2head_ovlp.assign(Constant::maxOvlp+1, 0.0);
    istrm >>  goto_line_after( "[HEAD2HEAD]" );
    istrm >> comment;
    for (int i=0; i <= Constant::maxOvlp; i++)
      istrm >> Constant::head2head_ovlp[i];
    Constant::tail2tail_ovlp.assign(Constant::maxOvlp+1, 0.0);
    istrm >>  goto_line_after( "[TAIL2TAIL]" );
    istrm >> comment;
    for (int i=0; i <= Constant::maxOvlp; i++)
      istrm >> Constant::tail2tail_ovlp[i];
    istrm.close();
  } else {
    throw NAMGeneError( "Could't open the file with the overlap length distribution." );
  }
}

#define EPSILON_CONSISTENCY 0.00000001
void NAMGene::checkProbsConsistency( ){
  Double sum;
  int i,j;
    
  /*
   * check whether initProbs really is a vector of probabilities.
   */
  sum=0;
  for (i=0; i< statecount; i++) {
    sum += initProbs[i];
  }
#ifdef DEBUG
  if (sum > Double(1.0) + EPSILON_CONSISTENCY || sum < Double(1.0) - EPSILON_CONSISTENCY)
    cerr << "Warning: Sum of all initial probabilities is 1 + delta, delta=" << setprecision(7) << sum-1.0 << ". Not equal to 1." << endl;
#endif
  
  /*
   * check whether termProbs really is a vector of probabilities.
   */
  sum=0;
  for (i=0; i< statecount; i++) {
    sum += termProbs[i];
  }
#ifdef DEBUG
  if (sum > Double(1.0) + EPSILON_CONSISTENCY || sum < Double(1.0) - EPSILON_CONSISTENCY)
    cerr << "Warning: Sum of all terminal probabilities is 1 + delta, delta=" << setprecision(7) << sum-1.0 << ". Not equal to 1." << endl;
#endif

  /*
   * check whether transitions really is a vector of probabilities.
   */
  for (i=0; i<statecount; i++) {  
    sum=0;
    for (j=0; j<statecount; j++) {
      sum += transitions[i][j];
      if (transitions[i][j]<0)
	  cerr << "Error: transition probability from state " << i << " to " << j << " is negative:" << transitions[i][j] << endl;
    }
#ifdef DEBUG
    if (sum > Double(1.0) + EPSILON_CONSISTENCY || sum < Double(1.0) - EPSILON_CONSISTENCY) {
	cerr << "Warning: Sum of transition probabilities out of state "<< i << " is 1 + delta, delta=" << setprecision(7) << sum-1.0
	     << ". Not equal to 1." << endl;
	cerr << sum << "\n";
	cerr << "Transitions are: " << transitions[i][0];
	for (j=1; j<statecount; j++) 
	    cerr << ", " << transitions[i][j];
	cerr << "\n";
    }
#endif
  }
  for (j=0; j<statecount; j++) {
    sum=0;
    for (i=0; i<statecount; i++) {
      sum += transitions[i][j];
    }
#ifdef DEBUG
    if (sum < EPSILON_CONSISTENCY) {
      cerr << "Warning: Sum of all transition probabilities into state "<< j << " is 0." << endl;
    }
#endif
  }
}

void NAMGene::computeReachableStates( ){
    bool expanded = true;
    bool allReachable;
    for (int i=0; i < statecount; i++)
	stateReachable[i] = (initProbs[i] > 0.0);

    while (expanded) {
	expanded = false;
	for (int i=0; i<statecount; i++) {
	    for (int j=0; j<statecount; j++) {
		if (stateReachable[i] && !stateReachable[j] && transitions[i][j] > 0.0) {
		    stateReachable[j] = expanded = true;
		}
	    }
	}
    }
    allReachable = true;
    for (int i=0; i < statecount; i++)
	allReachable &= stateReachable[i];
#ifdef DEBUG
    if (!allReachable) {
	cerr << "The following states are not reachable: ";
	for (int i=0; i < statecount; i++)
	    if (!stateReachable[i])
		cerr << i << ", ";
	cerr << endl << "If this is not on purpose you should check the transition matrix and the initial probabilities." << endl;
    }
#endif
}

void NAMGene::createStateModels( ){
    states.resize( statecount );
    StateModel::resetModelCounts();
    
    for( int i = 0; i < statecount; ++i ){
	/*
	 * Determine the names of the states and create the state objects
	 */ 
	const char* statename = Properties::getProperty("/NAMGene/state", i);
	states[i] = StateModel::newStateModelPtr(statename);
    }
}

void NAMGene::initAlgorithms(){
  StateModel::resetPars();
  for ( int i = 0; i < statecount; ++i ){
      states[i]->initAlgorithms(transitions, i);
  }
  /*
   * give the states the possibly implicitly changed transition matrix above 
   * in order to contruct the lists of predecessors
   *
   */
  for( int i = 0; i < statecount; i++ )
    states[i]->initPredecessors(transitions, i);
}

void NAMGene::updateToLocalGCEach(int idx, int from, int to){
    StateModel::updateToLocalGC(idx, from, to); 
    for ( int i = 0; i < statecount; ++i )
	states[i]->updateToLocalGCEach(transitions, i);
    /*
     * give the states the possibly implicitly changed transition matrix above 
     * in order to contruct the lists of predecessors
     *
     */
    for( int i = 0; i < statecount; i++ )
	states[i]->initPredecessors(transitions, i);

}

void NAMGene::prepareModels(const char* dna, int dnalen) {
    StateModel::prepareViterbi(dna, dnalen, stateMap);
#ifdef DEBUG
    PP::ExonScorer::transitions = &transitions;
#endif
    PP::SubstateModel::setStateStrands(stateMap);
}

/*
 * NAMGene::setPathAndProb
 * compute the emission probability of the path induced by the annotated genes
 * for scoreTx option to augustus
 */

void NAMGene::setPathAndProb(AnnoSequence *annoseq, FeatureCollection &extrinsicFeatures){
    while (annoseq){
	annoseq->anno->path = StatePath::getInducedStatePath(annoseq->anno->genes, annoseq->length);
	SequenceFeatureCollection &sfc = extrinsicFeatures.getSequenceFeatureCollection(annoseq->seqname);
	sfc.prepare(annoseq, false);
	sfc.computeHintedSites(annoseq->sequence);
	sfc.prepareLocalMalus(annoseq->sequence);
	prepareModels(annoseq->sequence, annoseq->length);
	annoseq->anno->emiProb = getPathEmiProb(&(*(annoseq->anno->path)), &(*(annoseq->sequence)), sfc);
	annoseq = annoseq->next;
    }
}

void NAMGene::printDPMatrix(){
    cout << "----- Viterbi matrix: -----" << endl;
    for (int t=0; t <viterbi.size(); t++){
	cout << t;
	for (int s = 0; s < statecount; s++)
	    cout << "\t" << stateTypeIdentifiers[stateMap[s]] << ": " << viterbi[t][s];
	cout << endl;
    }
}