src/evaluation.cc - augustus (debian/3.4.0+dfsg2-1)

Tree @debian/3.4.0+dfsg2-1 (Download .tar.gz)

evaluation.cc @debian/3.4.0+dfsg2-1 — raw · history · blame

/*
 * evaluation.cc
 *
 * License: Artistic License, see file LICENSE.TXT or 
 *          https://opensource.org/licenses/artistic-license-1.0
 */

#include "evaluation.hh"

// project includes
#include "gene.hh"
#include "namgene.hh"

// standard C/C++ includes
#include <iomanip>  // for setw, setprecision


void Evaluation::addToEvaluation(Transcript *predictedGenes, Transcript *annotatedGenes, Strand strand, Double quotient){
    // find flanking regions on both sides (any strand)
    leftFlankEnd=-1, rightFlankBegin=-1;
    for (Transcript* a = annotatedGenes; a != NULL; a = a->next){
	if ((leftFlankEnd == -1 && a->geneBegin() > 0) || (leftFlankEnd >=0 && a->geneBegin() - 1 < leftFlankEnd))
	    leftFlankEnd = a->geneBegin() - 1;
	if (rightFlankBegin == -1 || a->geneEnd() + 1 > rightFlankBegin)
	    rightFlankBegin = a->geneEnd() + 1;
    }
    if (rightFlankBegin == -1)
	for (Transcript* p = predictedGenes; p != NULL; p = p->next)
	    if (p->geneEnd() + 1 > rightFlankBegin)
		rightFlankBegin = p->geneEnd() + 1;

    int oldgeneFN = geneFN;
    Transcript *predFW, *predBW, *annoFW, *annoBW;
    
    predFW = Transcript::getGenesOnStrand(predictedGenes, plusstrand);
    predBW = Transcript::getGenesOnStrand(predictedGenes, minusstrand);
    annoFW = Transcript::getGenesOnStrand(annotatedGenes, plusstrand);
    annoBW = Transcript::getGenesOnStrand(annotatedGenes, minusstrand);  
    if (strand == bothstrands || strand == plusstrand) {
	addToEvaluation(predFW, annoFW);
    }
    if (strand == bothstrands || strand == minusstrand) {
	addToEvaluation(predBW, annoBW);
    }

    // temporarily for analysis
    if (geneFN > oldgeneFN) {
	string seqname = annotatedGenes ? annotatedGenes->seqname : "";
	//if (seqname)
	// cerr << "# at least 1 gene not correctly predicted in sequence " << seqname << endl;
    }

    /*
     * add the quotient to the vector of quotients
     */
    if (!(quotient < 0.0))
	quotients.push_front(quotient);

    // delete the gene sequences of the strands
    Transcript::destroyGeneSequence(predFW);
    Transcript::destroyGeneSequence(predBW);
    Transcript::destroyGeneSequence(annoFW);
    Transcript::destroyGeneSequence(annoBW);
}


/*
 * add one dataset to the evaluation
 */
void Evaluation::addToEvaluation(Transcript* predictedGeneList, Transcript* annotatedGeneList){
#ifdef DEBUG
    // for analysis only
    if (!predictedGeneList && annotatedGeneList)
	cerr << "In strand containing " << annotatedGeneList->id << " nothing was found." << endl;
    // end for analysis only
    for (Transcript *pg = predictedGeneList; pg != NULL; pg = pg->next)
	for (State *intron = pg->introns; intron != NULL; intron = intron->next)
	    if (intron->length()>longestPredIntronLen)
		longestPredIntronLen = intron->length();
#endif

    /*
     * Initialize variables
     */
     nukTPBK = nukFPBK = nukFPBKinside = nukFNBK = 0;
     exonTPBK = exonFPBK = exonFNBK = 0;

    /*
     * First make two lists just of all exons
     * Then let them contain only the unique exons.
     */
    list<State> *predictedExons=new list<State>, *annotatedExons=new list<State>;
    State *st;
    const Transcript *g;

    g = predictedGeneList;
    while (g) {
	st = g->exons;
	while (st) {
	    predictedExons->push_back(*st);
	    st = st->next;
	}
	g = g->next;
    }
 
    g = annotatedGeneList;
    while (g) {
	st = g->exons;
	while (st) {
	    annotatedExons->push_back(*st);
	    st = st->next;
	}
 	g = g->next;
    }   
    numAnnoExons += annotatedExons->size();
    numPredExons += predictedExons->size();
    /*
     * make the list of unique
     */
    
    predictedExons->sort();
    annotatedExons->sort();
    predictedExons->unique();
    annotatedExons->unique();

    numUniqueAnnoExons += annotatedExons->size();
    numUniquePredExons += predictedExons->size();


    // evaluateQuickOnNucleotideLevel(predictedExons.first, -1, annotatedExons.first, -1);
    evaluateOnNucleotideLevel(predictedExons, annotatedExons); // coding bases only
    evaluateOnExonLevel(predictedExons, annotatedExons);
    evaluateOnGeneLevel(predictedGeneList, annotatedGeneList);
    evaluateOnUTRLevel(predictedGeneList, annotatedGeneList);

    /*
     * update the derived values
     */ 
    // nucleotide level
    nukSens = (double) nukTP/(nukTP + nukFN);
    nukSpec = (double) nukTP/(nukTP + nukFP);

    // exon level
    exonFP = exonFP_wrong + exonFP_partial + exonFP_overlapping;
    exonFN = exonFN_wrong + exonFN_partial + exonFN_overlapping;
    exonSens = (double) exonTP/(exonTP+exonFN);
    exonSpec = (double) exonTP/(exonTP+exonFP);

    // gene level
    geneFP = numPredGenes - geneTP;
    geneSens = (double) geneTP / (geneTP + geneFN);
    geneSpec = (double) geneTP / (geneTP + geneFP);

    // UTR level see print();
    UTRexonSens = (double) UTRexonTP/(UTRexonTP+UTRexonFN);
    UTRexonSpec = (double) UTRexonTP/(UTRexonTP+UTRexonFP);
    nucUSens = (double) nucUTP/(nucUTP + nucUFN);
    nucUSpec = (double) nucUTP/(nucUTP + nucUFP);

    delete predictedExons;
    delete annotatedExons;
    numDataSets++;
}

void Evaluation::evaluateQuickOnNucleotideLevel(State* const predictedExon, int curPredBegin, 
						State* const annotatedExon, int curAnnoBegin){
    if (curPredBegin < 0 && predictedExon) {
	curPredBegin = predictedExon->begin;
    }
    if (curAnnoBegin < 0 && annotatedExon) {
	curAnnoBegin = annotatedExon->begin;
    }
    if (predictedExon == NULL && annotatedExon == NULL) {
	return;
    } else if (annotatedExon == NULL) {
	// only predicted Exons
	nukFP   += predictedExon->end - curPredBegin + 1;
	nukFPBK += predictedExon->end - curPredBegin + 1;
	evaluateQuickOnNucleotideLevel(predictedExon->next, -1, annotatedExon, curAnnoBegin);
    } else if (predictedExon == NULL) {
	nukFN   += annotatedExon->end - curAnnoBegin + 1;
	nukFNBK += annotatedExon->end - curAnnoBegin + 1;
	evaluateQuickOnNucleotideLevel(predictedExon, curPredBegin, annotatedExon->next, -1);
    } else {
	// both a predicted and an annotated exon
	if (curPredBegin < curAnnoBegin) {
	    // predicted exon begins first
	    if (predictedExon->end < annotatedExon->begin) {
		// predicted exon completely before annotated exon
		nukFP   += predictedExon->end - curPredBegin + 1;
		nukFPBK += predictedExon->end - curPredBegin + 1;
		evaluateQuickOnNucleotideLevel(predictedExon->next, -1, annotatedExon, curAnnoBegin);
	    } else {
		// part of predicted exon before first annotated exon
		nukFP   += curAnnoBegin - curPredBegin;
		nukFPBK += curAnnoBegin - curPredBegin;
		evaluateQuickOnNucleotideLevel(predictedExon, curAnnoBegin, annotatedExon, curAnnoBegin);
	    }
	} else if (curAnnoBegin < curPredBegin){
	    // annotated exon begins first
	     if (annotatedExon->end < predictedExon->begin) {
		// annotated exon completely before predicted exon
		nukFN   += annotatedExon->end - curAnnoBegin + 1;
		nukFNBK += annotatedExon->end - curAnnoBegin + 1;
		evaluateQuickOnNucleotideLevel(predictedExon, curPredBegin, annotatedExon->next, -1);
	    } else {
		// part of annotated exon before first predicted exon
		nukFN   += curPredBegin - curAnnoBegin;
		nukFNBK += curPredBegin - curAnnoBegin;
		evaluateQuickOnNucleotideLevel(predictedExon, curPredBegin, annotatedExon, curPredBegin);
	    }
	} else {
	    // annotated and predicted exon begin at the same position
	    if (predictedExon->end < annotatedExon->end) {
		// predicted exon shorter
		nukTP   += predictedExon->end - curPredBegin + 1;
		nukTPBK += predictedExon->end - curPredBegin + 1;
		evaluateQuickOnNucleotideLevel(predictedExon->next, -1, annotatedExon, predictedExon->end + 1); 
	    } else if (annotatedExon->end < predictedExon->end) {
		// annotated exon shorter	
		nukTP   += annotatedExon->end - curAnnoBegin +1;
		nukTPBK += annotatedExon->end - curAnnoBegin +1;
		evaluateQuickOnNucleotideLevel(predictedExon, annotatedExon->end +1, annotatedExon->next, - 1);
	    } else {
		// predicted and annotated exon have the same end and beginning :-)
		nukTP   += predictedExon->end - curPredBegin + 1;
		nukTPBK += predictedExon->end - curPredBegin + 1;
		evaluateQuickOnNucleotideLevel(predictedExon->next, -1, annotatedExon->next, - 1);
	    }
	}
    }
}

void Evaluation::evaluateOnNucleotideLevel(list<State> *predictedExon, list<State> *annotatedExon, bool UTR){
    
  list<State>::iterator pred, anno;
  int n=0;
  char *nuc;
  // find the last state in each list
  for (anno = annotatedExon->begin(); anno != annotatedExon->end(); anno++){
      if (anno->end > n)
	  n = anno->end;
  }
  for (pred = predictedExon->begin(); pred != predictedExon->end(); pred++)
    if (pred->end > n)
      n = pred->end;

  /*
   * classify each nucleotide into these four cases
   * first bit (1): annotated?
   * second bit (2): predicted?
   * third bit (4): predicted and in flanking region?
   */
   try {
    nuc = new char[n+1];
  } catch (...) {
    throw ProjectError("Not enough memory for evaluation on base level.");
    }

   for (int i=0; i<=n; i++)
     nuc[i] = 0;
  
    /*
     * Loop over the predicted exons and find the predicted coding nucs 
     */
  for (pred = predictedExon->begin(); pred != predictedExon->end(); pred++) {
      for (int i = (pred->begin >=0)? pred->begin : 0; i <= pred->end; i++){
	  nuc[i] |= 2;
	  if (i > leftFlankEnd && i < rightFlankBegin)
	      nuc[i] |= 4;
      }
  }
  /*
   * Loop over the annotated exons and find the annotated coding nucs 
   */
  for (anno = annotatedExon->begin(); anno != annotatedExon->end(); anno++) {
    for (int i = (anno->begin >= 0)? anno->begin : 0; i <= anno->end; i++)
      nuc[i] |= 1;
  }
  
  /*
   * Now count the TP, FP and FN
   */
  for (int i=0; i<=n; i++){
      int f = nuc[i];
      if (f == 1){
	if (!UTR) {nukFNBK++;} else {nucUFN++;}
      }
      if((f & 1) == 0 && (f & 2) ){
	if (!UTR) {nukFPBK++;} else {nucUFP++;}
      }
      if((f & 1) == 0 && (f & 4) ){
	if (!UTR) {nukFPBKinside++;} else {nucUFPinside++;}
      }
      if ((f & 1) && (f & 2) ){
	if (!UTR) {nukTPBK++;} else {nucUTP++;}
      }
    }
  if (!UTR){
    nukFN += nukFNBK;
    nukFP += nukFPBK;
    nukFPinside += nukFPBKinside;
    nukTP += nukTPBK;
  }
  delete []nuc;
}

void Evaluation::evaluateQuickOnExonLevel(State* predictedExon, State* annotatedExon){
    
    State *examined, *firstOverlap, *lastOverlap;

    /*
     * First go through the predicted exons and check for each one whether it is
     * a true positive, or one of the three kinds of false positives.
     */

    firstOverlap = annotatedExon;
    for (examined = predictedExon; examined != NULL; examined = examined->next){
	while (firstOverlap && firstOverlap->end < examined->begin) {
	    firstOverlap = firstOverlap->next;
	}
	if (firstOverlap == NULL || firstOverlap->begin > examined->end) {
	    exonFP_wrong++;
	    exonFPBK++;
	} else if ((firstOverlap->begin == examined->begin) &&
		   (firstOverlap->end == examined->end)) {
	    exonTP++;
	    exonTPBK++;
	} else {
	    lastOverlap = firstOverlap;
	    while (lastOverlap && lastOverlap->end < examined->end) {
		lastOverlap = lastOverlap->next;
	    }
	    if ((firstOverlap->begin == examined->begin) || 
		(lastOverlap && (lastOverlap->end == examined->end))) {
		exonFP_partial++;
		exonFPBK++;
	    } else {
		exonFP_overlapping++;
		exonFPBK++;
	    }
	}
    }  
    /*
     * Now go through the annotated exons and check for each one whether it is
     * one of the three kinds of false negatives.
     */
    firstOverlap = predictedExon;
    for (examined = annotatedExon; examined != NULL; examined = examined->next){
	while (firstOverlap && firstOverlap->end < examined->begin) {
	    firstOverlap = firstOverlap->next;
	}
	if (firstOverlap == NULL || firstOverlap->begin > examined->end) {
	    exonFN_wrong++;
	    exonFNBK++;
	} else if ((firstOverlap->begin == examined->begin) &&
		   (firstOverlap->end == examined->end)) {
	    // do nothing: true positives have been counted above
	} else {
	    lastOverlap = firstOverlap;
	    while (lastOverlap && lastOverlap->end < examined->end) {
		lastOverlap = lastOverlap->next;
	    }
	    if ((firstOverlap->begin == examined->begin) || 
		(lastOverlap && (lastOverlap->end == examined->end))) {
		exonFN_partial++;
		exonFNBK++;
	    } else {
		exonFN_overlapping++;
		exonFNBK++;
	    }
	}
    }
}

void Evaluation::evaluateOnExonLevel(list<State> *predictedExon, list<State> *annotatedExon, bool UTR){
    
    list<State>::iterator examined, anno, pred;
    int klasse;
    /*
     * First go through the predicted exons and check for each one whether it is
     * a true positive, or one of the four kinds of false positives.
     * 0 : wrong     FP
     * 1 : overlap   FP
     * 2 : partial   FP, one end exact
     * 3 : almost    FP, one end exact, other end at most UTRoffThresh bp off
     * 4 : correct   TP
     */

    for (examined = predictedExon->begin(); examined != predictedExon->end(); examined++){
	klasse = 0;
	for (anno = annotatedExon->begin(); anno != annotatedExon->end(); anno++){
	    // overlaps with the annotated exon?
	    if (!(examined->begin > anno->end || examined->end < anno->begin)) 
		if (klasse<1) 
		    klasse = 1;
	    // one splice site correct?
	    if (examined->begin == anno->begin || examined->end == anno->end)
		if (klasse < 2)
		    klasse = 2;
	    // almost correct?
	    if (abs(examined->begin-anno->begin) <= UTRoffThresh && abs(examined->end-anno->end) <= UTRoffThresh)
	      if (klasse < 3)
		klasse = 3;
	    // correct?
	    if (examined->begin == anno->begin && examined->end == anno->end)
		if (klasse < 4)
		    klasse = 4;
	}
	switch (klasse) {
	  case 0: if (!UTR) {exonFP_wrong++;       exonFPBK++; } else {UTRexonFP++;} break;
	  case 1: if (!UTR) {exonFP_overlapping++; exonFPBK++; } else {UTRexonFP++;} break;
	  case 2: if (!UTR) {exonFP_partial++;     exonFPBK++; } else {UTRexonFP++;} break;
          case 3: if (!UTR) {exonFP_partial++;     exonFPBK++; } else {UTRexonTP++;} break; // UTR exons less stringent
	  case 4: if (!UTR) {exonTP++;             exonTPBK++; } else {UTRexonTP++;} break;
	    default: break;
	}
    }

    /*
     * Now go through the annotated exons and check for each one whether it is
     * one of the three kinds of false negatives.
     * 0 : wrong     FN
     * 1 : overlap   FN
     * 2 : partial   FN
     * 3 : almost    FN (UTR:TP)
     * 3 : correct   TP (not counted here, because exons could be annotated multiply)
     */

    for (examined = annotatedExon->begin(); examined != annotatedExon->end(); examined++){
	klasse = 0;
	for (pred = predictedExon->begin(); pred !=  predictedExon->end(); pred++) {
	    // overlaps with the predicted exon?
	    if (!(examined->begin > pred->end || examined->end < pred->begin)) 
		if (klasse<1) 
		    klasse = 1;
	    // one splice site correct?
	    if (examined->begin == pred->begin || examined->end == pred->end)
		if (klasse < 2)
		    klasse = 2;
	    if (abs(examined->begin - pred->begin) <= UTRoffThresh && abs(examined->end - pred->end) <= UTRoffThresh)
              if (klasse < 3)
                klasse = 3;
	    // correct?
	    if (examined->begin == pred->begin && examined->end == pred->end)
		if (klasse < 4)
		    klasse = 4;
	}
	switch (klasse) {
	    case 0: if (!UTR) {exonFN_wrong++; }       else {UTRexonFN++;} break;
	    case 1: if (!UTR) {exonFN_overlapping++; } else {UTRexonFN++;} break;
	    case 2: if (!UTR) {exonFN_partial++; }     else {UTRexonFN++;} break;
	    case 3: if (!UTR) {exonFN_partial++; }     else {            } break; // UTR exons less stringent
	    default: break;
	}
    }
}

void Evaluation::evaluateQuickOnGeneLevel(Transcript* const predictedGeneList, Transcript* const annotatedGeneList){
   Transcript *examined, *firstOverlap;
   State *aexon, *pexon;

   /*
    * Check for each annotated Gene, whether it is completely correct predicted
    */
   firstOverlap = predictedGeneList;
   for(examined = annotatedGeneList; examined != NULL; examined = (Gene*) examined->next) {
       numAnnoGenes++;
       while (firstOverlap && firstOverlap->exons->begin < examined->exons->begin)
	   firstOverlap = firstOverlap->next;

       if (firstOverlap) {
	   aexon = examined->exons;
	   pexon = firstOverlap->exons;
	   while (aexon && pexon && aexon->begin == pexon->begin && aexon->end == pexon->end) {
	       aexon = aexon->next;
	       pexon = pexon->next;
	   }
	   if (aexon == NULL && pexon == NULL && firstOverlap->complete) {
	       geneTP++;
	   } else {
	       geneFN++;
	   }
       } else {
	   geneFN++;
       }
   }

   /*
    * Count the number of ((complete)) predicted Genes
    */
   for (examined = predictedGeneList; examined != NULL; examined = examined->next)
       numPredGenes++;
}

void Evaluation::evaluateOnGeneLevel(Transcript* const predictedGeneList, Transcript* const annotatedGeneList){
   Transcript *examined, *pred, *previous;
   State *aexon, *pexon;
   bool correctPredicted;
   /*
    * Check for each annotated Gene, whether it is completely correct predicted
    */

   for(examined = annotatedGeneList; examined != NULL; examined = (Gene*) examined->next) {
       numAnnoGenes++;
       correctPredicted = false;
       for (pred = predictedGeneList; pred != NULL && !correctPredicted; pred = pred->next) {
	   // check whether examined is equal to pred
	   aexon = examined->exons;
	   pexon = pred->exons;
	   while (aexon && pexon && aexon->begin == pexon->begin && aexon->end == pexon->end) {
	       aexon = aexon->next;
	       pexon = pexon->next;
	   }
	   if (aexon == NULL && pexon == NULL && pred->complete) {
	       correctPredicted = true;
	   }
       }
       if (correctPredicted)
	   geneTP++;
       else
	   geneFN++;
   }

   /*
    * Count the number of ((complete)) predicted CDS
    */
   for (examined = predictedGeneList; examined != NULL; examined = examined->next) {
       bool seen = false;
       for (previous = predictedGeneList; previous != examined && previous != NULL && !seen; previous = previous->next) {
	   if (*previous == *examined)
	       seen = true;
       }
       if (!seen)
	   numPredGenes++;
   }
}

void Evaluation::evaluateOnUTRLevel(Transcript* const predictedGeneList, Transcript* const annotatedGeneList){
  Transcript *examined, *pred;
  Gene * examinedG, *predG;
  int predTSS, annoTSS, predTIS, annoTIS, diff;
  int predTTS, annoTTS, predSTP, annoSTP;
  /*
   * Check for each predicted gene, whether it has a TSS
   * and whether an annotated gene with the same translation start site exists
   */
  for (pred = predictedGeneList; pred != NULL; pred = pred->next) {
      predG = dynamic_cast<Gene*>(pred);
      if (!predG)
	  continue; // ignore non-coding genes;
      
      predTSS = (predG->strand == plusstrand)? predG->transstart : predG->transend;
      predTIS = (predG->strand == plusstrand)? predG->codingstart : predG->codingend;
      if (predTSS >= 0 && predG->complete5utr){
	  numTotalPredTSS++;
	  for (examined = annotatedGeneList; examined != NULL; examined = examined->next) {
	      examinedG = dynamic_cast<Gene*>(examined);
	      if (!examinedG)
		  continue; // ignore non-coding genes;
   
	      annoTSS = (examinedG->strand == plusstrand)? examinedG->transstart : examinedG->transend;
	      annoTIS = (examinedG->strand == plusstrand)? examinedG->codingstart : examinedG->codingend;
	      if (annoTIS == predTIS && annoTSS >= 0){
		  diff = predTSS - annoTSS;
		  if (diff < 0)
		      diff = -diff;
		  numTSS++;
		  if (diff <= MAXUTRDIST)
		      tssDist[diff]++;
	      }
	  }
      }
  }
  /*
   * Check for each predicted gene, whether it has a TTS
   * and whether an annotated gene with the same stop codon exists
   */
  for (pred = predictedGeneList; pred != NULL; pred = pred->next) {
      predG = dynamic_cast<Gene*>(pred);
      if (!predG)
	  continue; // ignore non-coding genes;

      predTTS = (predG->strand == plusstrand)? predG->transend : predG->transstart;
      predSTP = (predG->strand == plusstrand)? predG->codingend : predG->codingstart;
      if (predTTS >= 0 && predG->complete3utr){
	  numTotalPredTTS++;
	  for (examined = annotatedGeneList; examined != NULL; examined = examined->next) {
	      examinedG = dynamic_cast<Gene*>(examined);
	      if (!examinedG)
		  continue; // ignore non-coding genes;
	      annoTTS = (examinedG->strand == plusstrand)? examinedG->transend : examinedG->transstart;
	      annoSTP = (examinedG->strand == plusstrand)? examinedG->codingend : examinedG->codingstart;
	      if (annoSTP == predSTP && annoTTS >= 0){
		  diff = predTTS - annoTTS;
		  if (diff<0)
		      diff = -diff;
		  numTTS++;
		  if (diff <= MAXUTRDIST)
		      ttsDist[diff]++;
	      }
	  }
      }
  }

  /* evaluate on UTR exons
   * First make two lists just of all UTR exons
   * Then let them contain only the unique exons.
   */
  list<State> *predictedUTRExons=new list<State>, *annotatedUTRExons=new list<State>;
  State *st;
  const Transcript *t;
  const Gene *g;

  t = predictedGeneList;
  while (t) {
      g = dynamic_cast<const Gene*>(t);
      if (g) {
	  st = g->utr5exons;
	  while (st) {
	      predictedUTRExons->push_back(*st);
	      st = st->next;
	  }
	  st = g->utr3exons;
	  while (st) {
	      predictedUTRExons->push_back(*st);
	      st = st->next;
	  }
      }
      t = t->next;
  }

  t = annotatedGeneList;
  while (t) {
      g = dynamic_cast<const Gene*>(t);
      if (g) {
	  st = g->utr5exons;
	  while (st) {
	      annotatedUTRExons->push_back(*st);
	      st = st->next;
	  }
	  st = g->utr3exons;
	  while (st) {
	      annotatedUTRExons->push_back(*st);
	      st = st->next;
	  }
      }
      t = t->next;
  }
  numAnnoUTRExons += annotatedUTRExons->size();
  numPredUTRExons += predictedUTRExons->size();
  /*
   * make the lists of UTR exons unique
   */

  predictedUTRExons->sort();
  annotatedUTRExons->sort();
  predictedUTRExons->unique();
  annotatedUTRExons->unique();

  numUniqueAnnoUTRExons += annotatedUTRExons->size();
  numUniquePredUTRExons += predictedUTRExons->size();

  evaluateOnExonLevel(predictedUTRExons, annotatedUTRExons, true);
  evaluateOnNucleotideLevel(predictedUTRExons, annotatedUTRExons, true);
}

void Evaluation::finishEvaluation(){
    // UTR level evaluation
    // transcription start site
    if (numTSS>0){
	meanTssDist = 0.0;
	int numInRange=0;
	medianTssDist = -1;
	int cumNum=0;
	cout << "TSS distances " << endl;
	for (int i=0; i <= MAXUTRDIST; i++){
	    if (tssDist[i]>0) {
		cumNum += tssDist[i];
		if (2*cumNum >= numTSS && medianTssDist < 0)
		    medianTssDist = i;
		meanTssDist += i*tssDist[i];
		numInRange += tssDist[i];
		cout << i << "\ttimes:" << tssDist[i] << endl;
	    }
	}
	if (numTSS-numInRange > 0)
	    cout << "Warning: " << numTSS-numInRange << " TSS are off by more than " << MAXUTRDIST << endl;
	meanTssDist += MAXUTRDIST * (numTSS-numInRange); // assume that every distance > MAXUTRDIST is equal to MAXUTRDIST
	meanTssDist /= numTSS;
    } else {
	medianTssDist = -1;
	meanTssDist = -1;
    }
    // transcription termination site
    if (numTTS>0){
	meanTtsDist = 0.0;
	int numInRange = 0;
	medianTtsDist = -1;
	int cumNum=0;
	cout << "TTS distances " << endl;
	for (int i=0; i <= MAXUTRDIST; i++){
	    if (ttsDist[i]>0) {
		cumNum += ttsDist[i];
		if (2*cumNum >= numTTS && medianTtsDist < 0)
		    medianTtsDist = i;
		meanTtsDist += i*ttsDist[i];
		numInRange += ttsDist[i];
		cout << i << "\ttimes:" << ttsDist[i] << endl;
	    }
	}
	if (numTTS-numInRange > 0)
	    cout << "Warning: " << numTTS-numInRange << " TTS are off by more than " << MAXUTRDIST << endl;
	meanTtsDist += MAXUTRDIST * (numTTS-numInRange); // assume that every distance > MAXUTRDIST is equal to MAXUTRDIST
	meanTtsDist /= numTTS;
    } else {
	medianTtsDist = -1;
	meanTtsDist = -1;
    }
}

void Evaluation::print(){
    cout << "\n*******      Evaluation of gene prediction     *******" << endl << endl;

    // nucleotide level
    cout << "---------------------------------------------\\" << endl;
    cout << setw(16) << " " << " | " << setw(11) << "sensitivity" << " | " 
	 << setw(11) << "specificity" << " |" << endl;
    cout << "---------------------------------------------|" << endl;
    cout << setw(16) << "nucleotide level" << " | " << setw(11)
	 << setprecision(3) << nukSens 
	 << " | " <<  setw(11) << setprecision(3) << nukSpec << " |" << endl;
    cout << "---------------------------------------------/" << endl << endl;

    // exon level
    cout << "-----------------------------------------------------------------------------" 
	 << "-----------------------------\\" << endl;
    cout << setw(10) << " " << " | " << setw(6) << "#pred" << " | " << setw(6) << "#anno"
	 << " | " << setw(4) << " " << " | " << setw(18) << "FP = false pos." 
	 << " | " << setw(18) << "FN = false neg." << " | " 
	 << setw(11) << " " << " | " 
	 << setw(11) << " " << " |" << endl;
    cout << setw(10) << " " << " | " << setw(6) << "total/" << " | " << setw(6) << "total/"
	 << " | "  << setw(4) << "TP" << " |"<< setw(19) << "--------------------" 
	 << "|" << setw(19) << "--------------------" 
	 << "| " << setw(11) << "sensitivity" << " | " 
	 << setw(11) << "specificity" << " |" << endl;
    cout << setw(10) << " " << " | " << setw(6) << "unique" << " | " << setw(6) << "unique"
	 << " | " << setw(4) << " " << " | " << setw(4) << "part" << " | " << setw(4) 
	 << "ovlp" << " | "  << setw(4) << "wrng" << " | " <<  setw(4) << "part" 
	 << " | " << setw(4) << "ovlp" << " | " 
	 << setw(4) << "wrng" << " | " << setw(11) << " " << " | " 
	 << setw(11) << " " << " |" << endl;
    cout << "-----------------------------------------------------------------------------" 
	 << "-----------------------------|" << endl;
    cout << setw(10) << " " << " | " << setw(6) << " " << " | " << setw(6) << " "
	 << " | " << setw(4) << " " << " | " << setw(18) << exonFP << " | " << setw(18) << exonFN 
	 << " | " << setw(11) << " " << " | " 
	 << setw(11) << " " << " |" << endl;
    cout << setw(10) << "exon level" << " | " << setw(6) << numPredExons << " | " 
	 << setw(6) << numAnnoExons << " | " << setw(4) << exonTP << " | "
	 << setw(18) << "------------------" << " | " << setw(18) << "------------------" 
	 << " | " << setw(11) << setprecision(3) << exonSens << " | " 
	 << setw(11) << setprecision(3) << exonSpec << " |" << endl;
    cout << setw(10) << " " << " | " << setw(6) << numUniquePredExons << " | " << setw(6) << numUniqueAnnoExons
	 << " | " << setw(4) << " " << " | " << setw(4) << exonFP_partial << " | " << setw(4) 
	 << exonFP_overlapping << " | "  << setw(4) << exonFP_wrong << " | " <<  setw(4) 
	 << exonFN_partial << " | " << setw(4) << exonFN_overlapping << " | " 
	 << setw(4) << exonFN_wrong << " | " << setw(11) << " " << " | " 
	 << setw(11) << " " << " |" << endl;
    cout << "-----------------------------------------------------------------------------" 
	 << "-----------------------------/" << endl << endl;
    // transcript level
    cout << "-----------------------------------------------------" 
	 << "-----------------------\\" << endl;
    cout << setw(10) << "transcript" << " | " << setw(5) << "#pred" << " | " 
	 << setw(5) << "#anno" << " | " << setw(4) << "TP" << " | " 
	 << setw(4) << "FP" << " | " << setw(4) << "FN" << " | " 
	 << setw(11) << "sensitivity" << " | " 
	 << setw(9) << "specificity" << " |" << endl;
    cout << "------------------------------------------------------" 
	 << "----------------------|" << endl;
    cout << setw(10) << "gene level" << " | " << setw(5) << numPredGenes << " | " 
	 << setw(5) << numAnnoGenes << " | " << setw(4) << geneTP << " | " 
	 << setw(4) << geneFP << " | " << setw(4) << geneFN << " | " 
	 << setw(11) << setprecision(3) << geneSens << " | " 
	 << setw(11) << setprecision(3) << geneSpec << " |" << endl;
    cout << "-------------------------------------------------------" 
	 << "---------------------/" << endl;
    if (numTotalPredTSS > 0 || numTotalPredTTS > 0){
      // utr level
      cout << endl;
      cout << "------------------------------------------------------------------------\\" << endl;
      cout << setw(15) << "UTR" << " | "  << setw(10) << "total pred" << " | " << setw(14) << "CDS bnd. corr." << " | "
	   << setw(10) << "meanDiff" << " | "<< setw(10) << "medianDiff" << " |" << endl;
      cout << "------------------------------------------------------------------------|" << endl;
      cout << setw(15) << "TSS" << " | "  << setw(10) << numTotalPredTSS << " | " << setw(14) << numTSS << " | " 
	   << setw(10) << setprecision(3) << meanTssDist << " | " << setw(10) << medianTssDist << " |" << endl;
      cout << setw(15) << "TTS" << " | "  << setw(10) << numTotalPredTTS << " | " << setw(14) << numTTS << " | " 
	   << setw(10) << setprecision(3) << meanTtsDist << " | " << setw(10) << medianTtsDist << " |" << endl;
      cout << "------------------------------------------------------------------------|" << endl;
      cout << setw(15) << "UTR" << " | "  << setw(10) << "uniq. pred" << " | " << setw(14) << "unique anno" << " | "
           << setw(10) << "   sens." << " | "<< setw(10) << "     spec." << " |" << endl;
      cout << "------------------------------------------------------------------------|" << endl;
      cout << setw(15) << " " << " | "  << setw(45) << "true positive = 1 bound. exact, 1 bound. <= " << UTRoffThresh << "bp off" " |" << endl;
      cout << setw(15) << "UTR exon level" << " | "  << setw(10) << numUniquePredUTRExons << " | " << setw(14) << numUniqueAnnoUTRExons << " | "
           << setw(10) << setprecision(3) << UTRexonSens << " | " << setw(10) << UTRexonSpec << " |" << endl;
      cout << "------------------------------------------------------------------------|" << endl;
      cout << setw(15) << "UTR base level" << " | "  << setw(10) << (nucUTP+nucUFP) << " | " << setw(14) << (nucUTP+nucUFN) << " | "
           << setw(10) << setprecision(3) << nucUSens << " | " << setw(10) << nucUSpec << " |" << endl;
      cout << "------------------------------------------------------------------------/" << endl;
      cout << "nucUTP= " << nucUTP << " nucUFP=" << nucUFP << " nucUFPinside= " << nucUFPinside << " nucUFN=" << nucUFN << endl;
  
    }


#ifdef DEBUG
    cout << "longest predicted intron had length " << longestPredIntronLen << endl;
#endif
}

void Evaluation::printQuotients(){
    cout << "a-posteriori probability of viterbi path" << endl;
    cout << "----------------------------------------" << endl;
    cout << "a-posteriori probability of correct path" << endl << endl;

    quotients.sort();
    int one=0, ten=0;
    list<Double>::iterator it = quotients.begin();
    for (; it != quotients.end() && *it < 1.000001; it++)
	one++;
    cout << one << " times were the paths equally likely (identical)." << endl;
    cout << "sorted quotients of the rest:" << endl;
    for (; it != quotients.end(); it++){
	cout << *it << endl;
	if (*it < 10.0)
	    ten++;
    }
    cout << endl << ten << " quotients were between 1 and 10" << endl;
    cout << endl;
}


/*
 * predictAndEvaluate
 */

Evaluation* predictAndEvaluate(vector<AnnoSequence*> trainGeneList, FeatureCollection &extrinsicFeatures){
    Evaluation *eval = new Evaluation();
    NAMGene namgene;
    StatePath *viterbiPath, *condensedViterbiPath;
    bool crf = inCRFTraining;
    inCRFTraining = 0; // this way Viterbi will be much more efficient because of use of static variables
    for (int i=0; i < trainGeneList.size(); i++){
	AnnoSequence* evalGene = trainGeneList[i];
	SequenceFeatureCollection& sfc=extrinsicFeatures.getSequenceFeatureCollection(evalGene->seqname);
	sfc.prepare(evalGene, false);
	viterbiPath = namgene.getTrainViterbiPath(evalGene->sequence, &sfc);
	condensedViterbiPath = StatePath::condenseStatePath(viterbiPath);
	Transcript* viterbiGenes = condensedViterbiPath->projectOntoGeneSequence("v");
	// condensedViterbiPath->print();
	eval->addToEvaluation(viterbiGenes, evalGene->anno->genes, bothstrands);
	delete viterbiPath;
	delete condensedViterbiPath;
	Transcript::destroyGeneSequence(viterbiGenes);
    }
    inCRFTraining = crf;
    return eval;
}