src/merkmal.cc - augustus (multiarch-fixes/main)

Tree @multiarch-fixes/main (Download .tar.gz)

merkmal.cc @multiarch-fixes/main — raw · history · blame

/*
 * merkmal.cc
 *
 * License: Artistic License, see file LICENSE.TXT or 
 *          https://opensource.org/licenses/artistic-license-1.0
 * 
 * Description: Features (German: Merkmale) for the Conditional Random Field
 */

#include "merkmal.hh"

// project includes
#include "geneticcode.hh"
#include "projectio.hh"
#include "namgene.hh"
#include "evaluation.hh"
#include "extrinsicinfo.hh"
#include "gene.hh"
#include "statemodel.hh"
#include "utrmodel.hh"

// standard C/C++ includes
#include <fstream>


vector<AnnoSequence*> CRF::evalAnnoSeqs;
FeatureCollection *CRF::evalExtrinsicFeatures = NULL;
int CRF::statecount = 0;
StateModel **CRF::states = NULL;

void MMGroup::registerPars(Parameters* parameters, double alphavalue){
    this->parameters = parameters;

    for (int idx=0; idx < getNumPars(); idx++)
	parameters->addMerkmal(getFactor(idx), alphavalue);
}

void MMGroup::addCount(int index, int count) {
    if (parameters)
	parameters->addCount(offset+index, count);
}

Double* PatMMGroup::getFactor(int index){
    return &probs[index];
}

string PatMMGroup::verbalDescription(int index){
    ostringstream descr;
    descr << parname;
    descr << " ";
    descr << Seq2Int(order+1).INV(index);
    return descr.str();
}

Double PatMMGroup::getMinProb(float qthresh){
  return quantile(probs, qthresh);
}

void FramedPatMMGroup::getFramePat(int index, int &frame, int &pattern){
    frame = 0;
    while (frame < 3 && index >= probs[frame].size()){
	index -= probs[frame].size();
	frame++;
    }
    if (frame > 2)
	throw ProjectError("FramedPatMMGroup: Index out of range.");
    pattern = index;
}

Double* FramedPatMMGroup::getFactor(int index){
    int frame, pattern;
    getFramePat(index, frame, pattern);
    return &probs[frame][pattern];
}

string FramedPatMMGroup::verbalDescription(int index){
    int frame, pattern;
    getFramePat(index, frame, pattern);
    ostringstream descr;
    descr << parname;
//    descr << " idx ";
//    descr << index;
    descr << " frame ";
    descr << frame;
    descr << " ";
    descr << Seq2Int(order+1).INV(pattern);
    return descr.str();
}


/*
 * BinnedMMGroup::trainBins
 *
 * Find bin boundaries and starting average values by computing the quantiles of the origprobs
 */

void BinnedMMGroup::trainBins(int maxNumBins){
    int i, j, n;
    int minBinSize;
    Double sum;

    avprobs.clear();
    bb.clear();
    if (maxNumBins < 1){
	nbins=0; // use no binning
	return;
    }
    if (origprobs.size() < 1){
	throw ProjectError("Tried to train binned feature group without training data.");
    }
    std::sort(origprobs.begin(), origprobs.end());

    minBinSize = origprobs.size() / maxNumBins;
    if (minBinSize < 1)
	minBinSize = 1;
    nbins = 0;
    i = j = 0;
    
    while (i < origprobs.size()){
	sum = 0.0;
	n = 0; // number of items in current bin
	while(i < origprobs.size() && (n < minBinSize || origprobs[i] == origprobs[i-1])){
	    sum += origprobs[i];
	    n++;
	    i++;
	}
	avprobs.push_back(sum / n); // (arithmetic average for now)
	if (i < origprobs.size()) {
	    bb.push_back((origprobs[i] + origprobs[i-1]) / 2); // warning origprobs[i] and origprobs[i-1] could be almost equal
	}
	nbins++;
    }
    monotonify();
}

void BinnedMMGroup::monotonify(){
    if (monotonic == 0)
	return;
    // if an adjacent pair of probs violates the monotonicity criteria, set both to their average
    for (int j=0; j < nbins-1; j++){
	if (monotonic * (avprobs[j+1] - avprobs[j]) < 0){
	    cout << "monotonify: set " << verbalDescription(j) << "= " << avprobs[j] << " and "
		 << verbalDescription(j+1) << "= " << avprobs[j+1] << " to their average." << endl;
	    avprobs[j] = avprobs[j+1] = (avprobs[j] + avprobs[j+1]) / 2;
	}
    }
    // it will be closer to monotinicity now, but it is not monotone in general, TODO if time
}

void BinnedMMGroup::printBoundaries(){
    for (int j=0; j < nbins-1; j++)
	cout << "bb[" << j << "]= " << bb[j] << endl;
}

int BinnedMMGroup::getIndex(Double p){
    // make a binary search to find the bin, in which p belongs
    int a=0, b=nbins-1, m;
    while (a < b){
	m = (a+b)/2;
	if (p < bb[m])
	    b = m;
	else
	    a = m+1;
    }
    if (a > 0 && a < nbins -1 && !(bb[a-1] <= p && bb[a] > p))
	cerr << "getIndex error " << p  << " " << bb[a-1] << " " << bb[a] << endl;
    return a;
}

void BinnedMMGroup::write(ostream &out){
    out << "# number of bins:\n" << nbins << endl;
    out << "# monotonic: ";
    if (monotonic>0)
	out << "increasing";
    else if (monotonic<0)
	out << "decreasing";
    else 
	out << "no";
    out << endl;
    out << "# lower bin boundary <= average probability" << endl;
    out << "\t" << avprobs[0] << endl;
    for (int i=1; i < nbins; i++)
	out << bb[i-1] << "\t" << avprobs[i] << endl;
}

void BinnedMMGroup::read(ifstream &in){
    if (!in)
	throw ProjectError("Error when reading in BinnedMMGroup.");
    in >> comment >> nbins;
    if (nbins < 1)
	throw ProjectError("BinnedMMGroup:: less than one bin.");
    avprobs.resize(nbins);
    bb.resize(nbins-1);
    in >> comment >> avprobs[0];
    for (int i=1; i < nbins; i++)
	in >> bb[i-1] >> avprobs[i];
}

Double BinnedMMGroup::getMinProb(float qthresh){
  if (nbins==0)
    return quantile(origprobs, qthresh);
  else
    return quantile(avprobs, qthresh);
}

Double* BinnedMMGroup::getFactor(int index){
    return &avprobs[index];
}

string BinnedMMGroup::verbalDescription(int index){
    ostringstream descr;
    descr << parname;
    descr << " bin[";
    if (index > 0)
	descr << bb[index-1];
    else
	descr << "-infty";
    descr << ", ";
    if (index < nbins-1)
	descr << bb[index];
    else
	descr << "+infty";
    descr << ")";
    return descr.str();
}

int Parameters::addMMGroup(MMGroup* mmgroup, double alphavalue){
    int offset = weights.size();
    // check consistency of offset 
    if (mmgroups && mmgroups->size() > 0){
	MMGroup* lastMMGroup = (*mmgroups)[mmgroups->size()-1];
	if (offset !=  lastMMGroup->getOffset() + lastMMGroup->getNumPars())
	   throw ProjectError("Parameters::addMMGroup: inconsistency detected");
    }
    mmgroup->registerPars(this, alphavalue);
    if (!mmgroups)
	mmgroups = new vector<MMGroup*>;
    mmgroups->push_back(mmgroup);
    mmgroup->setOffset(offset);
    return offset;
}

void Parameters::addMerkmal(Double* parptr, double alphavalue){
    if (!parptrs)
	parptrs = new vector<Double*>;
    parptrs->push_back(parptr);
    weights.push_back(parptr->log());
    alpha.push_back(alphavalue);
    counts.push_back(0);
}

void Parameters::addWeights(vector<double> &h){
    if (h.size() != weights.size())
	throw ProjectError ("Parameters::addWeights: vector sizes not matching.");
    for (int i=0; i < h.size(); i++)
	weights[i] += h[i];
}

void Parameters::setWeights(vector<double> &v){
    if (v.size() != counts.size())
	throw ProjectError ("Parameters::addWeights: vector sizes not matching.");
    weights = v;
}

void Parameters::smoothFeatures(){
    for (int i=0; i<mmgroups->size(); i++)
	(*mmgroups)[i]->smooth();
}

void Parameters::updatePars(){
    if (!parptrs)
	return;
    if (parptrs->size() != weights.size())
	throw ProjectError("Unequal parameter vectors in Parameters::updatePars.");
    for (int idx=0; idx < weights.size(); idx++)
	(*(*parptrs)[idx]) = LLDouble::exp(weights[idx]);
}

void Parameters::updateWeights(){
    if (!parptrs)
        return;
    if (parptrs->size() != weights.size())
        throw ProjectError("Unequal parameter vectors in Parameters::updatePars.");
    for (int idx=0; idx < weights.size(); idx++)
	weights[idx] = ((*parptrs)[idx])->log();
}

string Parameters::verbalDescription(int index){
    if (!mmgroups)
	return "";
    for (int i=0; i<mmgroups->size(); i++){
	int offset = (*mmgroups)[i]->getOffset();
	if ( offset <= index && offset + (*mmgroups)[i]->getNumPars() > index)
	    return (*mmgroups)[i]->verbalDescription(index-offset);
    }
    return "error";
}



/*
 * CRF::train
 */
void CRF::train(Parameters* startPars, vector<AnnoSequence*> trainGenes, FeatureCollection &extrinsicFeatures){
    /*
     * check whether there are hints for the evaluation set
     * TODO
     */
    evalExtrinsicFeatures = new FeatureCollection();


    /*
     * decide on the training method
     */
    string method;
    try {
	method = Properties::getProperty("CRF_TRAIN");
    } catch  (...) {
        method = "OLM";
    }
    // train parameters
    if (method == "IIS") {
	cout << "use IIS algorithm (Improved Iterative Scaling) for training" << endl;
	improvedIterativeScaling(startPars, trainGenes, extrinsicFeatures);
    } else {
	cout << "use OLM algorithm (Online Large Margin) for training" << endl;
	onlineLargeMarginTraining(startPars, trainGenes, extrinsicFeatures);
    }
}

/*
 * onlineLargeMarginTraining (OLM)
 * 
 */
void CRF::onlineLargeMarginTraining(Parameters* parameters, vector<AnnoSequence*> trainGenes, FeatureCollection &extrinsicFeatures){
    inCRFTraining = true;
    NAMGene namgene;
    Double viterbiPathEmiProb, correctPathEmiProb;
    double correctScore, viterbiScore;
    double loss; // value of loss function
    AnnoSequence* as;
    vector<int> a; // count vector of correct features minus viterbi features
    vector<double> h(parameters->size(), 0.0); // weight difference vector
    vector<double> v(parameters->size(), 0.0); // accumulated weights for averaging at the end
    unsigned N; // number of rounds
    int m; // training set size
    int numiterations=0, u; // at most Nm
    bool termination=false;
    double target;
    StatePath *viterbiPath, *correctPath, *condensedViterbiPath=NULL;//, *condensedCorrectPath=NULL;
    int chunksize = 100; // consider this many training genes to be one training example (will be increased a little below to fit)
    int maxTrainSeqs = -1; // use only this many training genes for CRF iterations per round (-1: use all)
    vector<double> bs;
    vector< vector<int> > viterbiCounts, correctCounts; // holds all feature counts of a chunk
    vector<int> chunkViterbiCounts(parameters->size(), 0), chunkCorrectCounts(parameters->size(), 0);
    vector<SequenceFeatureCollection*> sfcs; // sequence feature collection for each training sequence
    UtrModel::setTtsSpacing(1); // so the true parses are possible in the model

// temp for optimizing parameters
    double scorediffweight = 0.05;
    double lossweight = 2.0;
    try {
        scorediffweight = Properties::getdoubleProperty("scorediffweight");
    } catch (...){}
    try {
        lossweight = Properties::getdoubleProperty("lossweight");
    } catch (...){}
    cout << "scorediffweight= " << scorediffweight << "\t lossweight= " << lossweight << endl << flush;

    
    /*
     * preparations
     */
    try {
	N = Properties::getIntProperty("CRF_N");
	if (N < 1){
	    cerr << "CRF_N < 1 not allowed. Please use --CRF=0 to turn off CRF training. Will use CRF_N = 1..." << endl;
	    N = 1;
	}
    } catch  (...) {
	N = 3;
    }

    /*
     * precompute and save correct paths
     * removed since this takes 72 bytes for every state (basically 70Mb per mega base)
     */
    //vector<StatePath*> correctPaths;
    m = trainGenes.size();
    chunksize = (chunksize > m) ? m : (int)((float)m/(int)(m/chunksize) + .99999); // increase chunksize so the chunks are all equal size and fit in training set size evenly
    cout << "training set size: " << m << " chunk size: " << chunksize << endl;
    if (evalAnnoSeqs.size() <= 0)
      cout << "have no evaluation set" << endl;
    vector<int> numCorrect;
    numCorrect.resize(m, 0);
    bs.resize(chunksize, 0.0);
    viterbiCounts.resize(chunksize);
    correctCounts.resize(chunksize);

    for (unsigned round = 1; round <= N && !termination; round++){
	/*
	 * evaluate parameters
	 */
	if (evalAnnoSeqs.size() > 0){ // have evaluation set
            UtrModel::setTtsSpacing(10); 
	    cout << "Evaluating on " << evalAnnoSeqs.size() << " sequences " << endl;
	    Evaluation* eval = predictAndEvaluate(evalAnnoSeqs, *evalExtrinsicFeatures);
	    target = (3*eval->nukSens + 2*eval->nukSpec + 4*eval->exonSens + 3*eval->exonSpec + 2*eval->geneSens + 1*eval->geneSpec)/15;
	    cout << "Evaluation before round " << round << " target = " << target << endl;
	    eval->finishEvaluation();
	    eval->print();
	    // TODO Kriterien zum Abbruch des Trainings, z.B. round > 2 und wurde nicht mehr besser
	    UtrModel::setTtsSpacing(1);
	    termination = false;
	    delete eval;
	}
	if (termination){
	    cout << "Terminated training algorithm" << endl;
	    break;
	}
	cout << "*** CRF training round " << round << " of " << N <<  " ***" << endl;
	for (int i=0; i < m && (maxTrainSeqs < 0 || i < maxTrainSeqs); i += chunksize) {
	    // reset chunk counters
	    bs.assign(chunksize, 0.0);
	    // loop over chunk	   
	    for (u = 0; u < chunksize && i + u < m; u++) {
		as = trainGenes[i+u];
		cout << "using training sequence " << as->seqname << " (length=" << as->length << ")." << endl;
		if (round == 1) {
		    SequenceFeatureCollection &sfc = extrinsicFeatures.getSequenceFeatureCollection(as->seqname);
		    if (sfcs.size() != i+u) {
			throw ProjectError("inconsistency with index in sfcs");
		    }
		    sfc.prepare(as, true);
		    sfcs.push_back(&sfc);
		}
	    
		/*
		 * compute Viterbi path
		 */
		viterbiPath = namgene.getTrainViterbiPath(as->sequence, sfcs[i+u]);
		condensedViterbiPath = StatePath::condenseStatePath(viterbiPath);
		//condensedViterbiPath->print();
		Gene* viterbiGenes = (Gene*) condensedViterbiPath->projectOntoGeneSequence("v");
		if (viterbiGenes && round < 4)
		    printGeneList(viterbiGenes, as, false, false);
		/*
		 * correct path
		 */
		correctPath = StatePath::getInducedStatePath(as->anno->genes, as->length, false);
		//condensedCorrectPath = StatePath::condenseStatePath(correctPath);
		//condensedCorrectPath->print();
		if (round < 4)
		    as->anno->printGFF();

		/*
		 * compare Viterbi with correct path
		 */
		if (*correctPath == *viterbiPath){
		    cout << "Gene(s) correctly predicted in " << as->seqname << ". I don't do nothin." << endl;
		    numCorrect[u+i]++;
		} else {
		    cout << "Viterbi wrong in " << as->seqname << "." << endl;
		    // testing for now: Only count features falling in range of annotated gene(s)
		    int countFrom = as->anno->genes->geneBegin()-10000;
		    int countTo = as->anno->genes->geneEnd()+10000; //TODO: deal with multi-gene sequences
		    cout << "counting features only in region " << countFrom << ".." << countTo << endl;
		    StateModel::setCountRegion(countFrom, countTo);
		    parameters->resetCounts();
		    viterbiPathEmiProb = namgene.getPathEmiProb(viterbiPath, as->sequence, *sfcs[i+u], countFrom, countTo);
		    viterbiCounts[u] = parameters->getCounts();
		    viterbiScore = viterbiPathEmiProb.log();
		    cout << "Viterbi pathEmiProb=" << viterbiPathEmiProb << " score = " << setprecision(6) << viterbiScore << endl;
		    if (round == 1){
			cout << "counts of viterbi path" << endl;
			parameters->print<int>(viterbiCounts[u], 5, true);
		    }
		    
		    parameters->resetCounts();
		    correctPathEmiProb = namgene.getPathEmiProb(correctPath, as->sequence, *sfcs[i+u], countFrom, countTo);
		    correctCounts[u] = parameters->getCounts();
		    correctScore = correctPathEmiProb.log();
		    cout << "correct pathEmiProb=" << correctPathEmiProb << " score = " << setprecision(6) << correctScore << endl;
		    if (round == 1){
			cout << "counts of correct path" << endl;
			parameters->print<int>(correctCounts[u], 5, true);
		    }
		    parameters->resetCounts();
		    
		    if (!(correctPathEmiProb > 0.0)){
			cout << "Correct path IMPOSSIBLE in model. Can't do anything about this by only changing weights." << endl;
		    } else {
			/*
			 * compute loss function that measures how far Viterbi is off
			 */
			loss = lossFct(as->anno->genes, viterbiGenes); // TODO
			if (viterbiScore < correctScore){
			    cout << "viterbi score < correct score" << endl;
			}
			bs[u] = scorediffweight * ((viterbiScore > correctScore)? (viterbiScore - correctScore) : 0) + lossweight * loss;
			cout << "b = scorediffweight * ((viterbiScore > correctScore)? (viterbiScore - correctScore) : 0) + lossweight * loss = " 
			     << scorediffweight << " * ( " << viterbiScore << " - " << correctScore << ") + " << lossweight 
			     << " * " << loss << " = " << bs[u] << endl;
		    }
		} // *correctPath != *viterbiPath
		cout << "correct: " << numCorrect[u+i] << " of " << N << endl;
		// clean up
		delete correctPath;
		delete viterbiPath;
		if (condensedViterbiPath)
		    delete condensedViterbiPath;
	    } // end of loop over chunk elements
	    
	    vector<double> bsc = capOutliers(bs);
	    h.assign(h.size(), 0.0);
	    cout << "updating parameters after chunk processing" << endl;
	    for (int uu = 0; uu < bsc.size(); uu++){
		cout << "chunk item no " << uu << "\tcapped b = " << bsc[uu];
		if (bsc[uu] > 0.0){
		    int numfeat = 0;
		    a = correctCounts[uu];
		    for (int j=0; j < a.size(); j++)
			a[j] -= viterbiCounts[uu][j];
		    double normOfA = 0.0;
		    for (int j=0; j < a.size(); j++){
			normOfA += parameters->getAlpha(j) * a[j]*a[j];
			if (a[j] != 0)
			    numfeat++;
		    }
		    if (normOfA > 0.0) {
			double scalefactor = bsc[uu] / normOfA;
			for (int j=0; j < h.size(); j++){
			  // This is a hack against the following problem caused by the fact that currently not all features are CRF-trained
			  // Consider a difference only in a small initial exon. Then the only ss/exon feature difference is one 
			  // false dss feature, which gets all the change (in one case something like a factor of 65)
			  double change = scalefactor * parameters->getAlpha(j) * a[j];
			  if (change > 0.5) // = factor of 2
			    change = 0.5;
			  if (change < -0.5)
                            change = -0.5;
			  h[j] += change;
			}
		    }
		    cout << "\tnumfeat=" << numfeat << "\tnormOfA=" << normOfA;
		}
		cout << endl;
	    }
	    /*
	     * update weights ...
	     */
	    vector<double> startWeights = parameters->getWeights();     //TEMP
	    parameters->addWeights(h);
	    parameters->updatePars();
	    parameters->smoothFeatures();
	    parameters->updateWeights();
	    vector<double> endWeights = parameters->getWeights();       //TEMP
	    CRF::compareWeights(parameters, startWeights, endWeights, 30);       //TEMP
	    if (false){
		cout << "updated Parameters:" << endl;
		parameters->print(150, false);
	    }
	    
	    /*
	     * for testing: compare the scores of viterbi and the correct path under the new weights, b should be 0
	     *
	     viterbiPathEmiProb = namgene.getPathEmiProb(viterbiPath, as->sequence, sfc[i]);
	     viterbiScore = viterbiPathEmiProb.log();
	     correctPathEmiProb = namgene.getPathEmiProb(correctPath, as->sequence, sfc[i]);
	     correctScore = correctPathEmiProb.log();
	     b = viterbiScore - correctScore + loss;
	     cout << "with new pars: b = viterbiScore - correctScore + loss" << " = " << viterbiScore
	     << " - " << correctScore << " + " << loss << " = " << b << endl;
	     if (abs(b) > 0.001)
	     throw ProjectError("updating not correct");
	    */
	    
	    // add up the weight vector
	    for (int j=0; j < v.size(); j++)
		v[j] += parameters->getWeight(j);
	    numiterations++;
	
	    // ******** test evaluation
	    if (evalAnnoSeqs.size() > 0 && (numiterations <=200  || 400 < chunksize || ((int) numiterations/chunksize) % 4 == 0)){// 200 was 500, 400 was 100
		cout << "Evaluating on " << evalAnnoSeqs.size() << " sequences " << endl;
		UtrModel::setTtsSpacing(10);
		Evaluation* eval = predictAndEvaluate(evalAnnoSeqs, *evalExtrinsicFeatures);
		target = (3*eval->nukSens + 2*eval->nukSpec + 4*eval->exonSens + 3*eval->exonSpec + 2*eval->geneSens + 1*eval->geneSpec)/15;
		cout << "Evaluation after iteration " << numiterations << " and training sequence " << i + u << ". target = " << target << endl;
		eval->finishEvaluation();
		eval->print();
		UtrModel::setTtsSpacing(1);
		delete eval;
	    }
	    
	    //parameters->setWeights(startWeights); // undo changes, temp for testing
	    //parameters->updatePars();

	} // end of loop over training examples
	if (states && round < N){ // save intermediate parameters to file
	    char* suffix = new char[4];
	    sprintf(suffix, "%d", round);
	    cout << "saving intermediate parameters " << suffix << endl;
	    ContentDecomposition cd;
	    BaseCount bc;
	    for( int i = 0; i < statecount; i++ ) // first delete all files with this suffix
	      states[i]->printProbabilities( -1, &bc, suffix );
	    for (int idx=0; idx < cd.n; idx++) {
	      bc = cd.getBaseCount(idx);
	      for( int i = 0; i < statecount; i++ )
		states[i]->printProbabilities(idx, &bc, suffix);
	    }
	}
    } // end of loop over rounds
    cout << "N*m = " << N << " * " << m << " = " << N*m << ". Number of iterations: " << numiterations << endl;
    //for (int i=0; i < v.size(); i++)
    //	v[i] /= numiterations;
    cout << "using last parameters after training" << endl;
    //    parameters->setWeights(v);
    //    parameters->updatePars();
    // clean up
    // for (int i=0; i < correctPaths.size(); i++)
    // delete correctPaths[i];
    inCRFTraining = false;
}

/*
 * improvedIterativeScaling
 * Improved Iterative Scaling Training Algorithm (IIS)
 * Maximum Likelihood without regularization
 */
void CRF::improvedIterativeScaling(Parameters* parameters, vector<AnnoSequence*> trainGenes, FeatureCollection &extrinsicFeatures){
    inCRFTraining = true;
    NAMGene namgene;
    namgene.setNeedForwardTable(true); // for sampling
    int m=0; // training set size, only feasible
    int sample=100; // number of sample iterations per sequence
    double up = 0.0; // upper limit on the feature sum
    double score, totalScore = 0.0, target;
    AnnoSequence* as;
    int N; // number of rounds
    vector<int> counts;
    StatePath *sampledPath, *correctPath, *condensedSampledPath=NULL;//, *condensedCorrectPath=NULL;
    Double pathEmiProb;
    bool termination = false;
    vector<int> Nf(parameters->size(), 0); // actual count of features in training data
    vector<int> Sf(parameters->size(), 0); // sampled count of features in training data
    vector<double> Ef(parameters->size(), 0.0); // expected count of features in training data
    vector<bool> feasible; // for each training gene: whether it is possible in the model
    vector<SequenceFeatureCollection*> sfcs; // sequence feature collection for each training sequence
    vector<double> h; // weight difference vector
    /*
     * preparations
     */
    try {
        N = Properties::getIntProperty("CRF_N");
    } catch  (...) {
        N = 1;
    }
    if (evalAnnoSeqs.size() <= 0)
      cout << "Have no evaluation set." << endl;
    /*
     * iterate over optimization rounds
     */
    for (int round = 1; round <= N && !termination; round++){
	/*
         * evaluate parameters
         */
        if (evalAnnoSeqs.size() > 0){ // have evaluation set
            Evaluation* eval = predictAndEvaluate(evalAnnoSeqs, *evalExtrinsicFeatures);
            target = (3*eval->nukSens + 2*eval->nukSpec + 4*eval->exonSens + 3*eval->exonSpec + 2*eval->geneSens + 1*eval->geneSpec)/15;
            cout << "Evaluation before round " << round << " target = " << target << endl;
            eval->print();
            termination = false;
            delete eval;
        }
        if (termination){
            cout << "Terminated training algorithm" << endl;
            break;
        }

	/*
	 * loop over the training examples
	 */
	cout << "*** CRF(IIS) training round " << round << " of " << N <<  " ***" << endl;
	for (int k=0; k < parameters->size(); k++) // reset sampled counts to 0
	    Sf[k] = 0;
	totalScore = 0.0;
	for (int i=0; i < trainGenes.size(); i++){
	    as = trainGenes[i];
	    if (round > 1 && feasible.size() >i && !feasible[i]) {
		cout << "skipping infeasible seq " << as->seqname << endl;
		continue;
	    }
	    //cout << "computing expected feature counts for training example " << as->seqname << " (length=" << as->length << ")." << endl;
	    if (round == 1) {
		SequenceFeatureCollection &sfc = extrinsicFeatures.getSequenceFeatureCollection(as->seqname);
		if (sfcs.size() != i) {
		    throw ProjectError("inconsistency with index in sfcs");
		}
		sfc.prepare(as, true);
		sfcs.push_back(&sfc);
	    }
	    StatePath *viterbiPath = namgene.getTrainViterbiPath(as->sequence, sfcs[i]);
	    delete viterbiPath; // we actually don't need this in this algorithm, but namgene needs the forward table..
	    
	    /*
             * in the first round compute the feature sum for all training examples
             * this is done simultaneously because viterbi/forward/ automatically initializes for getPathEmiProb
             */
            if (round == 1 || feasible[i]){
                correctPath = StatePath::getInducedStatePath(as->anno->genes, as->length, false);
                //condensedCorrectPath = StatePath::condenseStatePath(correctPath);
                //cout << "correctPath:" << endl; condensedCorrectPath->print();
                parameters->resetCounts();
		//cout << "getting path emi prob" << endl << flush;
                pathEmiProb = namgene.getPathEmiProb(correctPath, as->sequence, *sfcs[i]);
		delete correctPath;
		//delete condensedCorrectPath;
                //cout << "correct pathEmiProb=" << pathEmiProb << endl;
		if (pathEmiProb > 0.0){
		    if (round == 1) {
			feasible.push_back(true);
			m++;
			if (as->length > up)
			    up = as->length;
			vector<int> counts = parameters->getCounts();
			for (int k=0; k < parameters->size(); k++)
			    Nf[k] += counts[k];
		    }
		    score = pathEmiProb.log();
		    totalScore += score;
		    cout << "training seq no " << m << " " << as->seqname << ", score= " << score << endl;
                } else {
		    if (round == 1) {
			cout << "training gene structure on " << as->seqname << " not feasible. Skipping this example from now on." << endl;
			feasible.push_back(false);
		    }
		}
            }
	    if (feasible[i]){
		for (int j=0; j < sample; j++){
		    sampledPath = namgene.getSampledPath(as->sequence, as->seqname);
		    condensedSampledPath = StatePath::condenseStatePath(sampledPath);
		    //cout << j+1 << "th sampled path:" << endl; condensedSampledPath->print();
		    parameters->resetCounts();
		    namgene.getPathEmiProb(sampledPath, as->sequence, *sfcs[i]);
		    vector<int> counts = parameters->getCounts();
		    for (int k=0; k < parameters->size(); k++)
			Sf[k] += counts[k];
		    delete sampledPath;
		    delete condensedSampledPath;
		}
	    }
	}
	if (round == 1){
	    cout << "training set has size " << m << endl;
	    cout << "total feature counts of all features: " << endl;
	    parameters->print<int>(Nf, true, true);
	    cout << "up = uppper limit for feature count of all training sequences and any gene structure = " << up << endl;
	}
	cout << "total Score of all training genes: " << setprecision(6) << totalScore << endl;
	/*
	 * compare expected counts with actual counts
	 */
	// cout << "sampled count of features:" << endl; parameters->print<int>(Sf, true, true);
	cout << "expected count of features:" << endl;
	for (int k=0; k < parameters->size(); k++)
	    Ef[k] = (double) Sf[k] / sample;
        parameters->print<double>(Ef, true, true);
	cout << "comparison actual -> expected feature counts:" << endl;
	CRF::compareWeights<int,double>(parameters, Nf, Ef);
	/*
	 * update weights
	 */
	if (round <= 50)
	    up = 300;
	else 
	    up = 500;
	h.resize(parameters->size());
	for (int k=0; k < h.size(); k++){
	    // TODO: regularization
	    if (Ef[k] == 0.0 && Nf[k] > 0){
		h[k] = 0.1; //arbitrary
	    } else if (Nf[k] == 0) {
		h[k] = -0.1;
	    } else {
		h[k] = log(Nf[k]/Ef[k])/up;
	    }
	    cout << "h[" << k << "] = " << h[k];
	    if (k%10 == 0 || k == h.size()-1)
		cout << endl;
	    else
		cout << "  ";
	}
	vector<double> startWeights = parameters->getWeights(); //TEMP
	parameters->addWeights(h);
	parameters->smoothFeatures();
	parameters->updatePars();
	vector<double> endWeights = parameters->getWeights();   //TEMP
	CRF::compareWeights(parameters, startWeights, endWeights, 50);   //TEMP
	cout << "updated Parameters:" << endl;
	parameters->print(50, false);
	// ******** test evaluation
	if (evalAnnoSeqs.size() > 0 && round == N){
	    cout << "Evaluating on " << evalAnnoSeqs.size() << " sequences " << endl;
	    Evaluation* eval = predictAndEvaluate(evalAnnoSeqs, *evalExtrinsicFeatures);
	    target = (3*eval->nukSens + 2*eval->nukSpec + 4*eval->exonSens + 3*eval->exonSpec + 2*eval->geneSens + 1*eval->geneSpec)/15;
	    cout << "Evaluation after last round " << round << " target = " << target << endl;
	    eval->print();
	    delete eval;
	}
    }
    inCRFTraining = false;
}


double CRF::lossFct(Transcript* correct, Transcript* predicted){
    Evaluation eval;
    eval.addToEvaluation(predicted, correct, bothstrands);
    cout << "nukFP=" << eval.nukFP << " nukFPinside=" << eval.nukFPinside << " nukFN=" << eval.nukFN << " nucUFPinside=" << eval.nucUFPinside << " nucUFN=" << eval.nucUFN << endl;
    return 0.001 * eval.nukFPinside + 0.001 * eval.nukFN // coding bases
      + 0.0004 * eval.nucUFPinside + 0.0004 * eval.nucUFN; // noncoding bases
}



/*
 * capOutliers
 * replace bs[u] by min{bs[u], cap}, where cap is the qth Quantile
 * this weighs down the effect of extreme outliers which could be misannotated or chanceless to be predicted correctly 
 */
vector<double> CRF::capOutliers(vector<double> bs){
    double q = 0.9, cap;
    vector<double> t = bs, bsc(bs.size(), 0.0);
    try {
      q = Properties::getdoubleProperty("capthresh");
    } catch (...){
      q = 0.9;
    }
    if (bs.size() > 0){
	std::sort(t.begin(), t.end());
	cap = t[(int) (q * (bs.size()-1))];
	cout << "capOutliers: " << 100*q << "% quantile= " << cap << endl;
	//cout << "t\tbs\tbsc" << endl;
	for (int i=0; i < bs.size(); i++){
   	    bsc[i] = (bs[i] <= cap)? bs[i] : 0; // this was set to cap instead of 0 before
	    if (bsc[i] < bs[i]){
		cout << "capped outlier " << bs[i] << " --> " << bsc[i] << endl;
	    }
	    //cout << t[i] << "\t" << bs[i] << "\t" << bsc[i] << endl;
	}
    }
    return bsc;
}