Codebase list augustus / debian/3.4.0+dfsg2-1 src / fastBlockSearch.cc
debian/3.4.0+dfsg2-1

Tree @debian/3.4.0+dfsg2-1 (Download .tar.gz)

fastBlockSearch.cc @debian/3.4.0+dfsg2-1raw · history · blame

/*
 * fastBlockSearch.cc
 *
 * License: Artistic License, see file LICENSE.TXT or 
 *          https://opensource.org/licenses/artistic-license-1.0
 */

// #include <iostream>
// #include <vector>
// #include <fstream>
// #include <iomanip>
// #include <sstream>
// #include <string>
// #include <set>
#include <iostream>
#include <fstream>
#include <iomanip>

#define VERBOSE_SEEDS 1

#include "properties.hh"
#include "pp_profile.hh"
#include "pp_fastBlockSearcher.hh"

// #define SEEDSIZE 6
// #define SEED_COLUMNSCORE 0.8
#define MAX_SEQSIZE (unsigned(-1)/2)

//#define TESTLEN 1000000
#define MINCOUNT 0

using namespace std;

// set this with --cutoff=x
// or with --avscore=exp(x)
double cutoff = 0.7; //
   
// struct IndexComp {
//     IndexComp(const PP::Column& v) : vec(v) {}
//     const PP::Column& vec;
//     bool operator() (int a, int b) {
// 	return vec.Q(a) < vec.Q(b);
//     }
// };

// struct PrefilterType {
//     PrefilterType() : seedhitcount(0), lastOffset(-10) {}
//     int seedhitcount;
//     int lastOffset;
// };


void printBestResults(PP::Profile& prfl, PP::FsHitCollection& searchResults, double cutoff, double columnCount, int offset) {
    multimap<double, PP::FsHitType> bestScores;
    searchResults.storeBestResults(bestScores, MINCOUNT, cutoff * columnCount);
    for (map<double,PP::FsHitType>::iterator it=bestScores.begin();
	 it != bestScores.end();
	 ++it) {
	vector<string> output(1, "--\n");
	cout << "Score:" << it->first << endl;
	cout << "Mult. score:" << exp(it->first / columnCount) << endl;
	for (PP::FsHitType* ht = &(it->second); ht != 0; ht = ht->predecessor) {
	    int startpos = ht->pos + offset + 
		3 * (ht->reverse ? 
		     prfl.blockSize(ht->blockNo)-ht->blockto :
		     ht->blockfrom);

	    ostringstream sstrm;

	    sstrm << startpos << "\t" << prfl[ht->blockNo].id;
	    if (ht->blockfrom != 0 || ht->blockto != prfl.blockSize(ht->blockNo)) 
		sstrm << "[" << ht->blockfrom << "," << ht->blockto << "]";
	    sstrm << "\t" << (ht->reverse ? '-' : '+') << "\t" 
		  << exp(ht->score / prfl.blockSize(ht->blockNo)) << "\t" 
//		  << ht->score << "\t" 
		  << prfl[ht->blockNo].getBackDist().normed(ht->score);
	    string blseq;
	    for (int i=0; i < prfl.blockSize(ht->blockNo); i++) {
		blseq += (i < ht->blockfrom || i >= ht->blockto) ? '.' : ht->reverse ? 
		    GeneticCode::revtranslate(PP::DNA::sequence + ht->pos + 3*(prfl.blockSize(ht->blockNo) -1 -i)) :
		    GeneticCode::translate(PP::DNA::sequence + ht->pos + 3*i);
	    }
	    sstrm << "\t" << blseq; // string(PP::DNA::sequence + ht->pos, prfl.blockSize(ht->blockNo)*3); 
	    sstrm << "\n";
#ifdef DEBUG
	    sstrm << string(PP::DNA::sequence + startpos - offset, 3*(ht->blockto - ht->blockfrom)) << endl;
#endif
	    output.push_back(sstrm.str());
	}
	while (!output.empty()) {
	    cout << output.back();
	    output.pop_back();
	}
    }
}



int main(int argc, char* argv[]) {
    bool abort = false;
    vector<string> argvec;
    for (int i=1; i<argc; i++) {
	string param(argv[i]);
	if (param.substr(0,2) == "--") {
	    int i = param.find("=");
	    if (param.find("=") != string::npos) {
		string key = param.substr(2, i-2);
		string value = param.substr(i+1);
		istringstream istrm(value);
		if (key == "cutoff" || key == "avscore") {
		    abort = !(istrm >> cutoff);
		    if (key == "avscore") 
			cutoff = log(cutoff);
		} else
		    Properties::addProperty(key, value);
	    } else
		abort = true;
	} else
	    argvec.push_back(param);
	if (abort) break;
    }

    if (abort || argvec.size() != 2) {
	cerr << "Usage: fastBlocksSearch [options] <seqs.fa> <fam.prfl>\n";
	return -1;
    }

    // read in sequence file
//     map<string, SeqType> sequences;
    ifstream fstrm(argvec[0].c_str());
    if (!fstrm) {
	cerr << "Could not open \"" << argvec[0] << "\".\n";
	return -1;
    }
    
// Properties::addProperty("/ProteinModel/block_threshold_spec", "0.0");
    // Properties::addProperty("/ProteinModel/block_threshold_sens", "0.0");
    PP::initConstants();
    // PP::Profile::min_anchor_count = 0;

    // read in ptn
    PP::Profile prfl(argvec[1].c_str());
    
    int columnCount = 0;
    PP::FsSeedCollection seedColl(prfl);
    for (int b=0; b<prfl.blockCount(); b++)
	columnCount += prfl.blockSize(b);

    cerr << "Profile has " << prfl.blockCount() << " blocks with " << columnCount << " columns." << endl;
    cerr << "Total seed count: " << seedColl.size() << endl;
    cerr << "Reading sequence(s)";
    int predictionStart=0, predictionLen=MAX_SEQSIZE;
    try {
	predictionStart = Properties::getIntProperty("predictionStart") -1;
    } catch (...) {}
    try {
	predictionLen = Properties::getIntProperty("predictionEnd") - predictionStart;
    } catch (...) {}
    if (predictionLen <= 0) {
	cerr << "predictionStart must be less than predictionEnd!" << endl;
	exit(-1);
    }
    while (fstrm) {
	PP::FsHitCollection fastSearchResults(prfl.blockCount());

	string seqname, currSeq;
	getline(fstrm, seqname);
	if (seqname[0] != '>') {
	    cerr << "Not in FASTA format.\n";
	    return -1;
	}
	// int pos=seqname.find_first_of("\t\n\v\f\r ");
	// if (pos>=0)
	//     seqname.erase(pos);
	seqname = seqname.substr(1);
	
	int readlen = 0;
	int progress = 0;
	while (fstrm && fstrm.peek() != '>') {
	    string line;
	    getline(fstrm, line);
	    int pos = 0;
	    for (int i=0; i<line.length() && pos < predictionLen - currSeq.length(); i++) 
		if (isalpha(line[i])) //  || currSeq[i] == '-' || currSeq[i] == '.')
		    line[pos++] = tolower(line[i]);
	    line.erase(pos);
	    pos += readlen;
	    if (predictionStart <= pos) {
		if (predictionStart > readlen) 
		    currSeq += line.substr(predictionStart - readlen);
		else  
		    currSeq += line;
	    }
	    readlen = pos;
	    int newprogress = readlen/1000000;
	    while (newprogress > progress) {
		progress++;
		if (progress%10) 
		    cerr << ".";
		else
		    cerr << " " << progress << "M ";
	    }
	    if (currSeq.length() >= predictionLen) {
		if (currSeq.length() > predictionLen) 
		    cerr << "ERROR: prediction Length incorrect!" << endl;
		break;
	    }
	}
	cerr << endl << "Read " << readlen << " bps from '" << seqname << "'.";
	if (predictionLen < MAX_SEQSIZE) 
	    cerr << " Using " << predictionLen << ".";
	cerr << endl;

	PP::DNA::initSeq(currSeq);
	PP::CandidateCollection candidates(prfl, seedColl);
	progress = 0;
	for (int t=2; t<currSeq.length(); t++) {
	    candidates.proceed(t, fastSearchResults);

	    double newprogress = (100.0 * (t+1)) / currSeq.length() -0.1;
	    while (progress < newprogress) {
		progress++;
		switch (progress%10) {
		    case 0 : cerr << progress; break;
		    case 9 : break;
		    case 8 : if (progress != 98) cerr << ".";
		    default: cerr << ".";
		}
	    }
	}
	cerr << "\n # hits: " << fastSearchResults.allHitCount() << endl; 
	cerr << " # hit groups: " << fastSearchResults.resultCount() << endl;
	cerr << "output will show all hits above " << cutoff * columnCount;
	if (MINCOUNT > 0)
	    cerr << ", at least " << MINCOUNT;
	cerr << ".\n";
    
	cout << "Hits found in " << seqname << endl;
	printBestResults(prfl, fastSearchResults, cutoff, columnCount, predictionStart);
	cout << endl;
        delete [] PP::DNA::sequence;
        
	if (predictionStart > 0 || predictionLen != MAX_SEQSIZE)
	    break;
    }
}