Codebase list fastqtl / HEAD src / analysisPermutation.cpp
HEAD

Tree @HEAD (Download .tar.gz)

analysisPermutation.cpp @HEADraw · history · blame

//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
//
//This program is free software: you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program.  If not, see <http://www.gnu.org/licenses/>.

#include "data.h"


void data::runPermutation(string fout, vector < int > nPermutations) {

    //0. Prepare genotypes
    vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
    vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
    if (covariate_count > 0) {
        LOG.println("\nCorrecting genotypes for covariates");
        covariate_engine->residualize(genotype_orig);
    }
    for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
    normalize(genotype_orig);

    //1. Loop over phenotypes
    ofile fdo (fout);
    for (int p = 0 ; p < phenotype_count ; p ++) {

        LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");

        //1.1. Enumerate all genotype-phenotype pairs within cis-window
        vector < int > targetGenotypes, targetDistances;
        for (int g = 0 ; g < genotype_count ; g ++) {
            int cisdistance = genotype_pos[g] - phenotype_start[p];
            if (abs(cisdistance) <= cis_window) {
                targetGenotypes.push_back(g);
                targetDistances.push_back(cisdistance);
            }
        }
        LOG.println("  * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));

        //1.2. Copy original data
        vector < float > phenotype_curr = phenotype_orig[p];
        if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
        phenotype_sd[p] = RunningStat(phenotype_curr).StandardDeviation();
        normalize(phenotype_curr);

        //1.3. Nominal pass: scan cis-window & compute statistics
        double bestCorr = 0.0;
        int bestDistance = ___LI___, bestIndex = -1;
        for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
            double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
            if (abs(corr) > abs(bestCorr) || (abs(corr) == abs(bestCorr) && abs(targetDistances[g]) < bestDistance)) {
                bestCorr = corr;
                bestDistance = targetDistances[g];
                bestIndex = targetGenotypes[g];
            }
        }
        if (targetGenotypes.size() > 0) LOG.println("  * Best correlation = " + sutils::double2str(bestCorr, 4));

        //1.4. Permutation pass:
        bool done = false;
        int countPermutations = 0, nBetterCorrelation = 0;
        vector < double > permCorr;
        do {
            double bestCperm = 0.0;
            phenotype_curr = phenotype_orig[p];
            random_shuffle(phenotype_curr.begin(), phenotype_curr.end());
            if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
            normalize(phenotype_curr);
            for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
                double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
                if (abs(corr) > abs(bestCperm)) bestCperm = corr;
            }
            if (abs(bestCperm) >= abs(bestCorr)) nBetterCorrelation++;
            permCorr.push_back(bestCperm);
            countPermutations++;

            if (nPermutations.size() == 1 && countPermutations >= nPermutations[0]) done = true;
            if (nPermutations.size() == 2 && (nBetterCorrelation >= nPermutations[0] || countPermutations >= nPermutations[1])) done = true;
            if (nPermutations.size() == 3 && (countPermutations >= nPermutations[0]) && (nBetterCorrelation >= nPermutations[1] || countPermutations >= nPermutations[2])) done = true;
        } while (!done);
        if (targetGenotypes.size() > 0) LOG.println("  * Number of permutations = " + sutils::int2str(nBetterCorrelation) + " / " + sutils::int2str(countPermutations));

        //1.5. Calculate effective DFs & Beta distribution parameters
        vector < double > permPvalues;
        double true_df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
        double mean = 0.0, variance = 0.0, beta_shape1 = 1.0, beta_shape2 = 1.0;
        fdo << phenotype_id[p] << "\t" << targetGenotypes.size();
        if (targetGenotypes.size() > 0) {
            //Estimate number of degrees of freedom
            if (putils::variance(permCorr, putils::mean(permCorr)) != 0.0) {
                learnDF(permCorr, true_df);
                //LOG.println("  * Effective degree of freedom = " + sutils::double2str(true_df, 4));
            }
            //Compute mean and variance of p-values
            for (int c = 0 ; c < permCorr.size() ; c ++) permPvalues.push_back(getPvalue(permCorr[c], true_df));
            for (int pv = 0 ; pv < permPvalues.size() ; pv++) mean += permPvalues[pv];
            mean /= permPvalues.size();
            for (int pv = 0 ; pv < permPvalues.size() ; pv++) variance += (permPvalues[pv] - mean) * (permPvalues[pv] - mean);
            variance /= (permPvalues.size() - 1);
            //Estimate shape1 & shape2
            if (targetGenotypes.size() > 1 && mean != 1.0) {
                beta_shape1 = mean * (mean * (1 - mean ) / variance - 1);
                beta_shape2 = beta_shape1 * (1 / mean - 1);
                if (targetGenotypes.size() > 10) mleBeta(permPvalues, beta_shape1, beta_shape2);	//ML estimate if more than 10 variant in cis
            }
            LOG.println("  * Beta distribution parameters = " + sutils::double2str(beta_shape1, 4) + " " + sutils::double2str(beta_shape2, 4));
            fdo << "\t" << beta_shape1 << "\t" << beta_shape2 << "\t" << true_df;
        } else fdo << "\tNA\tNA\tNA";

        //1.6. Writing results
        if (targetGenotypes.size() > 0 && bestIndex >= 0) {
            double pval_fdo = getPvalue(bestCorr, true_df);
            double df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
            double tstat2 = getTstat2(bestCorr, df);
            double pval_nom = getPvalueFromTstat2(tstat2, df);
            double pval_slope = getSlope(bestCorr, phenotype_sd[p], genotype_sd[bestIndex]);
            double slope_se = abs(pval_slope) / sqrt(tstat2);
            fdo << "\t" << pval_fdo;
            fdo << "\t" << genotype_id[bestIndex];
            fdo << "\t" << bestDistance;
            fdo << "\t" << genotype_ma_samples[bestIndex];
            fdo << "\t" << genotype_ma_count[bestIndex];
            fdo << "\t" << genotype_maf[bestIndex];
            fdo << "\t" << genotype_ref_factor[bestIndex];
            fdo << "\t" << pval_nom;
            fdo << "\t" << pval_slope;
            fdo << "\t" << slope_se;
            fdo << "\t" << (nBetterCorrelation + 1) * 1.0 / (countPermutations + 1.0);
            fdo << "\t" << pbeta(pval_fdo, beta_shape1, beta_shape2, 1, 0);
        } else fdo << "\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA";
        fdo << endl;

        LOG.println("  * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
    }
    fdo.close();
}