Codebase list seer / upstream/1.1.2 src / combineKmers.cpp
upstream/1.1.2

Tree @upstream/1.1.2 (Download .tar.gz)

combineKmers.cpp @upstream/1.1.2raw · history · blame

/*
 * combineKmers.cpp
 * Takes the union of kmer counts generated by dsk
 *
 */


//
// Consider transforming bases to a more compact representation
//
// Just use sample index vs. use full sample name
//
// TODO read dsk output directly

#include "combineKmers.hpp"

int main (int argc, char *argv[])
{
   // Program description
   std::cerr << "combineKmers: basic algorithm to combine kmer counts across samples\n";

   // Do parsing and checking of command line params
   // If no input options, give quick usage rather than full help
   boost::program_options::variables_map vm;
   if (argc == 1)
   {
      std::cerr << "Usage: combineKmers -s samples.txt -o all_kmers --min_samples 2\n\n"
         << "For full option details run combineKmers -h\n";
      return 0;
   }
   else if (parseCommandLine(argc, argv, vm))
   {
      return 1;
   }

   // Read in list of sample kmer files and their names
   std::vector<std::tuple<std::string, std::string> > samples = readSamples(vm["samples"].as<std::string>());
   std::unordered_map<int, std::string> sample_names;
   size_t min_samples = checkMin(samples.size(), vm["min_samples"].as<int>());

   // Open the output file before counting kmers
   ogzstream out_file((vm["output"].as<std::string>() + ".gz").c_str());

   // Map to store kmers
   // TODO this would be neater if written with objects
   std::unordered_map<std::string, std::vector<std::tuple<int, int>>> kmer_union;

   // Add kmers to map
   std::cerr << "Reading and mapping kmers..." << std::endl;
   for (unsigned int i = 0; i < samples.size(); ++i)
   {
      std::string sample_name, filename;
      std::tie(sample_name, filename) = samples[i];

      // Rather than storing lots of copies of sample names as strings, just
      // store hash keys
      sample_names[i] = sample_name;

      std::ifstream kmer_counts(filename.c_str());

      if (kmer_counts)
      {
         std::cerr << "File " << i + 1 << "/" << samples.size() << "\r";
         std::cerr.flush();

         std::string kmer, abundance;
         while (kmer_counts)
         {
            kmer_counts >> kmer >> abundance;

            kmer_union[kmer].push_back(std::make_tuple(i, stoi(abundance)));
         }
      }
      else
      {
         std::cerr << "Could not open " + filename << std::endl;
         std::cerr << "Skipping..." << std::endl;
      }
   }
   std::cerr << std::endl;

   // Print results
   std::cerr << "Printing union of kmers" << std::endl;
   for(auto kmer_it = kmer_union.cbegin(); kmer_it != kmer_union.cend(); ++kmer_it)
   {
      if (kmer_it->second.size() >= min_samples)
      {
         out_file << kmer_it->first;
         for (auto sample_it = kmer_it->second.cbegin(); sample_it != kmer_it->second.cend(); ++sample_it)
         {
            int sample, abundance;
            std::tie (sample, abundance) = *sample_it;

            out_file << " " << sample_names[sample] + ":" + std::to_string(abundance);
         }
         out_file << std::endl;
      }
   }

   std::cerr << "Done." << std::endl;

   return(0);
}