CRequiredTaxon.h - baitfisher (debian/1.0+dfsg-1)

Tree @debian/1.0+dfsg-1 (Download .tar.gz)

CRequiredTaxon.h @debian/1.0+dfsg-1 — raw · history · blame

/*  BaitFisher (version 1.2.7) a program for designing DNA target enrichment baits
 *  Copyright 2013-2016 by Christoph Mayer
 *
 *  This source file is part of the BaitFisher-package.
 * 
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with BaitFisher.  If not, see <http://www.gnu.org/licenses/>.
 *
 *
 *  For any enquiries send an Email to Christoph Mayer
 *  c.mayer.zfmk@uni-bonn.de
 *
 *  When publishing work that is based on the results please cite:
 *  Mayer et al. 2016: BaitFisher: A software package for multi-species target DNA enrichment probe design
 *  
 */
#ifndef CREQUIREDTAXA_H
#define CREQUIREDTAXA_H

#include "faststring2.h"
#include <iostream>
#include <fstream>
#include <set>
#include <vector>
#include "CTaxonNamesDictionary.h"
#include "print_container.h"

class CRequiredTaxa
{
  std::vector<std::set<faststring> > required_groups;
  std::vector<std::set<unsigned> >   required_groups_u;
  CTaxonNamesDictionary &dictionary;
  bool empty;

  bool sets_intersect(const set<unsigned> &s1, const set<unsigned> &s2) const
  {
    //   cout << "Set1" << endl;
    //   print_set(cout, s1);
    //   cout << endl;
    //   cout << "Set2" << endl;
    //   print_set(cout, s2);
    //   cout << endl;

    unsigned size1 = s1.size();
    unsigned size2 = s2.size();

    unsigned size3 = macromin(size1, size2);

    // We allocate memory for the result set that will be stored in a vector.
    // Using allocated memory is more efficient than using an insert-iterator
    // that has more overhead.
    // Important: Later we have to shorten the vector to its actual size. 
    vector<unsigned> res(size3);
    vector<unsigned>::iterator it;
    it=set_intersection (s1.begin(), s1.end(), s2.begin(), s2.end(), res.begin());

/*     cout << "SET1: "; */
/*     print_container(cout, s1, "(", ",", ")\n"); */
/*     cout << "SET2: "; */
/*     print_container(cout, s2, "(", ",", ")\n"); */
/*     cout << "SET-RES1: "; */
/*     print_container(cout, res, "(", ",", ")\n");     */
/*     cout << "SIZE of intersection (A): " << res.size() << endl; */

    // Shorten the vector to the actual size of the intersection.
    // Otherwise we have trailing 0s from the constructor of the vector
    // and the size of res would always be size3.
    res.resize(it-res.begin());
/*     cout << "SIZE of intersection (B): " << res.size() << endl; */
    
/*     cout << "SET-RES2: "; */
/*     print_container(cout, res, "(", ",", ")\n");     */

    return (res.size()>0);
  }
  
  // Do not allow gaps:
  void parse_required_groups(faststring line, int &error)
  {
    if (line.empty())
    {
      cout << "LOG: Required taxa string is empty. No required taxa specified." << endl;
      empty = true;
    }

    //    dictionary.print(cout);

    set<faststring> empty_set;
    set<unsigned> empty_set_u;

    unsigned i1, i2, N1, N2;

    vector<faststring> splitter1;
    vector<faststring> splitter2;
  
    split_respect(splitter1, line, ", ");

    N1 = splitter1.size();

    for (i1=0; i1<N1; ++i1)
    {
      required_groups.push_back(empty_set);
      required_groups_u.push_back(empty_set_u);

      //    cout << splitter1[i] << endl;
      split(splitter2, splitter1[i1], "(, )");
      N2 = splitter2.size();
      for (i2=0; i2<N2; ++i2)
      {
	splitter2[i2].removeSpacesBack();
	splitter2[i2].removeSpacesFront();
	required_groups[i1].insert(splitter2[i2]);
	unsigned index = dictionary.dictionary_get_index_of_taxonname(splitter2[i2]);
	if (index == -1u)
	{
	  std::cerr << "ERROR: Could not find taxon name " << splitter2[i2]
		    << " in the list of all taxon names found in all given sequence files." << std::endl;
	  std::cerr << "Please check that the taxon name is spelled properly or remove the taxon name from the list of required"
		    << " taxa to proceed. Exiting." << std::endl;
	  error = 3;
	}
	required_groups_u[i1].insert(index);
      }
    }
  }


 public:
  // Error codes:
  // 0: No error,
  // 1: File does not contain a single line as required. We might have more or less than one line in the file.
  // 3: Sequence names not found in dictionary 

  // The dummy parameter is only used to distinguish this and the following constructor, since both have the same parameter list
  // except of this dummy parameter.
  /* Depricates. Required taxa have to be specified in the parameter file.
 CRequiredTaxa(faststring infilename, CTaxonNamesDictionary &dict, int &error, int dummy):dictionary(dict),empty(false)
  {
    std::ifstream is(infilename.c_str());

    error = 0;
    std::vector<faststring> file_as_vec;

    appendFileToVector(is, file_as_vec);

    unsigned i;
    i=0;
    while (i< file_as_vec.size() )
    {
      file_as_vec[i].removeSpacesFront();
      file_as_vec[i].removeSpacesBack();
      if (file_as_vec[i].empty() )
      {
	file_as_vec.erase(file_as_vec.begin()+i);
      }
      else
      {
	++i;
      }
    }
    if (file_as_vec.size() != 1)
    {
      error = 1;
      return;
    }
    parse_required_groups(file_as_vec[0], error);
  }
  */

   // Error codes:
  // 0: No error,
  // 1: -,  unused
  // 3: Sequence names not found in dictionary 
 CRequiredTaxa(faststring required_string, CTaxonNamesDictionary &dict, int &error):dictionary(dict),empty(false)
  {
    error = 0;
    parse_required_groups(required_string, error);
  }
 
  bool is_empty() const
  {
    return empty;
  }

  bool check_precense_of_all_groups_of_taxa(std::vector<faststring> taxon_vec) const
  {
    if (empty)
      return true;

    std::set<unsigned> taxon_set;

    unsigned i, N, index;
    for (i=0, N=taxon_vec.size(); i < N; ++i)
    {
      index = dictionary.dictionary_get_index_of_taxonname(taxon_vec[i]);
      taxon_set.insert(index);
    }

    // v_set must have a non vanishing intersection with all groups of taxa that are required.
    
    unsigned iset, Nset;
    bool all_taxa_present = true;

    for (iset=0, Nset=required_groups_u.size();iset < Nset; ++iset)
    {
      if ( !sets_intersect(taxon_set, required_groups_u[iset] ) )
      {
	//		cout << " Sets do not intersect: " << endl;
	//		print_set(cout, taxon_set);                cout << endl;
	//		print_set(cout, required_groups_u[iset]);  cout << endl;
	all_taxa_present = false;
	break;
      }
      
    }
    return all_taxa_present;
  }

  bool check_precense_of_all_groups_of_taxa(std::set<unsigned> taxon_ids) const
  {
    if (empty)
      return true;

    // taxon_ids must have a non vanishing intersection with all groups of taxa that are required.
    
    unsigned iset, Nset;
    bool all_taxa_present = true;

    for (iset=0, Nset=required_groups_u.size();iset < Nset; ++iset)
    {
      if ( !sets_intersect(taxon_ids, required_groups_u[iset] ) )
      {
	//		cout << " Sets do not intersect: " << endl;
	//		print_set(cout, taxon_set);                cout << endl;
	//		print_set(cout, required_groups_u[iset]);  cout << endl;
	all_taxa_present = false;
	break;
      }
    }
    return all_taxa_present;
  }

  


  void print(std::ostream &os, int mode = 0)
  {
    std::vector<std::set<faststring> >::iterator it_vec_faststrings;
    std::vector<std::set<unsigned> >::iterator   it_vec_unsigned;

    std::set<faststring>::iterator it_set_faststrings;
    std::set<unsigned>::iterator   it_set_unsigned;

    it_vec_faststrings = required_groups.begin();

    if (empty)
    {
      os << "CRequiredTaxa: empty" << endl;
      return;
    }

    while (it_vec_faststrings != required_groups.end())
    {
      os << "(";
      it_set_faststrings = it_vec_faststrings->begin();
      while (it_set_faststrings != it_vec_faststrings->end() )
      {
	os << *it_set_faststrings;
	++it_set_faststrings;
	if (it_set_faststrings != it_vec_faststrings->end())
	  os << ",";
      }
      ++it_vec_faststrings;
      os << "),";
      if (mode == 1)
	os << endl;
    }

    it_vec_unsigned  = required_groups_u.begin();

    os << "*********" << endl;

    while (it_vec_unsigned != required_groups_u.end())
    {
      os << "(";
      it_set_unsigned = it_vec_unsigned->begin();
      while (it_set_unsigned != it_vec_unsigned->end() )
      {
	os << *it_set_unsigned;
	++it_set_unsigned;
	if (it_set_unsigned != it_vec_unsigned->end())
	  os << ",";
      }
      ++it_vec_unsigned;
      os << "),";
      if (mode == 1)
	os << endl;
    }

  }
  

};

#endif