bait-fisher-helper.cpp - baitfisher (debian/1.0+dfsg-1)

Tree @debian/1.0+dfsg-1 (Download .tar.gz)

bait-fisher-helper.cpp @debian/1.0+dfsg-1 — raw · history · blame

/*  BaitFisher (version 1.2.7) a program for designing DNA target enrichment baits
 *  Copyright 2013-2016 by Christoph Mayer
 *
 *  This source file is part of the BaitFisher-package.
 * 
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with BaitFisher.  If not, see <http://www.gnu.org/licenses/>.
 *
 *
 *  For any enquiries send an Email to Christoph Mayer
 *  c.mayer.zfmk@uni-bonn.de
 *
 *  When publishing work that is based on the results please cite:
 *  Mayer et al. 2016: BaitFisher: A software package for multi-species target DNA enrichment probe design
 *  
 */
#include "faststring2.h"
#include <iostream>
#include <list>

#include "CFile/CFile2_1.h"
#include "CSequences2.h"
#include "CSequence_Mol2_1.h"

#include "bait-fisher-helper.h"

using namespace std;

// Extract species (taxon) name from transcript name:
faststring extract_taxon_name(faststring sequence_name, char delim, unsigned short field_num)
{
  list<faststring> container;  char             delim_cstring[2];
  
  delim_cstring[0] = delim;
  delim_cstring[1] = '\0';

  split(container, sequence_name, delim_cstring);

  // If the field number (0 based) is not smaller the number of fields, we have a problem.
  if (!(field_num < container.size() ) )
  {
    cerr << "ERROR: In extract_taxon_name: Found unexpected number of fields in a sequence name: " << sequence_name << endl;
    cerr << "I tried to extract field number " << field_num << " as specified in the parameter file or given as default value." << endl;
    cerr << "But this field does not exist." << endl;
    exit(-3);
  }

  list<faststring>::iterator it = container.begin();
  unsigned short i;
  
  for (i=0; i<field_num; ++i)
    ++it;

  return *it;
}


// Extract gene name from transcript name:
faststring extract_gene_name(faststring sequence_name, char delim, unsigned short field_num)
{
  list<faststring> container;
  char             delim_cstring[2];
  
  

  delim_cstring[0] = delim;
  delim_cstring[1] = '\0';

  split(container, sequence_name, delim_cstring);

  // If the field number (0 based) is not smaller the number of fields, we have a problem.
  if (!(field_num < container.size() ) )
  {
    cerr << "ERROR: In extract_gene_name: Found unexpected number of fields in a sequence name: " << sequence_name << endl;
    cerr << "I tried to extract field number " << field_num << " as specified in the parameter file or given as default value." << endl;
    cerr << "But this field does not exist." << endl;
    exit(-3);
  }

  list<faststring>::iterator it = container.begin();
  unsigned short i;
  
  for (i=0; i<field_num; ++i)
    ++it;
  
#ifdef SHORTEN_GENENAME_AFTER_MINUS
    faststring pre, post;
    it->backwards_search_divide_at_break_at('-', delim, pre, post);
    return pre;
#else
    return *it;
#endif
}

// (1) Read alignment.
// (2) Provide pointer to specific sequence (the one for which the name is specified, see (3)) as well as to the full alignment.
// (3) If species_name_query is not empty, find the sequence with this name.
//     If species_name_query is an empty string, assign NULL to pseq.
// (4) Important: The CSequence_Mol and CSequences2 objects need to be deleted by the user of this function when not used any more.
// Version 2: Read full alignment and return both: full alignment and requested sequence.
void read_alignment_sequence_of_species2(faststring       alignment_file,
					 faststring       species_name_query,
					 CSequence_Mol*&  pseq,
					 CSequences2*&    pseqs,
					 unsigned&        pseq_index,
					 char             delim, 
					 unsigned short   field_num)
{
  CFile           infile;
  //  unsigned        count_seq=0;
  CSequences2*    tmp_pseqs;


//   fputs("Reading alignment file: ", stderr);
//   fputs(alignment_file.c_str(), stderr);
//   fputs(".\n", stderr);

  infile.ffopen(alignment_file.c_str());
  if ( infile.fail() )
  {
    fprintf(stderr, "Sorry! The input file \n   \"%s\"\ncould not be opened. I guess it does not exist.\n",
	    alignment_file.c_str());
    exit(-1);
  }

  CSequence_Mol::processing_flag pflag = CSequence_Mol::processing_flag(0);
  // Convert all nucleotides to upper case letters and ambig to Ns.
  pflag = CSequence_Mol::processing_flag(pflag | CSequence_Mol::convert_toupper | CSequence_Mol::convert_ambig2N);
  tmp_pseqs = new CSequences2(CSequence_Mol::dna);

  ++DEBUG_COUNT_NEW_ALIGNMENT_FROM_FILE;

  tmp_pseqs->read_from_Fasta_File(infile, pflag, 0, -1, false);
  infile.ffclose();


//   fputsf("Alignment file has been read successfully.\n", stderr);

  pseqs = tmp_pseqs;


  if (species_name_query.empty() )
  {
    pseq = NULL;
    pseq_index = -1u;    
  }
  else
  {
    // Find sequence with taxon name equal to species_name_query
    unsigned i, N;
    N = pseqs->GetTaxaNum();
  
    for (i=0; i<N; ++i)
    {
      const char*  seq_name = pseqs->get_Seq_Name(i);
    
      if (seq_name == NULL)
      {
	cerr << "INTERNAL ERROR: Could not retriev sequence name for sequence of index: " 
	     << i << " in read_transcript_sequence_of_species2." << endl;
	exit(-5);
      }

      faststring species_name = extract_taxon_name(seq_name, delim, field_num);

      //     cout << "Index: " << i << " " << N << endl;
      //     cout << "Comapring: " << species_name << " " << species_name_query << endl;

      if (species_name == species_name_query)
	break;
    }
    if (i == N) // There is no sequence with species name equal to the one we search for.
    {
      pseq = NULL;
      pseq_index = -1u;
    }
    else
    {
      pseq = pseqs->get_seq_by_index(i);
      pseq_index = i;
    }
  }
}