/* BaitFisher (version 1.2.7) a program for designing DNA target enrichment baits
* Copyright 2013-2016 by Christoph Mayer
*
* This source file is part of the BaitFisher-package.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with BaitFisher. If not, see <http://www.gnu.org/licenses/>.
*
*
* For any enquiries send an Email to Christoph Mayer
* c.mayer.zfmk@uni-bonn.de
*
* When publishing work that is based on the results please cite:
* Mayer et al. 2016: BaitFisher: A software package for multi-species target DNA enrichment probe design
*
*/
#ifndef EASYSTRING_H
#define EASYSTRING_H
#include <cctype>  // for toupper, tolower, islower
#include <cstdio>  // for sprintf / snprintf
#include <cstdlib>
#include <cstring> // for strchr
#include <fstream>
#include <iostream>
#include <string>
// A useful global function split:
// (Code copied from the stringtok.h file on the GNU libstdc++ page.)
namespace {
// Returns true if the character c is one of the characters listed in the
// NUL-terminated set wstr. By default the set contains the usual white space
// characters; callers pass their own set to test for other delimiters.
//
// c    : character to classify.
// wstr : NUL-terminated list of characters that count as "white space".
inline bool
isws(char c, const char* wstr="\r\n\t\v\f ")
{
    // strchr() also finds the terminating NUL of wstr, so without the explicit
    // check a NUL character would be reported as a member of every set.
    return c != '\0' && strchr(wstr,c) != NULL;
}
}
namespace {
//
// Advances pos through s until it rests on the first delimiter character that
// is not enclosed in (), [], {}, single quotes or double quotes. If no such
// character exists, pos ends up equal to s.size().
//
// s   : string that is scanned.
// pos : in: index at which scanning starts; out: index of the delimiter found,
//       or s.size() if the end of the string was reached.
// ws  : NUL-terminated list of delimiter characters.
//
// The five kinds of enclosure are tracked independently of each other; a
// delimiter only ends the scan while all of them are closed.
//
inline void
skip_until_ws_respect(const std::string &s, std::string::size_type &pos, char const * const ws = "\r\n\t\v\f ")
{
    // Nesting depth per bracket kind. The counters are unsigned, so a closing
    // bracket without a preceding opener wraps around instead of going
    // negative; the state then counts as "open" until an opener of the same
    // kind brings the counter back to zero.
    unsigned depth_round  = 0;
    unsigned depth_square = 0;
    unsigned depth_curly  = 0;
    // Quote state: 1 while inside a quoted region, 0 otherwise.
    unsigned in_single = 0;
    unsigned in_double = 0;

    const std::string::size_type len = s.size();

    for (; pos < len; ++pos)
    {
        const char ch = s[pos];

        switch (ch)
        {
            case '(':  ++depth_round;             break;
            case ')':  --depth_round;             break;
            case '[':  ++depth_square;            break;
            case ']':  --depth_square;            break;
            case '{':  ++depth_curly;             break;
            case '}':  --depth_curly;             break;
            case '\'': in_single = 1 - in_single; break;
            case '\"': in_double = 1 - in_double; break;
            default:                              break;
        }

        // The state is evaluated after the current character has been
        // accounted for, so a bracket or quote character never ends the scan
        // while it leaves something open.
        const bool all_closed = (depth_round == 0) && (depth_square == 0) &&
                                (depth_curly == 0) && (in_single == 0) &&
                                (in_double == 0);
        if (all_closed && isws(ch, ws))
            return;
    }
}
}
namespace std
{
/*****************************************************************
* Simplistic and quite Standard, but a bit slow. This should be
* templatized on basic_string instead, or on a more generic StringT
* that just happens to support ::size_type, .substr(), and so on.
* I had hoped that "whitespace" would be a trait, but it isn't, so
* the user must supply it. Enh, this lets them break up strings on
* different things easier than traits would anyhow.
*/
template <typename Container>
int
split (Container &l, string const &s, char const * const ws = "\r\n\t\v\f ")
{
l.clear();
const string::size_type S = s.size();
string::size_type i = 0;
while (i < S) {
// eat leading whitespace
while ((i < S) && (isws(s[i],ws))) ++i;
if (i == S) return l.size(); // nothing left but WS
// find end of word
string::size_type j = i+1;
while ((j < S) && (!isws(s[j],ws))) ++j;
// add word
l.push_back(s.substr(i,j-i));
// set up for next loop
i = j+1;
}
return l.size();
}
//
// Tokenizer which does not split braces.
//
template <typename Container>
int
split_respect (Container &l, string const &s, char const * const ws = "\r\n\t\v\f ")
{
l.clear();
const string::size_type S = s.size();
string::size_type i = 0;
while (i < S) {
// eat leading whitespace
while ((i < S) && (isws(s[i],ws))) ++i;
if (i == S) return l.size(); // nothing left but WS
// find end of word
string::size_type j = i; // We can't skip the first char which can be a brace.
skip_until_ws_respect(s, j, ws);
// add word
l.push_back(s.substr(i,j-i));
// set up for next loop
i = j+1;
}
return l.size();
}
// Multiple successive deliminators result in multiple hits.
template <typename Container>
int
split_strict (Container &l, string const &s, char const * const ws = " \t\n")
{
l.clear();
const string::size_type S = s.size();
string::size_type i = 0;
while (i < S) {
// find end of word
string::size_type j = i;
while ((j < S) && (!isws(s[j],ws))) ++j;
// add word
l.push_back(s.substr(i,j-i));
// set up for next loop
i = j+1;
if (i==S) // Only true of last string is empty since otherwise i==S+1
{
l.push_back("");
}
}
return l.size();
}
class easystring: public string
{
public:
// Default constructor:
easystring():string(){};
// Copy constructor:
easystring(const string& s):string(s){}
easystring(const string& s, size_type pos, size_type n):string(s,pos,n){}
// Other constructors:
easystring(const char* s, size_type n):string(s,n){}
easystring(size_type n, char c):string(n,c){}
template <class InputIterator>
easystring(InputIterator first, InputIterator last):string(first, last){}
// Type conversion constructors:
easystring(const char* s):string(s){}
easystring(const char c):string(1,c){}
easystring(const int i):string()
{
// std::cout << "Hallo" << std::endl;
char tmp[21]; // Sufficient for 64 bit numbers + sign
sprintf(tmp, "%d", i);
append(tmp);
}
easystring(const long i):string()
{
// std::cout << "Hallo" << std::endl;
char tmp[21]; // Sufficient for 64 bit numbers + sign
sprintf(tmp, "%ld", i);
append(tmp);
}
easystring(const unsigned i):string()
{
// std::cout << "Hallo" << std::endl;
char tmp[21]; // Sufficient for 64 bit numbers
sprintf(tmp, "%u", i);
append(tmp);
}
easystring(const unsigned long i):string()
{
// std::cout << "Hallo" << std::endl;
char tmp[21]; // Sufficient for 64 bit numbers
sprintf(tmp, "%lu", i);
append(tmp);
}
easystring(const double x, int pres):string()
{
// std::cout << "Hallo" << std::endl;
char tmp[25]; // Sufficient for 64 bit numbers
sprintf(tmp, "%.*f", pres, x);
append(tmp);
}
void removeSpacesFront(const char* delims="\r\n\t\v\f ")
{
string::size_type i;
const string::size_type n = size();
for (i=0; i < n && isws((*this)[i], delims); ++i);
erase(0,i);
}
void removeSpacesBack(const char* delims="\r\n\t\v\f ")
{
unsigned i;
unsigned n = size();
if (n > 0)
{
for (i=n-1; i != 0 && isws((*this)[i], delims); --i);
if (i == 0 && isws((*this)[0], delims))
erase();
else
erase(i+1);
}
}
void ToUpper()
{
iterator it, it_end;
it = begin();
it_end = end();
while (it != it_end)
{
*it = toupper(*it);
++it;
}
}
void ToLower()
{
string::iterator it, it_end;
it = begin();
it_end = end();
while (it != it_end)
{
*it = tolower(*it);
++it;
}
}
// Convert to unsigned long
unsigned ToUnsigned() const
{
char *end;
return strtoul(c_str(), &end, 0);
}
// Convert to unsigned long
unsigned long ToUnsignedLong() const
{
char *end;
return strtoul(c_str(), &end, 0);
}
// Convert to int
int ToInt() const
{
char *end;
return strtol(c_str(), &end, 0);
}
// Convert to long
long ToLong() const
{
char *end;
return strtol(c_str(), &end, 0);
}
long ToLong(string::size_type beg_pos,
string::size_type& end_pos) const
{
const char *beg = c_str()+beg_pos;
char *end;
long result;
result = strtol(beg, &end, 0);
end_pos = end-c_str();
return result;
}
unsigned ToUnsigned(string::size_type beg_pos,
string::size_type& end_pos) const
{
const char *beg = c_str()+beg_pos;
char *end;
long result;
result = strtoul(beg, &end, 0);
end_pos = end-c_str();
return result;
}
// Convert to double
double ToDouble() const
{
char *end;
return strtod(c_str(), &end);
}
double ToDouble(string::size_type beg_pos,
string::size_type& end_pos) const
{
const char *beg = c_str()+beg_pos;
char *end;
double result;
result = strtod(beg, &end);
end_pos = end-c_str();
return result;
}
// not tested
unsigned countChar(char c)
{
unsigned n = 0;
string::iterator b = begin();
string::iterator e = end();
while (b<e)
{
if (*b == c)
++n;
++b;
}
return n;
}
// not tested
void unquote()
{
removeSpacesBack();
removeSpacesFront();
if (*begin() == '"' && *(end()-1) == '"' )
{
this->erase(begin());
this->erase(end()-1);
}
else if (*begin() == '\'' && *(end()-1) == '\'' )
{
this->erase(begin());
this->erase(end()-1);
}
}
// not tested
void unquote(char s, char e)
{
removeSpacesBack();
removeSpacesFront();
if (*begin() == s && *(end()-1) == e )
{
this->erase(begin());
this->erase(end()-1);
}
}
// get the next token and remove it from the called object.
easystring& get_next_token(easystring &str, const char * ws = "\r\n\t\v\f ")
{
const string::size_type n = size();
string::size_type i = 0;
str.clear();
// eat leading whitespace/deliminators -- this sould be more efficient than
// calling removeSpacesFront since we only need to erase in the front of the string once.
while ( (i < n) && ( isws( (*this)[i], ws)) ) ++i;
if (i == n) // nothing left so re remove alle spaces and the string will be empty
{
erase();
}
else
{
// find end of word
string::size_type j = i+1;
while ( (j < n) && (!isws( (*this)[j], ws)) ) ++j;
str = string(*this, i, j); // the next token
erase(0, j); // remove token from called object
}
return *this;
}
// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
// Be careful with keywords containing punctuations since they do not
// count as upper case chars. The upper case loop stops there.
// Possible bug: I think "I" abbreviates "INPUT" which is not OK.
// Use !isalpha() to as equivalent to "isupper() so we prolong the upper region.
bool is_CAPITALIZED_abbreviation_of(const easystring &keyword)
{
unsigned i = 0;
unsigned max = size();
// max cannot be greater than keyword.size()
if ( max > keyword.size() )
return false;
// We start comparrison in upper case region. Lets see how far we can go.
while (i < max && toupper( (*this)[i] ) == keyword[i] )
{
++i;
}
// First we check whether both strings have the same length and we matched all chars.
// This means that we matched exactly the upper case region of keyword, which is the
// complete keyword.
if ( i == max && max == keyword.size() )
return true;
// If we get here, i must be smaller than keyword.size() since we can only have
// * i == max and i == keyword.size(): Not possible after previous if
// * i < max and i == keyword.size(): Not possible since max <= keyword.size()
// * i == max and i < keyword.size():
// * i < max and i < keyword.size():
// Thus, keyword[i] is valid and we use it to check whether we are in the upper case region
// (were all !alpha() should also be considered as upper case chars in this context!!).
// We simply check whether we are in the !islower() region.
if ( !islower( keyword[i]) ) // We are still in non-lower case region
return false;
// If we get here, we are in the in the lower-case region
// There are two possibilities:
// - i == max,
if (i==max)
return true;
// - our string is longer than the upper case region, so we have to continue
// our comparison.
// Let us move through lower case region
while (i < max && ((*this)[i] == keyword[i] || tolower((*this)[i]) == keyword[i]) )
++i;
// Could we match all letters in str?
if (i==max)
return true;
else
return false;
}
};
}
#endif