Codebase list andi / upstream/0.10
Imported Upstream version 0.10 Andreas Tille 7 years ago
23 changed file(s) with 436 addition(s) and 270 deletion(s). Raw diff Collapse all Expand all
00 language: cpp
1 os:
2 - linux
3 - osx
14 compiler:
2 - gcc
5 - gcc
6 - clang
37 sudo: false
48 addons:
59 apt:
610 sources:
7 - deadsnakes
8 - ubuntu-toolchain-r-test
11 - deadsnakes
12 - ubuntu-toolchain-r-test
913 packages:
10 - cmake
11 - g++-4.8
12 - libglib2.0-dev
13 - libgsl0-dev
14 - cmake
15 - g++-4.8
16 - libglib2.0-dev
17 - libgsl0-dev
1418 install:
1519 - export LIBDIVDIR="$HOME/libdivsufsort"
1620 - pip install --user cpp-coveralls
1923 - cd libdivsufsort-master && mkdir build && cd build
2024 - cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" ..
2125 - make && make install
26 - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install gsl; fi
27 - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install glib; fi
2228 before_install:
23 - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
29 - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
2430
2531 # - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
2632 script:
33 - export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$LIBDIVDIR/lib"
2734 - export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib"
2835 - export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH"
2936 - cd $TRAVIS_BUILD_DIR
3239 - ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
3340 - make
3441 - make check
42 - export MYFLAGS="-I$LIBDIVDIR/include"
43 - ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
3544 - make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\""
3645 - tar xzvf andi-*.tar.gz
3746 - cd andi-*
4049 - make check
4150 - cd ..
4251 after_success:
43 - coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'
52 - if [ "${TRAVIS_OS_NAME}" = "linux" -a "$CXX" = "g++-4.8" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi
66 This readme covers all necessary instructions for the impatient to get `andi` up and running. For extensive instructions please consult the [manual](andi-manual.pdf).
77
88
9 # Build Instructions
9 # Installation and Usage
1010
11 For the latest [stable release](https://github.com/EvolBioInf/andi/releases) of `andi` download the tar ball. If you'd like to contribute to this software, feel free to create a fork of our [git repository](https://github.com/EvolBioInf/andi) and send pull requests.
11 Stable versions of `andi` are available via package managers. For manual installation see below.
12
13 For Debian and Ubuntu (starting 16.04):
14
15 sudo apt-get install andi
16
17 For OS X with Homebrew:
18
19 brew install science/andi
20
21 For ArchLinux with aura:
22
23 sudo aura -A andi
24
25 With a successful installation you can get the usage instructions via `--help` or the man page.
26
27 $ andi --help
28 $ man andi
29
30 You can simply use `andi` with your genomes in `FASTA` format.
31
32 $ andi S1.fasta S2.fasta
33 2
34 S1 0.0 0.1
35 s2 0.1 0.0
36
37 From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description.
1238
1339
14 ## Prerequisites
40 ## Manual installation
1541
16 This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
42 If your system does not support one of the above package managers you have to manually build the latest [stable release](https://github.com/EvolBioInf/andi/releases) from a tarball. See the [manual](andi-manual.pdf) for extensive building instructions.
1743
18
19 ## Compiling
44 This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools.
2045
2146 Assuming you have installed all prerequisites, building is as easy as follows.
2247
23 $ ./configure
24 $ make
25 $ make install
48 $ autoreconf -fi -Im4 # optional when build from tarball
49 $ ./configure
50 $ make
51 $ make install
2652
27 Excessive build instructions are located in `INSTALL`. If the build was successful you can get the usage instructions via `--help` or the man page.
28
29 $ andi --help
30 $ man andi
31
32 You can simply use `andi` with your genomes in `FASTA` format.
33
34 $ andi S1.fasta S2.fasta
35 2
36 S1 0.0 0.1
37 s2 0.1 0.0
38
39 From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description.
40
53 Excessive build instructions are located in `INSTALL`.
4154
4255 # Links and Additional Resources
4356
5366
5467 ## License
5568
56 Copyright © 2014, 2015 Fabian Klötzl
69 Copyright © 2014 - 2016 Fabian Klötzl
5770 License GPLv3+: GNU GPL version 3 or later.
5871
5972 This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. The full license text is available at <http://gnu.org/licenses/gpl.html>.
Binary diff not shown
0 AC_INIT([andi], [0.9.6.1])
0 AC_INIT([andi], [0.10])
11 AM_INIT_AUTOMAKE([-Wall foreign ])
22
33 AC_CONFIG_MACRO_DIR([m4])
3131
3232 AS_IF([test "x$with_libdivsufsort" != "xno"],
3333 [
34 # The libdivsufsort header contains some Microsoft extension making
35 # compilation fail on certain systems (i.e. OS X). Add the following
36 # flag so the build runs smoothly.
37 CPPFLAGS="$CPPFLAGS -fms-extensions"
3438 AC_CHECK_HEADERS([divsufsort.h],[have_libdivsufsort=yes],[have_libdivsufsort=no])
3539 AC_CHECK_LIB(divsufsort, divsufsort, [], [have_libdivsufsort=no])
3640 ],
0 .TH ANDI "1" "June 2015" "@VERSION@" ""
0 .TH ANDI "1" "2016-03-05" "@VERSION@" "andi manual"
11 .SH NAME
22 andi \- estimates evolutionary distance
33 .SH SYNOPSIS
1010 The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse (K>0.5) for our method to work properly.
1111 .SH OPTIONS
1212 .TP
13 \fB\-b, \-\-bootstrap\fR <INT>
13 \fB\-b\fR, \fB\-\-bootstrap\fR <INT>
1414 Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016, in review) for a detailed explanation.
1515 .TP
1616 \fB\-j\fR, \fB\-\-join\fR
2525 \fB\-p\fR <FLOAT>
2626 Significance of an anchor pair; default: 0.05.
2727 .TP
28 \fB\-t\fR <INT>
28 \fB\-t\fR, \fB\-\-threads\fR <INT>
2929 The number of threads to be used; by default, all available processors are used.
3030 .br
3131 Multithreading is only available if \fBandi\fR was compiled with OpenMP support.
3939 \fB\-\-version\fR
4040 Outputs version information and acknowledgments.
4141 .SH COPYRIGHT
42 Copyright \(co 2014, 2015 Fabian Klötzl
42 Copyright \(co 2014 - 2016 Fabian Klötzl
4343 License GPLv3+: GNU GPL version 3 or later.
4444 .br
4545 This is free software: you are free to change and redistribute it.
1414 \usepackage{amsthm}
1515 \usepackage{acronym}
1616 \usepackage{amssymb}
17 \usepackage{tikz}
18 \usepackage{pgfplots}
1917 \usepackage{caption}
2018 \usepackage{subcaption}
21 \usepackage{booktabs}
2219 \usepackage{xspace}
23 \usepackage{overpic}
2420
2521 \bibliographystyle{alpha}
2622
108104
109105 The easiest way to install \andi is via a package manager. This also handles all dependencies for you.
110106
107
108 \noindent Debian and Ubuntu (since 16.04):
109
110 \begin{lstlisting}
111 ~ % sudo apt-get install andi
112 \end{lstlisting}
113
111114 \noindent OS X with homebrew:
112115
113116 \begin{lstlisting}
118121
119122 \begin{lstlisting}
120123 ~ % aura -A andi
121 \end{lstlisting}
122
123 \noindent Debian and Ubuntu (soon):
124
125 \begin{lstlisting}
126 ~ % sudo apt-get install andi
127124 \end{lstlisting}
128125
129126 \andi is intended to run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}).
188185 \begin{lstlisting}
189186 ~ % git clone git@github.com:EvolBioInf/andi.git
190187 ~ % cd andi
191 ~/andi % autoreconf -i
192 \end{lstlisting}
193
194 \noindent For old versions of \algo{autoconf} you may instead have to use \lstinline$autoreconf -i -Im4$.
188 ~/andi % autoreconf -fi -Im4
189 \end{lstlisting}
195190
196191 \noindent Continue with the \algo{Gnu} trinity as described in Section~\ref{sub:regular}.
197192
268263
269264 By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. Other evolutionary models are also implemented (Kimura, raw). The \lstinline$--model$ parameter can be used to switch between them.
270265
271 Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}.
266 Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}. For more information on computing support values from distance matrices see \cite{afra}.
272267
273268 \begin{lstlisting}
274269 ~ % andi -b 2 ../test/1M.1.fasta
332327 \end{figure}
333328
334329
330 \chapter{Warnings and Errors}
331
332 Here be an explanation of all possible errors. Other errors may occur and are due to the failure of underlying functions (e.\,g.~\lstinline$read(3)$). All warning messages are printed to \lstinline$stderr$. Most errors are non-recoverable and will result in \andi exiting with a non-zero state.
333
334 \section{Sequence Related Messages}
335
336 \subsection*{Unexpected Character}
337
338 \andi is pretty pedantic about the formatting of \algo{FASTA} files. If you violate the syntax, \andi will print the file name, the line and the problematic character. These errors are non-recovering, meaning no further sequences are read from the invalid file. The checks are implemented by the \href{https://github.com/kloetzl/pfasta}{\algo{pfasta}} library.
339
340
341 \subsection*{Non acgtACGT Nucleotides Stripped}
342
343 Our models of genome evolution (JC, Kimura) only work on the four canonical types of nucleotides. All others are stripped from the sequences. This can be ignored in most cases.
344
345 \subsection*{Too Short Sequence}
346
347 \andi was designed for big data sets of whole genomes. On short sequences the distance estimates are inaccurate. Use an multiple sequence alignment instead.
348
349 \subsection*{Too Long Sequence}
350
351 \algo{libdivsufsort} limits the length of a sequence to 31 bits. That count includes the reverse complement. So the technical limit for a sequence analysis is $2^{30} = 1.073.741.824$. Unfortunately, that excludes (full) human and mice genomes. Per-chromosome analysis works just fine.
352
353 \subsection*{Empty Sequence}
354
355 One of the given sequences contained either no nucleotides at all, or only non-canonical ones.
356
357 \subsection*{Less than two sequences given}
358
359 As \andi tries to compare sequences, at least two need to be supplied. Note that \andi may have regarded some of your given sequences as unusable.
360
361 \subsection*{Maximum Number of Sequences}
362
363 The maximum number of sequences \andi can possible compare is huge (roughly $457.845.052$). I doubt anyone will ever reach that limit. Please send me a mail, if you do.
364
365 \section{Technical Messages}
366
367 \subsection*{Out of Memory}
368
369 If \andi runs out of memory, it gives up. Either free memory, run \andi on a bigger machine, try the \lstinline$--low-memory$ mode or reduce the number of threads.
370
371 \subsection*{RNG allocation}
372
373 Some technical thing failed. If it keeps failing repeatedly, file a bug.
374
375 \subsection*{Bootstrapping failed}
376
377 This should not happen.
378
379 \subsection*{Failed index creation}
380
381 This should not happen, either.
382
383 \subsection*{Skipped and ignored Arguments}
384
385 Some command line parameters of \andi require arguments. If these are not of the expected type, a warning is given. See Section~\ref{sec:options} for their correct usage.
386
387
388 \section{Output-related Warnings}
389
390 As the input sequences get more evolutionary divergent, \andi finds less anchors. With less anchors, less nucleotides are cosidered homologous between two sequences. If no anchors are found comparison fails and \lstinline!nan! is printed instead. See out paper and especiallt Figure~2 for details.
391
392 \subsection*{NaN}
393
394 No anchors were found. Your sequences are very divergent ($d>0.5$) or sprout a lot of indels that make comparison difficult.
395
396 \subsection*{Little Homology}
397
398 Very few anchors were found and thus only a tiny part of the sequences is considered homologous. Expect that the given distance is errorneous.
335399
336400
337401 \chapter{DevOps} %%%%%
375439 ~/andi % make check
376440 \end{lstlisting}
377441
378 \noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 80\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}}
442 \noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 75\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}} Unfortunately, coveralls is broken at this point in time.
379443
380444 \section{Known Issues}
381445
382446 These minor issues are known. I intend to fix them, when I have time.
383447
384448 \begin{enumerate}
385 \item This code will not work under Windows. At two places Unix-only code is used: filepath-seperators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
449 \item This code will not work under Windows. At two places Unix-only code is used: filepath-separators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
450 \item Only the first 10 characters are printed in the output matrix. This can make long gi numbers indistinguishable. In version 0.10 we started printing a warning for this.
451 \item Unit tests for the bootstrapped matrices are missing.
452 \item Cached intervals are sometimes not “as deep as they could be”. If that got fixed \lstinline$get_match_cache$ could bail out on \lstinline$ij.lcp < CACHE_LENGTH$. However the \lstinline$esa_init_cache$ code is the most fragile part and should be handled with care.
386453 \end{enumerate}
387454
388455
392459
393460 %\subsection{Preparing a new Release}
394461
395 Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This assures that there is only one commit and build with that specific version. Now return to the previous commit \lstinline$git checkout vX.y$.
462 Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Tags should be annotated and signed, if possible. This manual then needs manual rebuilding.
396463
397464 Ensure that \andi is ready for packaging with \algo{autoconf}.
398465
413480 =================================================
414481 \end{lstlisting}
415482
416 If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}.
417
418
483 If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}.
484
485 Also verify that the recent changes did not create a performance regression. This includes testing both ends of the scale: \eco and \pneu. Both should be reasonable close to previous releases.
486
487 Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This assures that there is only one commit and build with that specific version.
419488
420489 \backmatter
421490 %\addcontentsline{toc}{chapter}{Bibliography}
4444 Author = {Chris Lattner and Vikram Adve},
4545 Title = {{LLVM}: A Compilation Framework for Lifelong Program
4646 Analysis and Transformation},
47 Booktitle = "Code Generation and Optimizatio",
47 Booktitle = "Code Generation and Optimization",
4848 Month = {Mar},
4949 Year = {2004},
5050 pages = {75--88},
108108 title={Efficient Estimation of Evolutionary Distances}
109109 }
110110
111 @article{afra,
112 AUTHOR = {Klötzl, Fabian and Haubold, Bernhard},
113 TITLE = {Support Values for Genome Phylogenies},
114 JOURNAL = {Life},
115 VOLUME = {6},
116 YEAR = {2016},
117 NUMBER = {1},
118 PAGES = {11},
119 URL = {http://www.mdpi.com/2075-1729/6/1/11},
120 ISSN = {2075-1729},
121 DOI = {10.3390/life6010011}
122 }
111123
124
125
00 /* @brief Here follows a simple implementation of the GNU function `strchrnul`.
11 * Please check the gnulib manual for details.
22 */
3 #include <string.h>
4
35 char *strchrnul(const char *s, int c){
46 char *p = strchr(s,c);
57
0 #!/usr/bin/zsh
1
2 # Compute the number of failing comparisons for different distances.
3
4 DISTS=(0.1 0.2 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7)
5
6 LENGTH=100000
7
8 for dist in $DISTS; do
9 echo "" > est_$dist.dist
10 for (( i = 0; i < 1000; i++ )); do
11 ../test/test_fasta -l $LENGTH -d $dist > temp.fa
12 ../src/andi ./temp.fa > est.dist 2> /dev/null
13 tail -n 1 est.dist >> est_$dist.dist
14 done
15 avg=$(cat est_$dist.dist | awk '"nan" !~ $2 {sum+=$2;c++}END{print sum/c}')
16 sd=$(grep -v 'nan' est_$dist.dist | awk '{a[c++]=$2;aa+=$2}END{aa/=NR;for(c=0;c<NR;c++){t=a[c]-aa;sd+=t*t}print sqrt(sd/(NR-1))}')
17 failed=$(cat est_$dist.dist | grep -c 'nan')
18 echo $dist "\t" $avg"\t±" $sd "\t" $failed
19 done
1515
1616 .PHONY: perf
1717 perf: CFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
18 perf: CXXFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
1819 perf: andi
2424 #include <assert.h>
2525 #include <errno.h>
2626 #include <getopt.h>
27 #include <limits.h>
2728 #include <stdio.h>
2829 #include <stdlib.h>
2930 #include <string.h>
5455 *
5556 * The main function reads and parses the commandline arguments. Depending on
5657 * the set flags it reads the input files and forwards the contained sequences
57 * to processing.
58 * to processing. Also it verifies the input for correctness and issues warnings
59 * and errors.
5860 */
5961 int main(int argc, char *argv[]) {
6062 int c;
133135
134136 if (threads > (long unsigned int)omp_get_num_procs()) {
135137 warnx(
136 "The number of threads to be used, is greater then the "
138 "The number of threads to be used, is greater than the "
137139 "number of available processors; Ignoring -t %lu "
138140 "argument.",
139141 threads);
173175 } else if (strcasecmp(optarg, "KIMURA") == 0) {
174176 MODEL = M_KIMURA;
175177 } else {
176 warnx("Ignoring argument for --model. Expected Raw JC or Kimura");
178 warnx("Ignoring argument for --model. Expected Raw, JC or "
179 "Kimura");
177180 }
178181 break;
179182 }
195198 }
196199
197200 dsa_t dsa;
198 if (dsa_init(&dsa)) {
199 errx(errno, "Out of memory.");
200 }
201 dsa_init(&dsa);
201202
202203 const char *file_name;
203204
222223
223224 size_t n = dsa_size(&dsa);
224225
226 if (n < 2) {
227 errx(1,
228 "I am truly sorry, but with less than two sequences (%zu given) "
229 "there is nothing to compare.",
230 n);
231 }
232
225233 if (FLAGS & F_VERBOSE) {
226234 fprintf(stderr, "Comparing %zu sequences\n", n);
227235 fflush(stderr);
235243 // seed the random number generator with the current time
236244 gsl_rng_set(RNG, time(NULL));
237245
238 seq_t *sequences = dsa_data(&dsa);
246 // Warn about non ACGT residues.
247 if (FLAGS & F_NON_ACGT) {
248 warnx("The input sequences contained characters other than acgtACGT. "
249 "These were automatically stripped to ensure correct results.");
250 }
251
252 // validate sequence correctness
253 const seq_t *seq = dsa_data(&dsa);
254 for (size_t i = 0; i < n; ++i, seq++) {
255 if (strlen(seq->name) > 10) {
256 warnx("The sequence name '%s' is longer than ten characters. It "
257 "will be truncated in the output to '%.10s'.",
258 seq->name, seq->name);
259 }
260
261 const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2;
262 if (seq->len > LENGTH_LIMIT) {
263 errx(1, "The sequence %s is too long. The technical limit is %zu.",
264 seq->name, LENGTH_LIMIT);
265 }
266
267 if (seq->len == 0) {
268 errx(1, "The sequence %s is empty.", seq->name);
269 }
270
271 if (seq->len < 1000) {
272 FLAGS |= F_SHORT;
273 }
274 }
275
276 if (FLAGS & F_SHORT) {
277 warnx("One of the given input sequences is shorter than a thousand "
278 "nucleotides. This may result in inaccurate distances. Try an "
279 "alignment instead.");
280 }
281
239282 // compute distance matrix
240 if (n >= 2) {
241 calculate_distances(sequences, n);
242 } else {
243 warnx("I am truly sorry, but with less than two sequences (%zu given) "
244 "there is nothing to compare.",
245 n);
246 }
283 calculate_distances(dsa_data(&dsa), n);
247284
248285 dsa_free(&dsa);
249286 gsl_rng_free(RNG);
288325 void version(void) {
289326 const char str[] = {
290327 "andi " VERSION "\n"
291 "Copyright (C) 2014, 2015 Fabian Klötzl\n"
328 "Copyright (C) 2014 - 2016 Fabian Klötzl\n"
292329 "License GPLv3+: GNU GPL version 3 or later "
293330 "<http://gnu.org/licenses/gpl.html>\n"
294331 "This is free software: you are free to change and redistribute it.\n"
301338 "Sequence Analysis, Genome Rearrangements, and Phylogenetic "
302339 "Reconstruction. pp 118f.\n"
303340 "3) SA construction: Mori, Y. (2005). Short description of improved "
304 "two-stage suffix sorting alorithm. "
341 "two-stage suffix sorting algorithm. "
305342 "http://homepage3.nifty.com/wpage/software/itssort.txt\n"};
306343 printf("%s", str);
307344 exit(EXIT_SUCCESS);
2828 * @param n - The number of sequences
2929 * @param M - A matrix for additional output data
3030 */
31 void NAME(struct model *M, seq_t *sequences, size_t n) {
31 void NAME(struct model *M, const seq_t *sequences, size_t n) {
3232 size_t i;
3333
3434 //#pragma
3535 P_OUTER
3636 for (i = 0; i < n; i++) {
37 seq_t *subject = &sequences[i];
37 seq_subject subject;
3838 esa_s E;
3939
40 if (seq_subject_init(subject) || esa_init(&E, subject)) {
41 errx(1, "Failed to create index for %s.", subject->name);
40 if (seq_subject_init(&subject, &sequences[i]) ||
41 esa_init(&E, &subject)) {
42 errx(1, "Failed to create index for %s.", sequences[i].name);
4243 }
4344
4445 // now compare every other sequence to i
5960
6061 size_t ql = sequences[j].len;
6162
62 M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject->gc);
63 M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.gc);
6364 }
6465
6566 esa_free(&E);
66 seq_subject_free(subject);
67 seq_subject_free(&subject);
6768 }
6869 }
1616 #include <string.h>
1717 #include <assert.h>
1818 #include "esa.h"
19 #include "global.h"
1920
2021 static void esa_init_cache_dfs(esa_s *, char *str, size_t pos, lcp_inter_t in);
2122 static void esa_init_cache_fill(esa_s *, char *str, size_t pos, lcp_inter_t in);
6970 * @returns 0 iff successful
7071 */
7172 int esa_init_cache(esa_s *self) {
72 lcp_inter_t *cache =
73 malloc((1 << (2 * CACHE_LENGTH)) * sizeof(lcp_inter_t));
74
75 if (!cache) {
76 return 1;
77 }
73 lcp_inter_t *cache = malloc((1 << (2 * CACHE_LENGTH)) * sizeof(*cache));
74 CHECK_MALLOC(cache);
7875
7976 self->cache = cache;
8077
127124 continue;
128125 }
129126
127 if (ij.l <= (ssize_t)(pos + 1)) {
128 // Continue one level deeper
129 // This is the usual case
130 esa_init_cache_dfs(C, str, pos + 1, ij);
131 continue;
132 }
133
130134 // The LCP-interval is deeper than expected
131 if (ij.l > (ssize_t)(pos + 1)) {
132
133 // Check if it still fits into the cache
134 if ((size_t)ij.l < CACHE_LENGTH) {
135
136 // fill with dummy value
137 esa_init_cache_fill(C, str, pos + 1, in);
138
139 char non_acgt = 0;
140
141 // fast forward
142 size_t k = pos + 1;
143 for (; k < (size_t)ij.l; k++) {
144 // In some very edgy edge cases the lcp-interval `ij`
145 // contains a `;` or another non-acgt character. Since we
146 // cannot cache those, break.
147 char c = C->S[C->SA[ij.i] + k];
148 if (char2code(c) < 0) {
149 non_acgt = 1;
150 break;
151 }
152
153 str[k] = c;
154 }
155
156 if (non_acgt) {
157 esa_init_cache_fill(C, str, k, ij);
158 } else {
159 esa_init_cache_dfs(C, str, k, ij);
160 }
161
162 continue;
163 }
164
135 // Check if it still fits into the cache
136 if ((size_t)ij.l >= CACHE_LENGTH) {
165137 // If the lcp-interval exceeds the cache depth, stop here and fill
166138 esa_init_cache_fill(C, str, pos + 1, in);
167139 continue;
168140 }
169141
170 // Continue one level deeper
171 esa_init_cache_dfs(C, str, pos + 1, ij);
142 /* At this point the prefix `str` of length `pos` has been found.
143 * However, the call to `getInterval` above found an interval with
144 * an LCP value bigger than `pos`. This means that not all elongations
145 * (more precise: just one) of `str` appear in the subject. Thus fill
146 * all values with the matched result to far and continue only with
147 * the one special substring.
148 */
149 esa_init_cache_fill(C, str, pos + 1, in);
150
151 char non_acgt = 0;
152
153 // fast forward
154 size_t k = pos + 1;
155 for (; k < (size_t)ij.l; k++) {
156 // In some very edgy edge cases the lcp-interval `ij`
157 // contains a `;` or another non-acgt character. Since we
158 // cannot cache those, break.
159 char c = C->S[C->SA[ij.i] + k];
160 if (char2code(c) < 0) {
161 non_acgt = 1;
162 break;
163 }
164
165 str[k] = c;
166 }
167
168 if (non_acgt) {
169 esa_init_cache_fill(C, str, k, ij);
170 } else {
171 esa_init_cache_dfs(C, str, k, ij);
172 }
172173 }
173174 }
174175
215216 size_t len = self->len;
216217
217218 char *FVC = self->FVC = malloc(len);
218 if (!FVC) {
219 return 1;
220 }
219 CHECK_MALLOC(FVC);
221220
222221 const char *S = self->S;
223222 const int *SA = self->SA;
238237 * @param S - The sequence
239238 * @returns 0 iff successful
240239 */
241 int esa_init(esa_s *C, const seq_t *S) {
242 if (!C || !S || !S->S) return 1;
240 int esa_init(esa_s *C, const seq_subject *S) {
241 if (!C || !S || !S->RS) return 1;
243242
244243 *C = (esa_s){.S = S->RS, .len = S->RSlen};
245244
284283 return 1;
285284 }
286285
287 C->SA = malloc(C->len * sizeof(saidx_t));
288 if (!C->SA) {
289 return 2;
290 }
286 C->SA = malloc(C->len * sizeof(*C->SA));
287 CHECK_MALLOC(C->SA);
291288
292289 saidx_t result = 1;
293290
310307 if (!C || !C->LCP) {
311308 return 1;
312309 }
313 saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(saidx_t));
314 if (!C->CLD) {
315 return 2;
316 }
310 saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(*CLD));
311 CHECK_MALLOC(CLD);
317312
318313 saidx_t *LCP = C->LCP;
319314
320315 typedef struct pair_s { saidx_t idx, lcp; } pair_t;
321316
322 pair_t *stack = malloc((C->len + 1) * sizeof(pair_t));
317 pair_t *stack = malloc((C->len + 1) * sizeof(*stack));
318 CHECK_MALLOC(stack);
323319 pair_t *top = stack; // points at the topmost filled element
324320 pair_t last;
325321
378374
379375 // Allocate new memory
380376 // The LCP array is one element longer than S.
381 saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(saidx_t));
382 if (!LCP) {
383 return 3;
384 }
377 saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(*LCP));
378 CHECK_MALLOC(LCP);
385379
386380 LCP[0] = -1;
387381 LCP[len] = -1;
388382
389383 // Allocate temporary arrays
390 saidx_t *PHI = malloc(len * sizeof(saidx_t));
384 saidx_t *PHI = malloc(len * sizeof(*PHI));
391385 saidx_t *PLCP = PHI;
392 if (!PHI) {
393 free(PHI);
394 return 2;
395 }
386 CHECK_MALLOC(PHI);
396387
397388 PHI[SA[0]] = -1;
398389 saidx_t k;
55 #ifndef _ESA_H_
66 #define _ESA_H_
77
8 #include <sys/types.h>
89 #include "sequence.h"
910 #include "config.h"
1011
6768
6869 lcp_inter_t get_match_cached(const esa_s *, const char *query, size_t qlen);
6970 lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen);
70 int esa_init(esa_s *, const seq_t *S);
71 int esa_init(esa_s *, const seq_subject *S);
7172 void esa_free(esa_s *);
7273
7374 #ifdef DEBUG
88 #define _GLOBAL_H_
99 #include <gsl/gsl_rng.h>
1010
11 #include <err.h>
1112 #include "config.h"
1213
1314 /**
6162 F_SHORT = 64
6263 };
6364
65 /**
66 * @brief This macro is used to unify the checks for the return value of malloc.
67 *
68 * @param PTR - The pointer getting checked.
69 */
70 #define CHECK_MALLOC(PTR) \
71 do { \
72 if (PTR == NULL) { \
73 err(errno, "Out of memory"); \
74 } \
75 } while (0);
76
6477 #endif
44 #define _GNU_SOURCE
55 #include <fcntl.h>
66 #include <math.h>
7 #include <limits.h>
78 #include <stdio.h>
89 #include <string.h>
910 #include <unistd.h>
2324 * "I didn't learn joined up handwriting for nothing, you know."
2425 * ~ Gilderoy Lockhart
2526 *
26 * @param in - The file pointer to read from.
27 * @param dsa - An array that holds found sequences.
28 * @param name - The name of the file to be used for the name of the sequence.
27 * @param file_name - The name of the file to be used for reading. The name is
28 * also used to infer the sequence name.
29 * @param dsa - (output parameter) An array that holds found sequences.
2930 */
3031 void read_fasta_join(const char *file_name, dsa_t *dsa) {
3132 if (!file_name || !dsa) return;
4647 */
4748
4849 const char *left = strrchr(file_name, '/'); // find the last path separator
49 left = (left == NULL) ? file_name : left + 1; // left is the position one of
50 // to the right of the path
51 // separator
50 left = (left == NULL) ? file_name : left + 1;
51 // left is the position one of to the right of the path separator
5252
5353 const char *dot = strchrnul(left, '.'); // find the extension
5454
55 joined.name = strndup(
56 left, dot - left); // copy only the file name, not its path or extension
55 // copy only the file name, not its path or extension
56 joined.name = strndup(left, dot - left);
57 CHECK_MALLOC(joined.name);
58
5759 dsa_push(dsa, joined);
5860 dsa_free(&single);
5961 }
6062
6163 /**
6264 * @brief This function reads sequences from a file.
63 * @param in - The file pointer to read from.
64 * @param dsa - An array that holds found sequences.
65 * @param file_name - The file to read.
66 * @param dsa - (output parameter) An array that holds found sequences.
6567 */
6668 void read_fasta(const char *file_name, dsa_t *dsa) {
6769 if (!file_name || !dsa) return;
123125 int use_scientific = 0;
124126
125127 double *DD = malloc(n * n * sizeof(*DD));
126 if (!DD) err(errno, "meh.");
128 CHECK_MALLOC(DD);
127129
128130 #define DD(X, Y) (DD[(X)*n + (Y)])
129131
132134
133135 switch (MODEL) {
134136 case M_RAW: estimate = &estimate_RAW; break;
137 default:
138 /* intentional fall-through. This is just here to silence any
139 * compiler warnings. The real default is set in andi.c.*/
135140 case M_JC: estimate = &estimate_JC; break;
136141 case M_KIMURA: estimate = &estimate_KIMURA; break;
137142 }
167172
168173 printf("%zu\n", n);
169174 for (i = 0; i < n; i++) {
170 // Print exactly nine characters of the name. Pad with spaces if
175 // Print exactly ten characters of the name. Pad with spaces if
171176 // necessary.
172 printf("%-9.9s", sequences[i].name);
177 printf("%-10.10s", sequences[i].name);
173178
174179 for (j = 0; j < n; j++) {
175180 // use scientific notation for small numbers
157157 * @brief Given an anchor, classify nucleotides.
158158 *
159159 * For anchors we already know that the nucleotides of the subject and the query
160 * are equal. Thus only one sequence has to be analysed. See `model_count` for
161 * an explanation of the algorithm.
160 * are equal. Thus only one sequence has to be analysed. Most models don't
161 * actually care about the individual nucleotides as long as they are equal in
162 * the two sequences. For these models, we just assume equal distribution.
162163 *
163164 * @param MM - The mutation matrix
164165 * @param S - The subject
165166 * @param len - The anchor length
166167 */
167168 void model_count_equal(model *MM, const char *S, size_t len) {
169 if (MODEL == M_RAW || MODEL == M_JC || MODEL == M_KIMURA) {
170 size_t fourth = len / 4;
171 MM->counts[AtoA] += fourth;
172 MM->counts[CtoC] += fourth;
173 MM->counts[GtoG] += fourth;
174 MM->counts[TtoT] += fourth + (len & 3);
175 return;
176 }
177
178 // Fall-back algorithm for future models. Note, as this works on a
179 // per-character basis it is slow.
180
168181 size_t local_counts[4] = {0};
169182
170183 for (; len--;) {
171184 char s = *S++;
172185
186 // ';!#' are all smaller than 'A'
173187 if (s < 'A') {
188 // Technically, s can only be ';' at this point.
174189 continue;
175190 }
176191
177 unsigned char nibble_s = s & 7;
178
179 static const unsigned int mm1 = 0x20031000;
180
181 unsigned char index = (mm1 >> (4 * nibble_s)) & 0x3;
182 local_counts[index]++;
192 // The four canonical nucleotides can be uniquely identified by the bits
193 // 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed.
194 local_counts[(s >> 1) & 3]++;
183195 }
184196
185197 MM->counts[AtoA] += local_counts[0];
186198 MM->counts[CtoC] += local_counts[1];
187 MM->counts[GtoG] += local_counts[2];
188 MM->counts[TtoT] += local_counts[3];
199 MM->counts[GtoG] += local_counts[3];
200 MM->counts[TtoT] += local_counts[2];
189201 }
190202
191203 /**
33 * @file
44 * @brief This file contains various distance methods.
55 */
6 #include <assert.h>
7 #include <math.h>
8 #include <stdint.h>
9 #include <stdio.h>
610 #include <stdlib.h>
711 #include <string.h>
8 #include <assert.h>
9 #include <math.h>
10 #include <stdio.h>
1112 #include "esa.h"
1213 #include "global.h"
1314 #include "io.h"
1516 #include "process.h"
1617 #include "sequence.h"
1718
18 #include <time.h>
1919 #include <gsl/gsl_rng.h>
2020 #include <gsl/gsl_randist.h>
2121
5353 /**
5454 * @brief Calculates the binomial coefficient of n and k.
5555 *
56 * We used to use gsl_sf_lnchoose(xx,kk) for this functionality.
57 * After all, why implement something that has already been done?
58 * Well, the reason is simplicity: GSL is used for only this one
59 * function and the input (n<=20) is not even considered big.
60 * Hence its much easier to have our own implementation and ditch
61 * the GSL dependency even if that means our code is a tiny bit
62 * less optimized and slower.
56 * We could (and probalby should) use gsl_sf_lnchoose(xx,kk) for this.
6357 *
6458 * @param n - The n part of the binomial coefficient.
6559 * @param k - analog.
206200 model_count_equal(&ret, query + last_pos_Q, last_length);
207201
208202 // Count the SNPs in between.
209 model_count(&ret, C->S + last_pos_S + last_length, query + last_pos_Q + last_length,
203 model_count(&ret, C->S + last_pos_S + last_length,
204 query + last_pos_Q + last_length,
210205 this_pos_Q - last_pos_Q - last_length);
211206 last_was_right_anchor = 1;
212207 } else {
221216 if (last_was_right_anchor) {
222217 // If the last was a right anchor, but with the current one,
223218 // we cannot extend, then add its length.
224 model_count_equal(&ret, C->S + last_pos_S, last_length);
219 model_count_equal(&ret, query + last_pos_Q, last_length);
225220 } else if ((last_length / 2) >= threshold) {
226221 // The last anchor wasn't neither a left or right anchor.
227222 // But, it was as long as an anchor pair. So still count it.
228 model_count_equal(&ret, C->S + last_pos_S, last_length);
223 model_count_equal(&ret, query + last_pos_Q, last_length);
229224 }
230225
231226 last_was_right_anchor = 0;
302297 * @param sequences - An array of pointers to the sequences.
303298 * @param n - The number of sequences.
304299 */
305 void calculate_distances(seq_t *sequences, int n) {
306 int i;
307
308 // check the sequences
309 for (i = 0; i < n; i++) {
310 if (sequences[i].S == NULL || sequences[i].len == 0) {
311 errx(1, "Missing sequence: %s", sequences[i].name);
312 }
313
314 if (sequences[i].len < 1000) {
315 FLAGS |= F_SHORT;
316 }
317 }
318
319 if (FLAGS & F_SHORT) {
320 warnx("One of the given input sequences is shorter than a thousand "
321 "nucleotides. This may result in inaccurate distances. Try an "
322 "alignment instead.");
323 }
324
325 // Warn about non ACGT residues.
326 if (FLAGS & F_NON_ACGT) {
327 warnx("The input sequences contained characters other than acgtACGT. "
328 "These were automatically stripped to ensure correct results.");
329 }
330
331 model *M = malloc(n * n * sizeof(*M));
300 void calculate_distances(seq_t *sequences, size_t n) {
301 struct model *M = NULL;
302
303 // The maximum number of sequences is near 457'845'052.
304 size_t intermediate = SIZE_MAX / sizeof(*M) / n;
305 if (intermediate < n) {
306 size_t root = (size_t)sqrt(SIZE_MAX / sizeof(*M));
307 err(1, "Comparison is limited to %zu sequences (%zu given).", root, n);
308 }
309
310 M = malloc(n * n * sizeof(*M));
332311 if (!M) {
333312 err(errno, "Could not allocate enough memory for the comparison "
334313 "matrix. Try using --join or --low-memory.");
385364
386365 // B is the new bootstrap matrix
387366 struct model *B = malloc(n * n * sizeof(*B));
388 if (!B) return 2;
367 CHECK_MALLOC(B);
389368
390369 // Compute a number of new distance matrices
391370 while (BOOTSTRAP--) {
77
88 #include "sequence.h"
99
10 void calculate_distances(seq_t *sequences, int n);
10 void calculate_distances(seq_t *sequences, size_t n);
1111
1212 #endif
1818 int dsa_init(dsa_t *A) {
1919 // allocate at least 4 slots so the growth by 1.5 below doesn't get stuck
2020 // at 3 slots.
21 A->data = malloc(sizeof(seq_t) * 4);
22 if (!A->data) {
23 return 1;
24 }
21 A->data = malloc(sizeof(*A->data) * 4);
22 CHECK_MALLOC(A->data);
2523
2624 A->capacity = 4;
2725 A->size = 0;
3533 } else {
3634 // use the near-optimal growth factor of 1.5
3735 seq_t *ptr = reallocarray(A->data, A->capacity / 2, sizeof(seq_t) * 3);
38 if (ptr == NULL) {
39 err(errno, "out of memory?");
40 }
36 CHECK_MALLOC(ptr);
4137
4238 A->capacity = (A->capacity / 2) * 3;
4339 A->data = ptr;
10096
10197 // A single malloc for the whole new sequence
10298 char *ptr = malloc(total);
103 if (ptr == NULL) {
104 return joined;
105 }
99 CHECK_MALLOC(ptr);
106100 char *next = ptr;
107101
108102 // Copy all old sequences and add a `!` in between
132126 */
133127 void seq_free(seq_t *S) {
134128 free(S->S);
135 free(S->RS);
136129 free(S->name);
137130 *S = (seq_t){};
138131 }
144137 * @return The reverse complement. The caller has to free it!
145138 */
146139 char *revcomp(const char *str, size_t len) {
140 if (!str) return NULL;
147141 char *rev = malloc(len + 1);
148 if (!str || !rev) return NULL;
142 CHECK_MALLOC(rev);
149143
150144 char *r = rev;
151145 const char *s = &str[len - 1];
182176 char *rev = revcomp(s, len);
183177
184178 char *temp = realloc(rev, 2 * len + 2);
185 if (!temp) {
186 free(rev);
187 return NULL;
188 }
179 CHECK_MALLOC(temp);
189180
190181 rev = temp;
191182 rev[len] = '#';
214205 }
215206
216207 /** @brief Prepares a sequences to be used as the subject in a comparison. */
217 int seq_subject_init(seq_t *S) {
218 S->gc = calc_gc(S);
219 S->RS = catcomp(S->S, S->len);
208 int seq_subject_init(seq_subject *S, const seq_t *base) {
209 S->gc = calc_gc(base);
210 S->RS = catcomp(base->S, base->len);
220211 if (!S->RS) return 1;
221 S->RSlen = 2 * S->len + 1;
212 S->RSlen = 2 * base->len + 1;
222213 return 0;
223214 }
224215
225216 /** @brief Frees some memory unused for when a sequence is only used as query.
226217 */
227 void seq_subject_free(seq_t *S) {
218 void seq_subject_free(seq_subject *S) {
228219 free(S->RS);
229220 S->RS = NULL;
230221 S->RSlen = 0;
242233
243234 *S = (seq_t){.S = strdup(seq), .name = strdup(name)};
244235
245 if (!S->S || !S->name) {
246 seq_free(S);
247 return 2;
248 }
236 CHECK_MALLOC(S->S);
237 CHECK_MALLOC(S->name);
249238
250239 normalize(S);
251240
252241 // recalculate the length because `normalize` might have stripped some
253242 // characters.
254243 S->len = strlen(S->S);
255
256 const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2;
257 if (S->len > LENGTH_LIMIT) {
258 warnx("The input sequence %s is too long. The technical limit is %zu.",
259 S->name, LENGTH_LIMIT);
260 return 3;
261 }
262244
263245 return 0;
264246 }
1717 typedef struct seq_s {
1818 /** This is the DNAs forward strand as a string. */
1919 char *S;
20 /** The length of the forward strand. */
21 size_t len;
22 /** A name for this sequence */
23 char *name;
24 } seq_t;
25
26 /**
27 * @brief This structure enhances the usual sequence with its reverse
28 * complement.
29 */
30 typedef struct seq_subject {
2031 /** This member contains first the reverse strand and then the
2132 forward strand. */
2233 char *RS;
23 /** The length of the forward strand. */
24 size_t len;
2534 /** Corresponds to strlen(RS) */
2635 size_t RSlen;
27 /** A name for this sequence */
28 char *name;
2936 /**
3037 * @brief GC-Content
3138 *
3239 * The relative amount of G or C in the DNA.
3340 */
3441 double gc;
35 } seq_t;
42 } seq_subject;
3643
3744 void seq_free(seq_t *S);
38 int seq_subject_init(seq_t *S);
39 void seq_subject_free(seq_t *S);
45 int seq_subject_init(seq_subject *S, const seq_t *);
46 void seq_subject_free(seq_subject *S);
4047 int seq_init(seq_t *S, const char *seq, const char *name);
4148
4249 /**
2424 typedef struct {
2525 esa_s *C;
2626 seq_t *S;
27 seq_subject subject;
2728 } esa_fixture;
2829
2930 void assert_equal_lcp( const lcp_inter_t *a, const lcp_inter_t *b){
5960 };
6061
6162 g_assert( seq_init( ef->S, seq, "S0" ) == 0);
62 seq_subject_init( ef->S);
63 g_assert( ef->S->RS != NULL);
64 int check = esa_init( ef->C, ef->S);
63 seq_subject_init( &ef->subject, ef->S);
64 g_assert( ef->subject.RS != NULL);
65 int check = esa_init( ef->C, &ef->subject);
6566 g_assert( check == 0);
6667 }
6768
8586 };
8687
8788 g_assert( seq_init( ef->S, seq, "S0" ) == 0);
88 seq_subject_init( ef->S);
89 g_assert( ef->S->RS != NULL);
90 int check = esa_init( ef->C, ef->S);
89 seq_subject_init( &ef->subject, ef->S);
90 g_assert( ef->subject.RS != NULL);
91 int check = esa_init( ef->C, &ef->subject);
9192 g_assert( check == 0);
9293 }
9394
2121 void test_seq_full(){
2222
2323 seq_t S;
24 seq_subject subject;
2425
2526 seq_init( &S, "ACGTTGCA", "name");
26 int check = seq_subject_init( &S);
27 int check = seq_subject_init( &subject, &S);
2728
2829 g_assert_cmpint(check, ==, 0);
2930
30 g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA");
31 g_assert_cmpuint(S.RSlen, ==, 8*2+1);
32 g_assert( S.gc == 0.5);
31 g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA");
32 g_assert_cmpuint(subject.RSlen, ==, 8*2+1);
33 g_assert( subject.gc == 0.5);
3334
35 seq_subject_free( &subject);
3436 seq_free( &S);
3537 }
3638
3739 void test_seq_nonacgt(){
3840 seq_t S;
41 seq_subject subject;
3942
4043 seq_init( &S, "11ACGTNN7682394689NNTGCA11", "name");
41 seq_subject_init( &S);
44 seq_subject_init( &subject, &S);
4245
4346 g_assert_cmpstr(S.S, ==, "ACGTTGCA");
4447 g_assert_cmpuint(S.len, ==, 8 );
4548 g_assert( FLAGS & F_NON_ACGT);
4649
47 g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA");
48 g_assert_cmpuint(S.RSlen, ==, 8*2+1);
49 g_assert( S.gc == 0.5);
50 g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA");
51 g_assert_cmpuint(subject.RSlen, ==, 8*2+1);
52 g_assert( subject.gc == 0.5);
5053
54 seq_subject_free( &subject);
5155 seq_free( &S);
5256
5357 FLAGS = F_NONE;
5458
5559 seq_init( &S, "@ACGT_!0TGCA ", "name");
56 seq_subject_init( &S);
60 seq_subject_init( &subject, &S);
5761
5862 g_assert_cmpstr(S.S, ==, "ACGT!TGCA");
5963 g_assert_cmpuint(S.len, ==, 9 );
6064 g_assert( FLAGS & F_NON_ACGT);
6165
62 g_assert_cmpstr(S.RS, ==, "TGCA;ACGT#ACGT!TGCA");
63 g_assert_cmpuint(S.RSlen, ==, 9*2+1);
66 g_assert_cmpstr(subject.RS, ==, "TGCA;ACGT#ACGT!TGCA");
67 g_assert_cmpuint(subject.RSlen, ==, 9*2+1);
6468
69 seq_subject_free( &subject);
6570 seq_free( &S);
6671
6772 FLAGS = F_NONE;