Imported Upstream version 0.10
Andreas Tille
7 years ago
0 | 0 | language: cpp |
1 | os: | |
2 | - linux | |
3 | - osx | |
1 | 4 | compiler: |
2 | - gcc | |
5 | - gcc | |
6 | - clang | |
3 | 7 | sudo: false |
4 | 8 | addons: |
5 | 9 | apt: |
6 | 10 | sources: |
7 | - deadsnakes | |
8 | - ubuntu-toolchain-r-test | |
11 | - deadsnakes | |
12 | - ubuntu-toolchain-r-test | |
9 | 13 | packages: |
10 | - cmake | |
11 | - g++-4.8 | |
12 | - libglib2.0-dev | |
13 | - libgsl0-dev | |
14 | - cmake | |
15 | - g++-4.8 | |
16 | - libglib2.0-dev | |
17 | - libgsl0-dev | |
14 | 18 | install: |
15 | 19 | - export LIBDIVDIR="$HOME/libdivsufsort" |
16 | 20 | - pip install --user cpp-coveralls |
19 | 23 | - cd libdivsufsort-master && mkdir build && cd build |
20 | 24 | - cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" .. |
21 | 25 | - make && make install |
26 | - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install gsl; fi | |
27 | - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install glib; fi | |
22 | 28 | before_install: |
23 | - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi | |
29 | - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi | |
24 | 30 | |
25 | 31 | # - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90 |
26 | 32 | script: |
33 | - export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$LIBDIVDIR/lib" | |
27 | 34 | - export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib" |
28 | 35 | - export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH" |
29 | 36 | - cd $TRAVIS_BUILD_DIR |
32 | 39 | - ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS" |
33 | 40 | - make |
34 | 41 | - make check |
42 | - export MYFLAGS="-I$LIBDIVDIR/include" | |
43 | - ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS" | |
35 | 44 | - make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\"" |
36 | 45 | - tar xzvf andi-*.tar.gz |
37 | 46 | - cd andi-* |
40 | 49 | - make check |
41 | 50 | - cd .. |
42 | 51 | after_success: |
43 | - coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp' | |
52 | - if [ "${TRAVIS_OS_NAME}" = "linux" -a "$CXX" = "g++-4.8" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi |
6 | 6 | This readme covers all necessary instructions for the impatient to get `andi` up and running. For extensive instructions please consult the [manual](andi-manual.pdf). |
7 | 7 | |
8 | 8 | |
9 | # Build Instructions | |
9 | # Installation and Usage | |
10 | 10 | |
11 | For the latest [stable release](https://github.com/EvolBioInf/andi/releases) of `andi` download the tar ball. If you'd like to contribute to this software, feel free to create a fork of our [git repository](https://github.com/EvolBioInf/andi) and send pull requests. | |
11 | Stable versions of `andi` are available via package managers. For manual installation see below. | |
12 | ||
13 | For Debian and Ubuntu (starting 16.04): | |
14 | ||
15 | sudo apt-get install andi | |
16 | ||
17 | For OS X with Homebrew: | |
18 | ||
19 | brew install science/andi | |
20 | ||
21 | For ArchLinux with aura: | |
22 | ||
23 | sudo aura -A andi | |
24 | ||
25 | With a successful installation you can get the usage instructions via `--help` or the man page. | |
26 | ||
27 | $ andi --help | |
28 | $ man andi | |
29 | ||
30 | You can simply use `andi` with your genomes in `FASTA` format. | |
31 | ||
32 | $ andi S1.fasta S2.fasta | |
33 | 2 | |
34 | S1 0.0 0.1 | |
35 | s2 0.1 0.0 | |
36 | ||
37 | From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description. | |
12 | 38 | |
13 | 39 | |
14 | ## Prerequisites | |
40 | ## Manual installation | |
15 | 41 | |
16 | This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step. | |
42 | If your system does not support one of the above package managers you have to manually build the latest [stable release](https://github.com/EvolBioInf/andi/releases) from a tarball. See the [manual](andi-manual.pdf) for extensive building instructions. | |
17 | 43 | |
18 | ||
19 | ## Compiling | |
44 | This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. | |
20 | 45 | |
21 | 46 | Assuming you have installed all prerequisites, building is as easy as follows. |
22 | 47 | |
23 | $ ./configure | |
24 | $ make | |
25 | $ make install | |
48 | $ autoreconf -fi -Im4 # optional when build from tarball | |
49 | $ ./configure | |
50 | $ make | |
51 | $ make install | |
26 | 52 | |
27 | Excessive build instructions are located in `INSTALL`. If the build was successful you can get the usage instructions via `--help` or the man page. | |
28 | ||
29 | $ andi --help | |
30 | $ man andi | |
31 | ||
32 | You can simply use `andi` with your genomes in `FASTA` format. | |
33 | ||
34 | $ andi S1.fasta S2.fasta | |
35 | 2 | |
36 | S1 0.0 0.1 | |
37 | s2 0.1 0.0 | |
38 | ||
39 | From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description. | |
40 | ||
53 | Excessive build instructions are located in `INSTALL`. | |
41 | 54 | |
42 | 55 | # Links and Additional Resources |
43 | 56 | |
53 | 66 | |
54 | 67 | ## License |
55 | 68 | |
56 | Copyright © 2014, 2015 Fabian Klötzl | |
69 | Copyright © 2014 - 2016 Fabian Klötzl | |
57 | 70 | License GPLv3+: GNU GPL version 3 or later. |
58 | 71 | |
59 | 72 | This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. The full license text is available at <http://gnu.org/licenses/gpl.html>. |
Binary diff not shown
0 | AC_INIT([andi], [0.9.6.1]) | |
0 | AC_INIT([andi], [0.10]) | |
1 | 1 | AM_INIT_AUTOMAKE([-Wall foreign ]) |
2 | 2 | |
3 | 3 | AC_CONFIG_MACRO_DIR([m4]) |
31 | 31 | |
32 | 32 | AS_IF([test "x$with_libdivsufsort" != "xno"], |
33 | 33 | [ |
34 | # The libdivsufsort header contains some Microsoft extension making | |
35 | # compilation fail on certain systems (i.e. OS X). Add the following | |
36 | # flag so the build runs smoothly. | |
37 | CPPFLAGS="$CPPFLAGS -fms-extensions" | |
34 | 38 | AC_CHECK_HEADERS([divsufsort.h],[have_libdivsufsort=yes],[have_libdivsufsort=no]) |
35 | 39 | AC_CHECK_LIB(divsufsort, divsufsort, [], [have_libdivsufsort=no]) |
36 | 40 | ], |
0 | .TH ANDI "1" "June 2015" "@VERSION@" "" | |
0 | .TH ANDI "1" "2016-03-05" "@VERSION@" "andi manual" | |
1 | 1 | .SH NAME |
2 | 2 | andi \- estimates evolutionary distance |
3 | 3 | .SH SYNOPSIS |
10 | 10 | The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse (K>0.5) for our method to work properly. |
11 | 11 | .SH OPTIONS |
12 | 12 | .TP |
13 | \fB\-b, \-\-bootstrap\fR <INT> | |
13 | \fB\-b\fR, \fB\-\-bootstrap\fR <INT> | |
14 | 14 | Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016, in review) for a detailed explanation. |
15 | 15 | .TP |
16 | 16 | \fB\-j\fR, \fB\-\-join\fR |
25 | 25 | \fB\-p\fR <FLOAT> |
26 | 26 | Significance of an anchor pair; default: 0.05. |
27 | 27 | .TP |
28 | \fB\-t\fR <INT> | |
28 | \fB\-t\fR, \fB\-\-threads\fR <INT> | |
29 | 29 | The number of threads to be used; by default, all available processors are used. |
30 | 30 | .br |
31 | 31 | Multithreading is only available if \fBandi\fR was compiled with OpenMP support. |
39 | 39 | \fB\-\-version\fR |
40 | 40 | Outputs version information and acknowledgments. |
41 | 41 | .SH COPYRIGHT |
42 | Copyright \(co 2014, 2015 Fabian Klötzl | |
42 | Copyright \(co 2014 - 2016 Fabian Klötzl | |
43 | 43 | License GPLv3+: GNU GPL version 3 or later. |
44 | 44 | .br |
45 | 45 | This is free software: you are free to change and redistribute it. |
14 | 14 | \usepackage{amsthm} |
15 | 15 | \usepackage{acronym} |
16 | 16 | \usepackage{amssymb} |
17 | \usepackage{tikz} | |
18 | \usepackage{pgfplots} | |
19 | 17 | \usepackage{caption} |
20 | 18 | \usepackage{subcaption} |
21 | \usepackage{booktabs} | |
22 | 19 | \usepackage{xspace} |
23 | \usepackage{overpic} | |
24 | 20 | |
25 | 21 | \bibliographystyle{alpha} |
26 | 22 | |
108 | 104 | |
109 | 105 | The easiest way to install \andi is via a package manager. This also handles all dependencies for you. |
110 | 106 | |
107 | ||
108 | \noindent Debian and Ubuntu (since 16.04): | |
109 | ||
110 | \begin{lstlisting} | |
111 | ~ % sudo apt-get install andi | |
112 | \end{lstlisting} | |
113 | ||
111 | 114 | \noindent OS X with homebrew: |
112 | 115 | |
113 | 116 | \begin{lstlisting} |
118 | 121 | |
119 | 122 | \begin{lstlisting} |
120 | 123 | ~ % aura -A andi |
121 | \end{lstlisting} | |
122 | ||
123 | \noindent Debian and Ubuntu (soon): | |
124 | ||
125 | \begin{lstlisting} | |
126 | ~ % sudo apt-get install andi | |
127 | 124 | \end{lstlisting} |
128 | 125 | |
129 | 126 | \andi is intended to run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}). |
188 | 185 | \begin{lstlisting} |
189 | 186 | ~ % git clone git@github.com:EvolBioInf/andi.git |
190 | 187 | ~ % cd andi |
191 | ~/andi % autoreconf -i | |
192 | \end{lstlisting} | |
193 | ||
194 | \noindent For old versions of \algo{autoconf} you may instead have to use \lstinline$autoreconf -i -Im4$. | |
188 | ~/andi % autoreconf -fi -Im4 | |
189 | \end{lstlisting} | |
195 | 190 | |
196 | 191 | \noindent Continue with the \algo{Gnu} trinity as described in Section~\ref{sub:regular}. |
197 | 192 | |
268 | 263 | |
269 | 264 | By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. Other evolutionary models are also implemented (Kimura, raw). The \lstinline$--model$ parameter can be used to switch between them. |
270 | 265 | |
271 | Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}. | |
266 | Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}. For more information on computing support values from distance matrices see \cite{afra}. | |
272 | 267 | |
273 | 268 | \begin{lstlisting} |
274 | 269 | ~ % andi -b 2 ../test/1M.1.fasta |
332 | 327 | \end{figure} |
333 | 328 | |
334 | 329 | |
330 | \chapter{Warnings and Errors} | |
331 | ||
332 | Here be an explanation of all possible errors. Other errors may occur and are due to the failure of underlying functions (e.\,g.~\lstinline$read(3)$). All warning messages are printed to \lstinline$stderr$. Most errors are non-recoverable and will result in \andi exiting with a non-zero state. | |
333 | ||
334 | \section{Sequence Related Messages} | |
335 | ||
336 | \subsection*{Unexpected Character} | |
337 | ||
338 | \andi is pretty pedantic about the formatting of \algo{FASTA} files. If you violate the syntax, \andi will print the file name, the line and the problematic character. These errors are non-recovering, meaning no further sequences are read from the invalid file. The checks are implemented by the \href{https://github.com/kloetzl/pfasta}{\algo{pfasta}} library. | |
339 | ||
340 | ||
341 | \subsection*{Non acgtACGT Nucleotides Stripped} | |
342 | ||
343 | Our models of genome evolution (JC, Kimura) only work on the four canonical types of nucleotides. All others are stripped from the sequences. This can be ignored in most cases. | |
344 | ||
345 | \subsection*{Too Short Sequence} | |
346 | ||
347 | \andi was designed for big data sets of whole genomes. On short sequences the distance estimates are inaccurate. Use an multiple sequence alignment instead. | |
348 | ||
349 | \subsection*{Too Long Sequence} | |
350 | ||
351 | \algo{libdivsufsort} limits the length of a sequence to 31 bits. That count includes the reverse complement. So the technical limit for a sequence analysis is $2^{30} = 1.073.741.824$. Unfortunately, that excludes (full) human and mice genomes. Per-chromosome analysis works just fine. | |
352 | ||
353 | \subsection*{Empty Sequence} | |
354 | ||
355 | One of the given sequences contained either no nucleotides at all, or only non-canonical ones. | |
356 | ||
357 | \subsection*{Less than two sequences given} | |
358 | ||
359 | As \andi tries to compare sequences, at least two need to be supplied. Note that \andi may have regarded some of your given sequences as unusable. | |
360 | ||
361 | \subsection*{Maximum Number of Sequences} | |
362 | ||
363 | The maximum number of sequences \andi can possible compare is huge (roughly $457.845.052$). I doubt anyone will ever reach that limit. Please send me a mail, if you do. | |
364 | ||
365 | \section{Technical Messages} | |
366 | ||
367 | \subsection*{Out of Memory} | |
368 | ||
369 | If \andi runs out of memory, it gives up. Either free memory, run \andi on a bigger machine, try the \lstinline$--low-memory$ mode or reduce the number of threads. | |
370 | ||
371 | \subsection*{RNG allocation} | |
372 | ||
373 | Some technical thing failed. If it keeps failing repeatedly, file a bug. | |
374 | ||
375 | \subsection*{Bootstrapping failed} | |
376 | ||
377 | This should not happen. | |
378 | ||
379 | \subsection*{Failed index creation} | |
380 | ||
381 | This should not happen, either. | |
382 | ||
383 | \subsection*{Skipped and ignored Arguments} | |
384 | ||
385 | Some command line parameters of \andi require arguments. If these are not of the expected type, a warning is given. See Section~\ref{sec:options} for their correct usage. | |
386 | ||
387 | ||
388 | \section{Output-related Warnings} | |
389 | ||
390 | As the input sequences get more evolutionary divergent, \andi finds less anchors. With less anchors, less nucleotides are cosidered homologous between two sequences. If no anchors are found comparison fails and \lstinline!nan! is printed instead. See out paper and especiallt Figure~2 for details. | |
391 | ||
392 | \subsection*{NaN} | |
393 | ||
394 | No anchors were found. Your sequences are very divergent ($d>0.5$) or sprout a lot of indels that make comparison difficult. | |
395 | ||
396 | \subsection*{Little Homology} | |
397 | ||
398 | Very few anchors were found and thus only a tiny part of the sequences is considered homologous. Expect that the given distance is errorneous. | |
335 | 399 | |
336 | 400 | |
337 | 401 | \chapter{DevOps} %%%%% |
375 | 439 | ~/andi % make check |
376 | 440 | \end{lstlisting} |
377 | 441 | |
378 | \noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 80\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}} | |
442 | \noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 75\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}} Unfortunately, coveralls is broken at this point in time. | |
379 | 443 | |
380 | 444 | \section{Known Issues} |
381 | 445 | |
382 | 446 | These minor issues are known. I intend to fix them, when I have time. |
383 | 447 | |
384 | 448 | \begin{enumerate} |
385 | \item This code will not work under Windows. At two places Unix-only code is used: filepath-seperators are assumed to be \lstinline$/$ and file-descriptors are used for I/O. | |
449 | \item This code will not work under Windows. At two places Unix-only code is used: filepath-separators are assumed to be \lstinline$/$ and file-descriptors are used for I/O. | |
450 | \item Only the first 10 characters are printed in the output matrix. This can make long gi numbers indistinguishable. In version 0.10 we started printing a warning for this. | |
451 | \item Unit tests for the bootstrapped matrices are missing. | |
452 | \item Cached intervals are sometimes not “as deep as they could be”. If that got fixed \lstinline$get_match_cache$ could bail out on \lstinline$ij.lcp < CACHE_LENGTH$. However the \lstinline$esa_init_cache$ code is the most fragile part and should be handled with care. | |
386 | 453 | \end{enumerate} |
387 | 454 | |
388 | 455 | |
392 | 459 | |
393 | 460 | %\subsection{Preparing a new Release} |
394 | 461 | |
395 | Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This assures that there is only one commit and build with that specific version. Now return to the previous commit \lstinline$git checkout vX.y$. | |
462 | Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Tags should be annotated and signed, if possible. This manual then needs manual rebuilding. | |
396 | 463 | |
397 | 464 | Ensure that \andi is ready for packaging with \algo{autoconf}. |
398 | 465 | |
413 | 480 | ================================================= |
414 | 481 | \end{lstlisting} |
415 | 482 | |
416 | If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}. | |
417 | ||
418 | ||
483 | If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}. | |
484 | ||
485 | Also verify that the recent changes did not create a performance regression. This includes testing both ends of the scale: \eco and \pneu. Both should be reasonable close to previous releases. | |
486 | ||
487 | Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This assures that there is only one commit and build with that specific version. | |
419 | 488 | |
420 | 489 | \backmatter |
421 | 490 | %\addcontentsline{toc}{chapter}{Bibliography} |
44 | 44 | Author = {Chris Lattner and Vikram Adve}, |
45 | 45 | Title = {{LLVM}: A Compilation Framework for Lifelong Program |
46 | 46 | Analysis and Transformation}, |
47 | Booktitle = "Code Generation and Optimizatio", | |
47 | Booktitle = "Code Generation and Optimization", | |
48 | 48 | Month = {Mar}, |
49 | 49 | Year = {2004}, |
50 | 50 | pages = {75--88}, |
108 | 108 | title={Efficient Estimation of Evolutionary Distances} |
109 | 109 | } |
110 | 110 | |
111 | @article{afra, | |
112 | AUTHOR = {Klötzl, Fabian and Haubold, Bernhard}, | |
113 | TITLE = {Support Values for Genome Phylogenies}, | |
114 | JOURNAL = {Life}, | |
115 | VOLUME = {6}, | |
116 | YEAR = {2016}, | |
117 | NUMBER = {1}, | |
118 | PAGES = {11}, | |
119 | URL = {http://www.mdpi.com/2075-1729/6/1/11}, | |
120 | ISSN = {2075-1729}, | |
121 | DOI = {10.3390/life6010011} | |
122 | } | |
111 | 123 | |
124 | ||
125 |
0 | 0 | /* @brief Here follows a simple implementation of the GNU function `strchrnul`. |
1 | 1 | * Please check the gnulib manual for details. |
2 | 2 | */ |
3 | #include <string.h> | |
4 | ||
3 | 5 | char *strchrnul(const char *s, int c){ |
4 | 6 | char *p = strchr(s,c); |
5 | 7 |
0 | #!/usr/bin/zsh | |
1 | ||
2 | # Compute the number of failing comparisons for different distances. | |
3 | ||
4 | DISTS=(0.1 0.2 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7) | |
5 | ||
6 | LENGTH=100000 | |
7 | ||
8 | for dist in $DISTS; do | |
9 | echo "" > est_$dist.dist | |
10 | for (( i = 0; i < 1000; i++ )); do | |
11 | ../test/test_fasta -l $LENGTH -d $dist > temp.fa | |
12 | ../src/andi ./temp.fa > est.dist 2> /dev/null | |
13 | tail -n 1 est.dist >> est_$dist.dist | |
14 | done | |
15 | avg=$(cat est_$dist.dist | awk '"nan" !~ $2 {sum+=$2;c++}END{print sum/c}') | |
16 | sd=$(grep -v 'nan' est_$dist.dist | awk '{a[c++]=$2;aa+=$2}END{aa/=NR;for(c=0;c<NR;c++){t=a[c]-aa;sd+=t*t}print sqrt(sd/(NR-1))}') | |
17 | failed=$(cat est_$dist.dist | grep -c 'nan') | |
18 | echo $dist "\t" $avg"\t±" $sd "\t" $failed | |
19 | done |
15 | 15 | |
16 | 16 | .PHONY: perf |
17 | 17 | perf: CFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer |
18 | perf: CXXFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer | |
18 | 19 | perf: andi |
24 | 24 | #include <assert.h> |
25 | 25 | #include <errno.h> |
26 | 26 | #include <getopt.h> |
27 | #include <limits.h> | |
27 | 28 | #include <stdio.h> |
28 | 29 | #include <stdlib.h> |
29 | 30 | #include <string.h> |
54 | 55 | * |
55 | 56 | * The main function reads and parses the commandline arguments. Depending on |
56 | 57 | * the set flags it reads the input files and forwards the contained sequences |
57 | * to processing. | |
58 | * to processing. Also it verifies the input for correctness and issues warnings | |
59 | * and errors. | |
58 | 60 | */ |
59 | 61 | int main(int argc, char *argv[]) { |
60 | 62 | int c; |
133 | 135 | |
134 | 136 | if (threads > (long unsigned int)omp_get_num_procs()) { |
135 | 137 | warnx( |
136 | "The number of threads to be used, is greater then the " | |
138 | "The number of threads to be used, is greater than the " | |
137 | 139 | "number of available processors; Ignoring -t %lu " |
138 | 140 | "argument.", |
139 | 141 | threads); |
173 | 175 | } else if (strcasecmp(optarg, "KIMURA") == 0) { |
174 | 176 | MODEL = M_KIMURA; |
175 | 177 | } else { |
176 | warnx("Ignoring argument for --model. Expected Raw JC or Kimura"); | |
178 | warnx("Ignoring argument for --model. Expected Raw, JC or " | |
179 | "Kimura"); | |
177 | 180 | } |
178 | 181 | break; |
179 | 182 | } |
195 | 198 | } |
196 | 199 | |
197 | 200 | dsa_t dsa; |
198 | if (dsa_init(&dsa)) { | |
199 | errx(errno, "Out of memory."); | |
200 | } | |
201 | dsa_init(&dsa); | |
201 | 202 | |
202 | 203 | const char *file_name; |
203 | 204 | |
222 | 223 | |
223 | 224 | size_t n = dsa_size(&dsa); |
224 | 225 | |
226 | if (n < 2) { | |
227 | errx(1, | |
228 | "I am truly sorry, but with less than two sequences (%zu given) " | |
229 | "there is nothing to compare.", | |
230 | n); | |
231 | } | |
232 | ||
225 | 233 | if (FLAGS & F_VERBOSE) { |
226 | 234 | fprintf(stderr, "Comparing %zu sequences\n", n); |
227 | 235 | fflush(stderr); |
235 | 243 | // seed the random number generator with the current time |
236 | 244 | gsl_rng_set(RNG, time(NULL)); |
237 | 245 | |
238 | seq_t *sequences = dsa_data(&dsa); | |
246 | // Warn about non ACGT residues. | |
247 | if (FLAGS & F_NON_ACGT) { | |
248 | warnx("The input sequences contained characters other than acgtACGT. " | |
249 | "These were automatically stripped to ensure correct results."); | |
250 | } | |
251 | ||
252 | // validate sequence correctness | |
253 | const seq_t *seq = dsa_data(&dsa); | |
254 | for (size_t i = 0; i < n; ++i, seq++) { | |
255 | if (strlen(seq->name) > 10) { | |
256 | warnx("The sequence name '%s' is longer than ten characters. It " | |
257 | "will be truncated in the output to '%.10s'.", | |
258 | seq->name, seq->name); | |
259 | } | |
260 | ||
261 | const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2; | |
262 | if (seq->len > LENGTH_LIMIT) { | |
263 | errx(1, "The sequence %s is too long. The technical limit is %zu.", | |
264 | seq->name, LENGTH_LIMIT); | |
265 | } | |
266 | ||
267 | if (seq->len == 0) { | |
268 | errx(1, "The sequence %s is empty.", seq->name); | |
269 | } | |
270 | ||
271 | if (seq->len < 1000) { | |
272 | FLAGS |= F_SHORT; | |
273 | } | |
274 | } | |
275 | ||
276 | if (FLAGS & F_SHORT) { | |
277 | warnx("One of the given input sequences is shorter than a thousand " | |
278 | "nucleotides. This may result in inaccurate distances. Try an " | |
279 | "alignment instead."); | |
280 | } | |
281 | ||
239 | 282 | // compute distance matrix |
240 | if (n >= 2) { | |
241 | calculate_distances(sequences, n); | |
242 | } else { | |
243 | warnx("I am truly sorry, but with less than two sequences (%zu given) " | |
244 | "there is nothing to compare.", | |
245 | n); | |
246 | } | |
283 | calculate_distances(dsa_data(&dsa), n); | |
247 | 284 | |
248 | 285 | dsa_free(&dsa); |
249 | 286 | gsl_rng_free(RNG); |
288 | 325 | void version(void) { |
289 | 326 | const char str[] = { |
290 | 327 | "andi " VERSION "\n" |
291 | "Copyright (C) 2014, 2015 Fabian Klötzl\n" | |
328 | "Copyright (C) 2014 - 2016 Fabian Klötzl\n" | |
292 | 329 | "License GPLv3+: GNU GPL version 3 or later " |
293 | 330 | "<http://gnu.org/licenses/gpl.html>\n" |
294 | 331 | "This is free software: you are free to change and redistribute it.\n" |
301 | 338 | "Sequence Analysis, Genome Rearrangements, and Phylogenetic " |
302 | 339 | "Reconstruction. pp 118f.\n" |
303 | 340 | "3) SA construction: Mori, Y. (2005). Short description of improved " |
304 | "two-stage suffix sorting alorithm. " | |
341 | "two-stage suffix sorting algorithm. " | |
305 | 342 | "http://homepage3.nifty.com/wpage/software/itssort.txt\n"}; |
306 | 343 | printf("%s", str); |
307 | 344 | exit(EXIT_SUCCESS); |
28 | 28 | * @param n - The number of sequences |
29 | 29 | * @param M - A matrix for additional output data |
30 | 30 | */ |
31 | void NAME(struct model *M, seq_t *sequences, size_t n) { | |
31 | void NAME(struct model *M, const seq_t *sequences, size_t n) { | |
32 | 32 | size_t i; |
33 | 33 | |
34 | 34 | //#pragma |
35 | 35 | P_OUTER |
36 | 36 | for (i = 0; i < n; i++) { |
37 | seq_t *subject = &sequences[i]; | |
37 | seq_subject subject; | |
38 | 38 | esa_s E; |
39 | 39 | |
40 | if (seq_subject_init(subject) || esa_init(&E, subject)) { | |
41 | errx(1, "Failed to create index for %s.", subject->name); | |
40 | if (seq_subject_init(&subject, &sequences[i]) || | |
41 | esa_init(&E, &subject)) { | |
42 | errx(1, "Failed to create index for %s.", sequences[i].name); | |
42 | 43 | } |
43 | 44 | |
44 | 45 | // now compare every other sequence to i |
59 | 60 | |
60 | 61 | size_t ql = sequences[j].len; |
61 | 62 | |
62 | M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject->gc); | |
63 | M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.gc); | |
63 | 64 | } |
64 | 65 | |
65 | 66 | esa_free(&E); |
66 | seq_subject_free(subject); | |
67 | seq_subject_free(&subject); | |
67 | 68 | } |
68 | 69 | } |
16 | 16 | #include <string.h> |
17 | 17 | #include <assert.h> |
18 | 18 | #include "esa.h" |
19 | #include "global.h" | |
19 | 20 | |
20 | 21 | static void esa_init_cache_dfs(esa_s *, char *str, size_t pos, lcp_inter_t in); |
21 | 22 | static void esa_init_cache_fill(esa_s *, char *str, size_t pos, lcp_inter_t in); |
69 | 70 | * @returns 0 iff successful |
70 | 71 | */ |
71 | 72 | int esa_init_cache(esa_s *self) { |
72 | lcp_inter_t *cache = | |
73 | malloc((1 << (2 * CACHE_LENGTH)) * sizeof(lcp_inter_t)); | |
74 | ||
75 | if (!cache) { | |
76 | return 1; | |
77 | } | |
73 | lcp_inter_t *cache = malloc((1 << (2 * CACHE_LENGTH)) * sizeof(*cache)); | |
74 | CHECK_MALLOC(cache); | |
78 | 75 | |
79 | 76 | self->cache = cache; |
80 | 77 | |
127 | 124 | continue; |
128 | 125 | } |
129 | 126 | |
127 | if (ij.l <= (ssize_t)(pos + 1)) { | |
128 | // Continue one level deeper | |
129 | // This is the usual case | |
130 | esa_init_cache_dfs(C, str, pos + 1, ij); | |
131 | continue; | |
132 | } | |
133 | ||
130 | 134 | // The LCP-interval is deeper than expected |
131 | if (ij.l > (ssize_t)(pos + 1)) { | |
132 | ||
133 | // Check if it still fits into the cache | |
134 | if ((size_t)ij.l < CACHE_LENGTH) { | |
135 | ||
136 | // fill with dummy value | |
137 | esa_init_cache_fill(C, str, pos + 1, in); | |
138 | ||
139 | char non_acgt = 0; | |
140 | ||
141 | // fast forward | |
142 | size_t k = pos + 1; | |
143 | for (; k < (size_t)ij.l; k++) { | |
144 | // In some very edgy edge cases the lcp-interval `ij` | |
145 | // contains a `;` or another non-acgt character. Since we | |
146 | // cannot cache those, break. | |
147 | char c = C->S[C->SA[ij.i] + k]; | |
148 | if (char2code(c) < 0) { | |
149 | non_acgt = 1; | |
150 | break; | |
151 | } | |
152 | ||
153 | str[k] = c; | |
154 | } | |
155 | ||
156 | if (non_acgt) { | |
157 | esa_init_cache_fill(C, str, k, ij); | |
158 | } else { | |
159 | esa_init_cache_dfs(C, str, k, ij); | |
160 | } | |
161 | ||
162 | continue; | |
163 | } | |
164 | ||
135 | // Check if it still fits into the cache | |
136 | if ((size_t)ij.l >= CACHE_LENGTH) { | |
165 | 137 | // If the lcp-interval exceeds the cache depth, stop here and fill |
166 | 138 | esa_init_cache_fill(C, str, pos + 1, in); |
167 | 139 | continue; |
168 | 140 | } |
169 | 141 | |
170 | // Continue one level deeper | |
171 | esa_init_cache_dfs(C, str, pos + 1, ij); | |
142 | /* At this point the prefix `str` of length `pos` has been found. | |
143 | * However, the call to `getInterval` above found an interval with | |
144 | * an LCP value bigger than `pos`. This means that not all elongations | |
145 | * (more precise: just one) of `str` appear in the subject. Thus fill | |
146 | * all values with the matched result to far and continue only with | |
147 | * the one special substring. | |
148 | */ | |
149 | esa_init_cache_fill(C, str, pos + 1, in); | |
150 | ||
151 | char non_acgt = 0; | |
152 | ||
153 | // fast forward | |
154 | size_t k = pos + 1; | |
155 | for (; k < (size_t)ij.l; k++) { | |
156 | // In some very edgy edge cases the lcp-interval `ij` | |
157 | // contains a `;` or another non-acgt character. Since we | |
158 | // cannot cache those, break. | |
159 | char c = C->S[C->SA[ij.i] + k]; | |
160 | if (char2code(c) < 0) { | |
161 | non_acgt = 1; | |
162 | break; | |
163 | } | |
164 | ||
165 | str[k] = c; | |
166 | } | |
167 | ||
168 | if (non_acgt) { | |
169 | esa_init_cache_fill(C, str, k, ij); | |
170 | } else { | |
171 | esa_init_cache_dfs(C, str, k, ij); | |
172 | } | |
172 | 173 | } |
173 | 174 | } |
174 | 175 | |
215 | 216 | size_t len = self->len; |
216 | 217 | |
217 | 218 | char *FVC = self->FVC = malloc(len); |
218 | if (!FVC) { | |
219 | return 1; | |
220 | } | |
219 | CHECK_MALLOC(FVC); | |
221 | 220 | |
222 | 221 | const char *S = self->S; |
223 | 222 | const int *SA = self->SA; |
238 | 237 | * @param S - The sequence |
239 | 238 | * @returns 0 iff successful |
240 | 239 | */ |
241 | int esa_init(esa_s *C, const seq_t *S) { | |
242 | if (!C || !S || !S->S) return 1; | |
240 | int esa_init(esa_s *C, const seq_subject *S) { | |
241 | if (!C || !S || !S->RS) return 1; | |
243 | 242 | |
244 | 243 | *C = (esa_s){.S = S->RS, .len = S->RSlen}; |
245 | 244 | |
284 | 283 | return 1; |
285 | 284 | } |
286 | 285 | |
287 | C->SA = malloc(C->len * sizeof(saidx_t)); | |
288 | if (!C->SA) { | |
289 | return 2; | |
290 | } | |
286 | C->SA = malloc(C->len * sizeof(*C->SA)); | |
287 | CHECK_MALLOC(C->SA); | |
291 | 288 | |
292 | 289 | saidx_t result = 1; |
293 | 290 | |
310 | 307 | if (!C || !C->LCP) { |
311 | 308 | return 1; |
312 | 309 | } |
313 | saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(saidx_t)); | |
314 | if (!C->CLD) { | |
315 | return 2; | |
316 | } | |
310 | saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(*CLD)); | |
311 | CHECK_MALLOC(CLD); | |
317 | 312 | |
318 | 313 | saidx_t *LCP = C->LCP; |
319 | 314 | |
320 | 315 | typedef struct pair_s { saidx_t idx, lcp; } pair_t; |
321 | 316 | |
322 | pair_t *stack = malloc((C->len + 1) * sizeof(pair_t)); | |
317 | pair_t *stack = malloc((C->len + 1) * sizeof(*stack)); | |
318 | CHECK_MALLOC(stack); | |
323 | 319 | pair_t *top = stack; // points at the topmost filled element |
324 | 320 | pair_t last; |
325 | 321 | |
378 | 374 | |
379 | 375 | // Allocate new memory |
380 | 376 | // The LCP array is one element longer than S. |
381 | saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(saidx_t)); | |
382 | if (!LCP) { | |
383 | return 3; | |
384 | } | |
377 | saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(*LCP)); | |
378 | CHECK_MALLOC(LCP); | |
385 | 379 | |
386 | 380 | LCP[0] = -1; |
387 | 381 | LCP[len] = -1; |
388 | 382 | |
389 | 383 | // Allocate temporary arrays |
390 | saidx_t *PHI = malloc(len * sizeof(saidx_t)); | |
384 | saidx_t *PHI = malloc(len * sizeof(*PHI)); | |
391 | 385 | saidx_t *PLCP = PHI; |
392 | if (!PHI) { | |
393 | free(PHI); | |
394 | return 2; | |
395 | } | |
386 | CHECK_MALLOC(PHI); | |
396 | 387 | |
397 | 388 | PHI[SA[0]] = -1; |
398 | 389 | saidx_t k; |
5 | 5 | #ifndef _ESA_H_ |
6 | 6 | #define _ESA_H_ |
7 | 7 | |
8 | #include <sys/types.h> | |
8 | 9 | #include "sequence.h" |
9 | 10 | #include "config.h" |
10 | 11 | |
67 | 68 | |
68 | 69 | lcp_inter_t get_match_cached(const esa_s *, const char *query, size_t qlen); |
69 | 70 | lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen); |
70 | int esa_init(esa_s *, const seq_t *S); | |
71 | int esa_init(esa_s *, const seq_subject *S); | |
71 | 72 | void esa_free(esa_s *); |
72 | 73 | |
73 | 74 | #ifdef DEBUG |
8 | 8 | #define _GLOBAL_H_ |
9 | 9 | #include <gsl/gsl_rng.h> |
10 | 10 | |
11 | #include <err.h> | |
11 | 12 | #include "config.h" |
12 | 13 | |
13 | 14 | /** |
61 | 62 | F_SHORT = 64 |
62 | 63 | }; |
63 | 64 | |
65 | /** | |
66 | * @brief This macro is used to unify the checks for the return value of malloc. | |
67 | * | |
68 | * @param PTR - The pointer getting checked. | |
69 | */ | |
70 | #define CHECK_MALLOC(PTR) \ | |
71 | do { \ | |
72 | if (PTR == NULL) { \ | |
73 | err(errno, "Out of memory"); \ | |
74 | } \ | |
75 | } while (0); | |
76 | ||
64 | 77 | #endif |
4 | 4 | #define _GNU_SOURCE |
5 | 5 | #include <fcntl.h> |
6 | 6 | #include <math.h> |
7 | #include <limits.h> | |
7 | 8 | #include <stdio.h> |
8 | 9 | #include <string.h> |
9 | 10 | #include <unistd.h> |
23 | 24 | * "I didn't learn joined up handwriting for nothing, you know." |
24 | 25 | * ~ Gilderoy Lockhart |
25 | 26 | * |
26 | * @param in - The file pointer to read from. | |
27 | * @param dsa - An array that holds found sequences. | |
28 | * @param name - The name of the file to be used for the name of the sequence. | |
27 | * @param file_name - The name of the file to be used for reading. The name is | |
28 | * also used to infer the sequence name. | |
29 | * @param dsa - (output parameter) An array that holds found sequences. | |
29 | 30 | */ |
30 | 31 | void read_fasta_join(const char *file_name, dsa_t *dsa) { |
31 | 32 | if (!file_name || !dsa) return; |
46 | 47 | */ |
47 | 48 | |
48 | 49 | const char *left = strrchr(file_name, '/'); // find the last path separator |
49 | left = (left == NULL) ? file_name : left + 1; // left is the position one of | |
50 | // to the right of the path | |
51 | // separator | |
50 | left = (left == NULL) ? file_name : left + 1; | |
51 | // left is the position one of to the right of the path separator | |
52 | 52 | |
53 | 53 | const char *dot = strchrnul(left, '.'); // find the extension |
54 | 54 | |
55 | joined.name = strndup( | |
56 | left, dot - left); // copy only the file name, not its path or extension | |
55 | // copy only the file name, not its path or extension | |
56 | joined.name = strndup(left, dot - left); | |
57 | CHECK_MALLOC(joined.name); | |
58 | ||
57 | 59 | dsa_push(dsa, joined); |
58 | 60 | dsa_free(&single); |
59 | 61 | } |
60 | 62 | |
61 | 63 | /** |
62 | 64 | * @brief This function reads sequences from a file. |
63 | * @param in - The file pointer to read from. | |
64 | * @param dsa - An array that holds found sequences. | |
65 | * @param file_name - The file to read. | |
66 | * @param dsa - (output parameter) An array that holds found sequences. | |
65 | 67 | */ |
66 | 68 | void read_fasta(const char *file_name, dsa_t *dsa) { |
67 | 69 | if (!file_name || !dsa) return; |
123 | 125 | int use_scientific = 0; |
124 | 126 | |
125 | 127 | double *DD = malloc(n * n * sizeof(*DD)); |
126 | if (!DD) err(errno, "meh."); | |
128 | CHECK_MALLOC(DD); | |
127 | 129 | |
128 | 130 | #define DD(X, Y) (DD[(X)*n + (Y)]) |
129 | 131 | |
132 | 134 | |
133 | 135 | switch (MODEL) { |
134 | 136 | case M_RAW: estimate = &estimate_RAW; break; |
137 | default: | |
138 | /* intentional fall-through. This is just here to silence any | |
139 | * compiler warnings. The real default is set in andi.c.*/ | |
135 | 140 | case M_JC: estimate = &estimate_JC; break; |
136 | 141 | case M_KIMURA: estimate = &estimate_KIMURA; break; |
137 | 142 | } |
167 | 172 | |
168 | 173 | printf("%zu\n", n); |
169 | 174 | for (i = 0; i < n; i++) { |
170 | // Print exactly nine characters of the name. Pad with spaces if | |
175 | // Print exactly ten characters of the name. Pad with spaces if | |
171 | 176 | // necessary. |
172 | printf("%-9.9s", sequences[i].name); | |
177 | printf("%-10.10s", sequences[i].name); | |
173 | 178 | |
174 | 179 | for (j = 0; j < n; j++) { |
175 | 180 | // use scientific notation for small numbers |
157 | 157 | * @brief Given an anchor, classify nucleotides. |
158 | 158 | * |
159 | 159 | * For anchors we already know that the nucleotides of the subject and the query |
160 | * are equal. Thus only one sequence has to be analysed. See `model_count` for | |
161 | * an explanation of the algorithm. | |
160 | * are equal. Thus only one sequence has to be analysed. Most models don't | |
161 | * actually care about the individual nucleotides as long as they are equal in | |
162 | * the two sequences. For these models, we just assume equal distribution. | |
162 | 163 | * |
163 | 164 | * @param MM - The mutation matrix |
164 | 165 | * @param S - The subject |
165 | 166 | * @param len - The anchor length |
166 | 167 | */ |
167 | 168 | void model_count_equal(model *MM, const char *S, size_t len) { |
169 | if (MODEL == M_RAW || MODEL == M_JC || MODEL == M_KIMURA) { | |
170 | size_t fourth = len / 4; | |
171 | MM->counts[AtoA] += fourth; | |
172 | MM->counts[CtoC] += fourth; | |
173 | MM->counts[GtoG] += fourth; | |
174 | MM->counts[TtoT] += fourth + (len & 3); | |
175 | return; | |
176 | } | |
177 | ||
178 | // Fall-back algorithm for future models. Note, as this works on a | |
179 | // per-character basis it is slow. | |
180 | ||
168 | 181 | size_t local_counts[4] = {0}; |
169 | 182 | |
170 | 183 | for (; len--;) { |
171 | 184 | char s = *S++; |
172 | 185 | |
186 | // ';!#' are all smaller than 'A' | |
173 | 187 | if (s < 'A') { |
188 | // Technically, s can only be ';' at this point. | |
174 | 189 | continue; |
175 | 190 | } |
176 | 191 | |
177 | unsigned char nibble_s = s & 7; | |
178 | ||
179 | static const unsigned int mm1 = 0x20031000; | |
180 | ||
181 | unsigned char index = (mm1 >> (4 * nibble_s)) & 0x3; | |
182 | local_counts[index]++; | |
192 | // The four canonical nucleotides can be uniquely identified by the bits | |
193 | // 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed. | |
194 | local_counts[(s >> 1) & 3]++; | |
183 | 195 | } |
184 | 196 | |
185 | 197 | MM->counts[AtoA] += local_counts[0]; |
186 | 198 | MM->counts[CtoC] += local_counts[1]; |
187 | MM->counts[GtoG] += local_counts[2]; | |
188 | MM->counts[TtoT] += local_counts[3]; | |
199 | MM->counts[GtoG] += local_counts[3]; | |
200 | MM->counts[TtoT] += local_counts[2]; | |
189 | 201 | } |
190 | 202 | |
191 | 203 | /** |
3 | 3 | * @file |
4 | 4 | * @brief This file contains various distance methods. |
5 | 5 | */ |
6 | #include <assert.h> | |
7 | #include <math.h> | |
8 | #include <stdint.h> | |
9 | #include <stdio.h> | |
6 | 10 | #include <stdlib.h> |
7 | 11 | #include <string.h> |
8 | #include <assert.h> | |
9 | #include <math.h> | |
10 | #include <stdio.h> | |
11 | 12 | #include "esa.h" |
12 | 13 | #include "global.h" |
13 | 14 | #include "io.h" |
15 | 16 | #include "process.h" |
16 | 17 | #include "sequence.h" |
17 | 18 | |
18 | #include <time.h> | |
19 | 19 | #include <gsl/gsl_rng.h> |
20 | 20 | #include <gsl/gsl_randist.h> |
21 | 21 | |
53 | 53 | /** |
54 | 54 | * @brief Calculates the binomial coefficient of n and k. |
55 | 55 | * |
56 | * We used to use gsl_sf_lnchoose(xx,kk) for this functionality. | |
57 | * After all, why implement something that has already been done? | |
58 | * Well, the reason is simplicity: GSL is used for only this one | |
59 | * function and the input (n<=20) is not even considered big. | |
60 | * Hence its much easier to have our own implementation and ditch | |
61 | * the GSL dependency even if that means our code is a tiny bit | |
62 | * less optimized and slower. | |
56 | * We could (and probalby should) use gsl_sf_lnchoose(xx,kk) for this. | |
63 | 57 | * |
64 | 58 | * @param n - The n part of the binomial coefficient. |
65 | 59 | * @param k - analog. |
206 | 200 | model_count_equal(&ret, query + last_pos_Q, last_length); |
207 | 201 | |
208 | 202 | // Count the SNPs in between. |
209 | model_count(&ret, C->S + last_pos_S + last_length, query + last_pos_Q + last_length, | |
203 | model_count(&ret, C->S + last_pos_S + last_length, | |
204 | query + last_pos_Q + last_length, | |
210 | 205 | this_pos_Q - last_pos_Q - last_length); |
211 | 206 | last_was_right_anchor = 1; |
212 | 207 | } else { |
221 | 216 | if (last_was_right_anchor) { |
222 | 217 | // If the last was a right anchor, but with the current one, |
223 | 218 | // we cannot extend, then add its length. |
224 | model_count_equal(&ret, C->S + last_pos_S, last_length); | |
219 | model_count_equal(&ret, query + last_pos_Q, last_length); | |
225 | 220 | } else if ((last_length / 2) >= threshold) { |
226 | 221 | // The last anchor wasn't neither a left or right anchor. |
227 | 222 | // But, it was as long as an anchor pair. So still count it. |
228 | model_count_equal(&ret, C->S + last_pos_S, last_length); | |
223 | model_count_equal(&ret, query + last_pos_Q, last_length); | |
229 | 224 | } |
230 | 225 | |
231 | 226 | last_was_right_anchor = 0; |
302 | 297 | * @param sequences - An array of pointers to the sequences. |
303 | 298 | * @param n - The number of sequences. |
304 | 299 | */ |
305 | void calculate_distances(seq_t *sequences, int n) { | |
306 | int i; | |
307 | ||
308 | // check the sequences | |
309 | for (i = 0; i < n; i++) { | |
310 | if (sequences[i].S == NULL || sequences[i].len == 0) { | |
311 | errx(1, "Missing sequence: %s", sequences[i].name); | |
312 | } | |
313 | ||
314 | if (sequences[i].len < 1000) { | |
315 | FLAGS |= F_SHORT; | |
316 | } | |
317 | } | |
318 | ||
319 | if (FLAGS & F_SHORT) { | |
320 | warnx("One of the given input sequences is shorter than a thousand " | |
321 | "nucleotides. This may result in inaccurate distances. Try an " | |
322 | "alignment instead."); | |
323 | } | |
324 | ||
325 | // Warn about non ACGT residues. | |
326 | if (FLAGS & F_NON_ACGT) { | |
327 | warnx("The input sequences contained characters other than acgtACGT. " | |
328 | "These were automatically stripped to ensure correct results."); | |
329 | } | |
330 | ||
331 | model *M = malloc(n * n * sizeof(*M)); | |
300 | void calculate_distances(seq_t *sequences, size_t n) { | |
301 | struct model *M = NULL; | |
302 | ||
303 | // The maximum number of sequences is near 457'845'052. | |
304 | size_t intermediate = SIZE_MAX / sizeof(*M) / n; | |
305 | if (intermediate < n) { | |
306 | size_t root = (size_t)sqrt(SIZE_MAX / sizeof(*M)); | |
307 | err(1, "Comparison is limited to %zu sequences (%zu given).", root, n); | |
308 | } | |
309 | ||
310 | M = malloc(n * n * sizeof(*M)); | |
332 | 311 | if (!M) { |
333 | 312 | err(errno, "Could not allocate enough memory for the comparison " |
334 | 313 | "matrix. Try using --join or --low-memory."); |
385 | 364 | |
386 | 365 | // B is the new bootstrap matrix |
387 | 366 | struct model *B = malloc(n * n * sizeof(*B)); |
388 | if (!B) return 2; | |
367 | CHECK_MALLOC(B); | |
389 | 368 | |
390 | 369 | // Compute a number of new distance matrices |
391 | 370 | while (BOOTSTRAP--) { |
7 | 7 | |
8 | 8 | #include "sequence.h" |
9 | 9 | |
10 | void calculate_distances(seq_t *sequences, int n); | |
10 | void calculate_distances(seq_t *sequences, size_t n); | |
11 | 11 | |
12 | 12 | #endif |
18 | 18 | int dsa_init(dsa_t *A) { |
19 | 19 | // allocate at least 4 slots so the growth by 1.5 below doesn't get stuck |
20 | 20 | // at 3 slots. |
21 | A->data = malloc(sizeof(seq_t) * 4); | |
22 | if (!A->data) { | |
23 | return 1; | |
24 | } | |
21 | A->data = malloc(sizeof(*A->data) * 4); | |
22 | CHECK_MALLOC(A->data); | |
25 | 23 | |
26 | 24 | A->capacity = 4; |
27 | 25 | A->size = 0; |
35 | 33 | } else { |
36 | 34 | // use the near-optimal growth factor of 1.5 |
37 | 35 | seq_t *ptr = reallocarray(A->data, A->capacity / 2, sizeof(seq_t) * 3); |
38 | if (ptr == NULL) { | |
39 | err(errno, "out of memory?"); | |
40 | } | |
36 | CHECK_MALLOC(ptr); | |
41 | 37 | |
42 | 38 | A->capacity = (A->capacity / 2) * 3; |
43 | 39 | A->data = ptr; |
100 | 96 | |
101 | 97 | // A single malloc for the whole new sequence |
102 | 98 | char *ptr = malloc(total); |
103 | if (ptr == NULL) { | |
104 | return joined; | |
105 | } | |
99 | CHECK_MALLOC(ptr); | |
106 | 100 | char *next = ptr; |
107 | 101 | |
108 | 102 | // Copy all old sequences and add a `!` in between |
132 | 126 | */ |
133 | 127 | void seq_free(seq_t *S) { |
134 | 128 | free(S->S); |
135 | free(S->RS); | |
136 | 129 | free(S->name); |
137 | 130 | *S = (seq_t){}; |
138 | 131 | } |
144 | 137 | * @return The reverse complement. The caller has to free it! |
145 | 138 | */ |
146 | 139 | char *revcomp(const char *str, size_t len) { |
140 | if (!str) return NULL; | |
147 | 141 | char *rev = malloc(len + 1); |
148 | if (!str || !rev) return NULL; | |
142 | CHECK_MALLOC(rev); | |
149 | 143 | |
150 | 144 | char *r = rev; |
151 | 145 | const char *s = &str[len - 1]; |
182 | 176 | char *rev = revcomp(s, len); |
183 | 177 | |
184 | 178 | char *temp = realloc(rev, 2 * len + 2); |
185 | if (!temp) { | |
186 | free(rev); | |
187 | return NULL; | |
188 | } | |
179 | CHECK_MALLOC(temp); | |
189 | 180 | |
190 | 181 | rev = temp; |
191 | 182 | rev[len] = '#'; |
214 | 205 | } |
215 | 206 | |
216 | 207 | /** @brief Prepares a sequences to be used as the subject in a comparison. */ |
217 | int seq_subject_init(seq_t *S) { | |
218 | S->gc = calc_gc(S); | |
219 | S->RS = catcomp(S->S, S->len); | |
208 | int seq_subject_init(seq_subject *S, const seq_t *base) { | |
209 | S->gc = calc_gc(base); | |
210 | S->RS = catcomp(base->S, base->len); | |
220 | 211 | if (!S->RS) return 1; |
221 | S->RSlen = 2 * S->len + 1; | |
212 | S->RSlen = 2 * base->len + 1; | |
222 | 213 | return 0; |
223 | 214 | } |
224 | 215 | |
225 | 216 | /** @brief Frees some memory unused for when a sequence is only used as query. |
226 | 217 | */ |
227 | void seq_subject_free(seq_t *S) { | |
218 | void seq_subject_free(seq_subject *S) { | |
228 | 219 | free(S->RS); |
229 | 220 | S->RS = NULL; |
230 | 221 | S->RSlen = 0; |
242 | 233 | |
243 | 234 | *S = (seq_t){.S = strdup(seq), .name = strdup(name)}; |
244 | 235 | |
245 | if (!S->S || !S->name) { | |
246 | seq_free(S); | |
247 | return 2; | |
248 | } | |
236 | CHECK_MALLOC(S->S); | |
237 | CHECK_MALLOC(S->name); | |
249 | 238 | |
250 | 239 | normalize(S); |
251 | 240 | |
252 | 241 | // recalculate the length because `normalize` might have stripped some |
253 | 242 | // characters. |
254 | 243 | S->len = strlen(S->S); |
255 | ||
256 | const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2; | |
257 | if (S->len > LENGTH_LIMIT) { | |
258 | warnx("The input sequence %s is too long. The technical limit is %zu.", | |
259 | S->name, LENGTH_LIMIT); | |
260 | return 3; | |
261 | } | |
262 | 244 | |
263 | 245 | return 0; |
264 | 246 | } |
17 | 17 | typedef struct seq_s { |
18 | 18 | /** This is the DNAs forward strand as a string. */ |
19 | 19 | char *S; |
20 | /** The length of the forward strand. */ | |
21 | size_t len; | |
22 | /** A name for this sequence */ | |
23 | char *name; | |
24 | } seq_t; | |
25 | ||
26 | /** | |
27 | * @brief This structure enhances the usual sequence with its reverse | |
28 | * complement. | |
29 | */ | |
30 | typedef struct seq_subject { | |
20 | 31 | /** This member contains first the reverse strand and then the |
21 | 32 | forward strand. */ |
22 | 33 | char *RS; |
23 | /** The length of the forward strand. */ | |
24 | size_t len; | |
25 | 34 | /** Corresponds to strlen(RS) */ |
26 | 35 | size_t RSlen; |
27 | /** A name for this sequence */ | |
28 | char *name; | |
29 | 36 | /** |
30 | 37 | * @brief GC-Content |
31 | 38 | * |
32 | 39 | * The relative amount of G or C in the DNA. |
33 | 40 | */ |
34 | 41 | double gc; |
35 | } seq_t; | |
42 | } seq_subject; | |
36 | 43 | |
37 | 44 | void seq_free(seq_t *S); |
38 | int seq_subject_init(seq_t *S); | |
39 | void seq_subject_free(seq_t *S); | |
45 | int seq_subject_init(seq_subject *S, const seq_t *); | |
46 | void seq_subject_free(seq_subject *S); | |
40 | 47 | int seq_init(seq_t *S, const char *seq, const char *name); |
41 | 48 | |
42 | 49 | /** |
24 | 24 | typedef struct { |
25 | 25 | esa_s *C; |
26 | 26 | seq_t *S; |
27 | seq_subject subject; | |
27 | 28 | } esa_fixture; |
28 | 29 | |
29 | 30 | void assert_equal_lcp( const lcp_inter_t *a, const lcp_inter_t *b){ |
59 | 60 | }; |
60 | 61 | |
61 | 62 | g_assert( seq_init( ef->S, seq, "S0" ) == 0); |
62 | seq_subject_init( ef->S); | |
63 | g_assert( ef->S->RS != NULL); | |
64 | int check = esa_init( ef->C, ef->S); | |
63 | seq_subject_init( &ef->subject, ef->S); | |
64 | g_assert( ef->subject.RS != NULL); | |
65 | int check = esa_init( ef->C, &ef->subject); | |
65 | 66 | g_assert( check == 0); |
66 | 67 | } |
67 | 68 | |
85 | 86 | }; |
86 | 87 | |
87 | 88 | g_assert( seq_init( ef->S, seq, "S0" ) == 0); |
88 | seq_subject_init( ef->S); | |
89 | g_assert( ef->S->RS != NULL); | |
90 | int check = esa_init( ef->C, ef->S); | |
89 | seq_subject_init( &ef->subject, ef->S); | |
90 | g_assert( ef->subject.RS != NULL); | |
91 | int check = esa_init( ef->C, &ef->subject); | |
91 | 92 | g_assert( check == 0); |
92 | 93 | } |
93 | 94 |
21 | 21 | void test_seq_full(){ |
22 | 22 | |
23 | 23 | seq_t S; |
24 | seq_subject subject; | |
24 | 25 | |
25 | 26 | seq_init( &S, "ACGTTGCA", "name"); |
26 | int check = seq_subject_init( &S); | |
27 | int check = seq_subject_init( &subject, &S); | |
27 | 28 | |
28 | 29 | g_assert_cmpint(check, ==, 0); |
29 | 30 | |
30 | g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA"); | |
31 | g_assert_cmpuint(S.RSlen, ==, 8*2+1); | |
32 | g_assert( S.gc == 0.5); | |
31 | g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA"); | |
32 | g_assert_cmpuint(subject.RSlen, ==, 8*2+1); | |
33 | g_assert( subject.gc == 0.5); | |
33 | 34 | |
35 | seq_subject_free( &subject); | |
34 | 36 | seq_free( &S); |
35 | 37 | } |
36 | 38 | |
37 | 39 | void test_seq_nonacgt(){ |
38 | 40 | seq_t S; |
41 | seq_subject subject; | |
39 | 42 | |
40 | 43 | seq_init( &S, "11ACGTNN7682394689NNTGCA11", "name"); |
41 | seq_subject_init( &S); | |
44 | seq_subject_init( &subject, &S); | |
42 | 45 | |
43 | 46 | g_assert_cmpstr(S.S, ==, "ACGTTGCA"); |
44 | 47 | g_assert_cmpuint(S.len, ==, 8 ); |
45 | 48 | g_assert( FLAGS & F_NON_ACGT); |
46 | 49 | |
47 | g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA"); | |
48 | g_assert_cmpuint(S.RSlen, ==, 8*2+1); | |
49 | g_assert( S.gc == 0.5); | |
50 | g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA"); | |
51 | g_assert_cmpuint(subject.RSlen, ==, 8*2+1); | |
52 | g_assert( subject.gc == 0.5); | |
50 | 53 | |
54 | seq_subject_free( &subject); | |
51 | 55 | seq_free( &S); |
52 | 56 | |
53 | 57 | FLAGS = F_NONE; |
54 | 58 | |
55 | 59 | seq_init( &S, "@ACGT_!0TGCA ", "name"); |
56 | seq_subject_init( &S); | |
60 | seq_subject_init( &subject, &S); | |
57 | 61 | |
58 | 62 | g_assert_cmpstr(S.S, ==, "ACGT!TGCA"); |
59 | 63 | g_assert_cmpuint(S.len, ==, 9 ); |
60 | 64 | g_assert( FLAGS & F_NON_ACGT); |
61 | 65 | |
62 | g_assert_cmpstr(S.RS, ==, "TGCA;ACGT#ACGT!TGCA"); | |
63 | g_assert_cmpuint(S.RSlen, ==, 9*2+1); | |
66 | g_assert_cmpstr(subject.RS, ==, "TGCA;ACGT#ACGT!TGCA"); | |
67 | g_assert_cmpuint(subject.RSlen, ==, 9*2+1); | |
64 | 68 | |
69 | seq_subject_free( &subject); | |
65 | 70 | seq_free( &S); |
66 | 71 | |
67 | 72 | FLAGS = F_NONE; |