Commit upstream/0.10 - andi

+18

-9

.travis.yml less more

0	0	language: cpp
	1	os:
	2	- linux
	3	- osx
1	4	compiler:
2		- gcc
	5	- gcc
	6	- clang
3	7	sudo: false
4	8	addons:
5	9	apt:
6	10	sources:
7		- deadsnakes
8		- ubuntu-toolchain-r-test
	11	- deadsnakes
	12	- ubuntu-toolchain-r-test
9	13	packages:
10		- cmake
11		- g++-4.8
12		- libglib2.0-dev
13		- libgsl0-dev
	14	- cmake
	15	- g++-4.8
	16	- libglib2.0-dev
	17	- libgsl0-dev
14	18	install:
15	19	- export LIBDIVDIR="$HOME/libdivsufsort"
16	20	- pip install --user cpp-coveralls

19	23	- cd libdivsufsort-master && mkdir build && cd build
20	24	- cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" ..
21	25	- make && make install
	26	- if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install gsl; fi
	27	- if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install glib; fi
22	28	before_install:
23		- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
	29	- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
24	30
25	31	# - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
26	32	script:
	33	- export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$LIBDIVDIR/lib"
27	34	- export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib"
28	35	- export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH"
29	36	- cd $TRAVIS_BUILD_DIR

32	39	- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
33	40	- make
34	41	- make check
	42	- export MYFLAGS="-I$LIBDIVDIR/include"
	43	- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
35	44	- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\""
36	45	- tar xzvf andi-*.tar.gz
37	46	- cd andi-*

40	49	- make check
41	50	- cd ..
42	51	after_success:
43		- coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'
	52	- if [ "${TRAVIS_OS_NAME}" = "linux" -a "$CXX" = "g++-4.8" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi

+37

-24

README.md less more

6	6	This readme covers all necessary instructions for the impatient to get `andi` up and running. For extensive instructions please consult the [manual](andi-manual.pdf).
7	7
8	8
9		# Build Instructions
	9	# Installation and Usage
10	10
11		For the latest [stable release](https://github.com/EvolBioInf/andi/releases) of `andi` download the tar ball. If you'd like to contribute to this software, feel free to create a fork of our [git repository](https://github.com/EvolBioInf/andi) and send pull requests.
	11	Stable versions of `andi` are available via package managers. For manual installation see below.
	12
	13	For Debian and Ubuntu (starting 16.04):
	14
	15	sudo apt-get install andi
	16
	17	For OS X with Homebrew:
	18
	19	brew install science/andi
	20
	21	For ArchLinux with aura:
	22
	23	sudo aura -A andi
	24
	25	With a successful installation you can get the usage instructions via `--help` or the man page.
	26
	27	$ andi --help
	28	$ man andi
	29
	30	You can simply use `andi` with your genomes in `FASTA` format.
	31
	32	$ andi S1.fasta S2.fasta
	33	2
	34	S1 0.0 0.1
	35	s2 0.1 0.0
	36
	37	From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description.
12	38
13	39
14		## Prerequisites
	40	## Manual installation
15	41
16		This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
	42	If your system does not support one of the above package managers you have to manually build the latest [stable release](https://github.com/EvolBioInf/andi/releases) from a tarball. See the [manual](andi-manual.pdf) for extensive building instructions.
17	43
18
19		## Compiling
	44	This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools.
20	45
21	46	Assuming you have installed all prerequisites, building is as easy as follows.
22	47
23		$ ./configure
24		$ make
25		$ make install
	48	$ autoreconf -fi -Im4 # optional when build from tarball
	49	$ ./configure
	50	$ make
	51	$ make install
26	52
27		Excessive build instructions are located in `INSTALL`. If the build was successful you can get the usage instructions via `--help` or the man page.
28
29		$ andi --help
30		$ man andi
31
32		You can simply use `andi` with your genomes in `FASTA` format.
33
34		$ andi S1.fasta S2.fasta
35		2
36		S1 0.0 0.1
37		s2 0.1 0.0
38
39		From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description.
40
	53	Excessive build instructions are located in `INSTALL`.
41	54
42	55	# Links and Additional Resources
43	56

53	66
54	67	## License
55	68
56		Copyright © 2014, 2015 Fabian Klötzl
	69	Copyright © 2014 - 2016 Fabian Klötzl
57	70	License GPLv3+: GNU GPL version 3 or later.
58	71
59	72	This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. The full license text is available at <http://gnu.org/licenses/gpl.html>.

andi-manual.pdf less more

Binary diff not shown

+5

-1

configure.ac less more

0		AC_INIT([andi], [0.9.6.1])
	0	AC_INIT([andi], [0.10])
1	1	AM_INIT_AUTOMAKE([-Wall foreign ])
2	2
3	3	AC_CONFIG_MACRO_DIR([m4])

31	31
32	32	AS_IF([test "x$with_libdivsufsort" != "xno"],
33	33	[
	34	# The libdivsufsort header contains some Microsoft extension making
	35	# compilation fail on certain systems (i.e. OS X). Add the following
	36	# flag so the build runs smoothly.
	37	CPPFLAGS="$CPPFLAGS -fms-extensions"
34	38	AC_CHECK_HEADERS([divsufsort.h],[have_libdivsufsort=yes],[have_libdivsufsort=no])
35	39	AC_CHECK_LIB(divsufsort, divsufsort, [], [have_libdivsufsort=no])
36	40	],

+4

-4

docs/andi.1.in less more

0		.TH ANDI "1" "June 2015" "@VERSION@" ""
	0	.TH ANDI "1" "2016-03-05" "@VERSION@" "andi manual"
1	1	.SH NAME
2	2	andi \- estimates evolutionary distance
3	3	.SH SYNOPSIS

10	10	The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse (K>0.5) for our method to work properly.
11	11	.SH OPTIONS
12	12	.TP
13		\fB\-b, \-\-bootstrap\fR <INT>
	13	\fB\-b\fR, \fB\-\-bootstrap\fR <INT>
14	14	Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016, in review) for a detailed explanation.
15	15	.TP
16	16	\fB\-j\fR, \fB\-\-join\fR

25	25	\fB\-p\fR <FLOAT>
26	26	Significance of an anchor pair; default: 0.05.
27	27	.TP
28		\fB\-t\fR <INT>
	28	\fB\-t\fR, \fB\-\-threads\fR <INT>
29	29	The number of threads to be used; by default, all available processors are used.
30	30	.br
31	31	Multithreading is only available if \fBandi\fR was compiled with OpenMP support.

39	39	\fB\-\-version\fR
40	40	Outputs version information and acknowledgments.
41	41	.SH COPYRIGHT
42		Copyright \(co 2014, 2015 Fabian Klötzl
	42	Copyright \(co 2014 - 2016 Fabian Klötzl
43	43	License GPLv3+: GNU GPL version 3 or later.
44	44	.br
45	45	This is free software: you are free to change and redistribute it.

+90

-21

docs/manual/andi-manual.tex less more

14	14	\usepackage{amsthm}
15	15	\usepackage{acronym}
16	16	\usepackage{amssymb}
17		\usepackage{tikz}
18		\usepackage{pgfplots}
19	17	\usepackage{caption}
20	18	\usepackage{subcaption}
21		\usepackage{booktabs}
22	19	\usepackage{xspace}
23		\usepackage{overpic}
24	20
25	21	\bibliographystyle{alpha}
26	22

108	104
109	105	The easiest way to install \andi is via a package manager. This also handles all dependencies for you.
110	106
	107
	108	\noindent Debian and Ubuntu (since 16.04):
	109
	110	\begin{lstlisting}
	111	~ % sudo apt-get install andi
	112	\end{lstlisting}
	113
111	114	\noindent OS X with homebrew:
112	115
113	116	\begin{lstlisting}

118	121
119	122	\begin{lstlisting}
120	123	~ % aura -A andi
121		\end{lstlisting}
122
123		\noindent Debian and Ubuntu (soon):
124
125		\begin{lstlisting}
126		~ % sudo apt-get install andi
127	124	\end{lstlisting}
128	125
129	126	\andi is intended to run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}).

188	185	\begin{lstlisting}
189	186	~ % git clone git@github.com:EvolBioInf/andi.git
190	187	~ % cd andi
191		~/andi % autoreconf -i
192		\end{lstlisting}
193
194		\noindent For old versions of \algo{autoconf} you may instead have to use \lstinline$autoreconf -i -Im4$.
	188	~/andi % autoreconf -fi -Im4
	189	\end{lstlisting}
195	190
196	191	\noindent Continue with the \algo{Gnu} trinity as described in Section~\ref{sub:regular}.
197	192

268	263
269	264	By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. Other evolutionary models are also implemented (Kimura, raw). The \lstinline$--model$ parameter can be used to switch between them.
270	265
271		Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}.
	266	Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}. For more information on computing support values from distance matrices see \cite{afra}.
272	267
273	268	\begin{lstlisting}
274	269	~ % andi -b 2 ../test/1M.1.fasta

332	327	\end{figure}
333	328
334	329
	330	\chapter{Warnings and Errors}
	331
	332	Here be an explanation of all possible errors. Other errors may occur and are due to the failure of underlying functions (e.\,g.~\lstinline$read(3)$). All warning messages are printed to \lstinline$stderr$. Most errors are non-recoverable and will result in \andi exiting with a non-zero state.
	333
	334	\section{Sequence Related Messages}
	335
	336	\subsection*{Unexpected Character}
	337
	338	\andi is pretty pedantic about the formatting of \algo{FASTA} files. If you violate the syntax, \andi will print the file name, the line and the problematic character. These errors are non-recovering, meaning no further sequences are read from the invalid file. The checks are implemented by the \href{https://github.com/kloetzl/pfasta}{\algo{pfasta}} library.
	339
	340
	341	\subsection*{Non acgtACGT Nucleotides Stripped}
	342
	343	Our models of genome evolution (JC, Kimura) only work on the four canonical types of nucleotides. All others are stripped from the sequences. This can be ignored in most cases.
	344
	345	\subsection*{Too Short Sequence}
	346
	347	\andi was designed for big data sets of whole genomes. On short sequences the distance estimates are inaccurate. Use an multiple sequence alignment instead.
	348
	349	\subsection*{Too Long Sequence}
	350
	351	\algo{libdivsufsort} limits the length of a sequence to 31 bits. That count includes the reverse complement. So the technical limit for a sequence analysis is $2^{30} = 1.073.741.824$. Unfortunately, that excludes (full) human and mice genomes. Per-chromosome analysis works just fine.
	352
	353	\subsection*{Empty Sequence}
	354
	355	One of the given sequences contained either no nucleotides at all, or only non-canonical ones.
	356
	357	\subsection*{Less than two sequences given}
	358
	359	As \andi tries to compare sequences, at least two need to be supplied. Note that \andi may have regarded some of your given sequences as unusable.
	360
	361	\subsection*{Maximum Number of Sequences}
	362
	363	The maximum number of sequences \andi can possible compare is huge (roughly $457.845.052$). I doubt anyone will ever reach that limit. Please send me a mail, if you do.
	364
	365	\section{Technical Messages}
	366
	367	\subsection*{Out of Memory}
	368
	369	If \andi runs out of memory, it gives up. Either free memory, run \andi on a bigger machine, try the \lstinline$--low-memory$ mode or reduce the number of threads.
	370
	371	\subsection*{RNG allocation}
	372
	373	Some technical thing failed. If it keeps failing repeatedly, file a bug.
	374
	375	\subsection*{Bootstrapping failed}
	376
	377	This should not happen.
	378
	379	\subsection*{Failed index creation}
	380
	381	This should not happen, either.
	382
	383	\subsection*{Skipped and ignored Arguments}
	384
	385	Some command line parameters of \andi require arguments. If these are not of the expected type, a warning is given. See Section~\ref{sec:options} for their correct usage.
	386
	387
	388	\section{Output-related Warnings}
	389
	390	As the input sequences get more evolutionary divergent, \andi finds less anchors. With less anchors, less nucleotides are cosidered homologous between two sequences. If no anchors are found comparison fails and \lstinline!nan! is printed instead. See out paper and especiallt Figure~2 for details.
	391
	392	\subsection*{NaN}
	393
	394	No anchors were found. Your sequences are very divergent ($d>0.5$) or sprout a lot of indels that make comparison difficult.
	395
	396	\subsection*{Little Homology}
	397
	398	Very few anchors were found and thus only a tiny part of the sequences is considered homologous. Expect that the given distance is errorneous.
335	399
336	400
337	401	\chapter{DevOps} %%%%%

375	439	~/andi % make check
376	440	\end{lstlisting}
377	441
378		\noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 80\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}}
	442	\noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 75\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}} Unfortunately, coveralls is broken at this point in time.
379	443
380	444	\section{Known Issues}
381	445
382	446	These minor issues are known. I intend to fix them, when I have time.
383	447
384	448	\begin{enumerate}
385		\item This code will not work under Windows. At two places Unix-only code is used: filepath-seperators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
	449	\item This code will not work under Windows. At two places Unix-only code is used: filepath-separators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
	450	\item Only the first 10 characters are printed in the output matrix. This can make long gi numbers indistinguishable. In version 0.10 we started printing a warning for this.
	451	\item Unit tests for the bootstrapped matrices are missing.
	452	\item Cached intervals are sometimes not “as deep as they could be”. If that got fixed \lstinline$get_match_cache$ could bail out on \lstinline$ij.lcp < CACHE_LENGTH$. However the \lstinline$esa_init_cache$ code is the most fragile part and should be handled with care.
386	453	\end{enumerate}
387	454
388	455

392	459
393	460	%\subsection{Preparing a new Release}
394	461
395		Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This assures that there is only one commit and build with that specific version. Now return to the previous commit \lstinline$git checkout vX.y$.
	462	Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Tags should be annotated and signed, if possible. This manual then needs manual rebuilding.
396	463
397	464	Ensure that \andi is ready for packaging with \algo{autoconf}.
398	465

413	480	=================================================
414	481	\end{lstlisting}
415	482
416		If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}.
417
418
	483	If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}.
	484
	485	Also verify that the recent changes did not create a performance regression. This includes testing both ends of the scale: \eco and \pneu. Both should be reasonable close to previous releases.
	486
	487	Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This assures that there is only one commit and build with that specific version.
419	488
420	489	\backmatter
421	490	%\addcontentsline{toc}{chapter}{Bibliography}

+15

-1

docs/manual/references.bib less more

44	44	Author = {Chris Lattner and Vikram Adve},
45	45	Title = {{LLVM}: A Compilation Framework for Lifelong Program
46	46	Analysis and Transformation},
47		Booktitle = "Code Generation and Optimizatio",
	47	Booktitle = "Code Generation and Optimization",
48	48	Month = {Mar},
49	49	Year = {2004},
50	50	pages = {75--88},

108	108	title={Efficient Estimation of Evolutionary Distances}
109	109	}
110	110
	111	@article{afra,
	112	AUTHOR = {Klötzl, Fabian and Haubold, Bernhard},
	113	TITLE = {Support Values for Genome Phylogenies},
	114	JOURNAL = {Life},
	115	VOLUME = {6},
	116	YEAR = {2016},
	117	NUMBER = {1},
	118	PAGES = {11},
	119	URL = {http://www.mdpi.com/2075-1729/6/1/11},
	120	ISSN = {2075-1729},
	121	DOI = {10.3390/life6010011}
	122	}
111	123
	124
	125

+2

-0

opt/strchrnul.c less more

0	0	/* @brief Here follows a simple implementation of the GNU function `strchrnul`.
1	1	* Please check the gnulib manual for details.
2	2	*/
	3	#include <string.h>
	4
3	5	char strchrnul(const char s, int c){
4	6	char *p = strchr(s,c);
5	7

+20

-0

scripts/failed.zsh less more

	0	#!/usr/bin/zsh
	1
	2	# Compute the number of failing comparisons for different distances.
	3
	4	DISTS=(0.1 0.2 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7)
	5
	6	LENGTH=100000
	7
	8	for dist in $DISTS; do
	9	echo "" > est_$dist.dist
	10	for (( i = 0; i < 1000; i++ )); do
	11	../test/test_fasta -l $LENGTH -d $dist > temp.fa
	12	../src/andi ./temp.fa > est.dist 2> /dev/null
	13	tail -n 1 est.dist >> est_$dist.dist
	14	done
	15	avg=$(cat est_$dist.dist \| awk '"nan" !~ $2 {sum+=$2;c++}END{print sum/c}')
	16	sd=$(grep -v 'nan' est_$dist.dist \| awk '{a[c++]=$2;aa+=$2}END{aa/=NR;for(c=0;c<NR;c++){t=a[c]-aa;sd+=t*t}print sqrt(sd/(NR-1))}')
	17	failed=$(cat est_$dist.dist \| grep -c 'nan')
	18	echo $dist "\t" $avg"\t±" $sd "\t" $failed
	19	done

+1

-0

src/Makefile.am less more

15	15
16	16	.PHONY: perf
17	17	perf: CFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
	18	perf: CXXFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
18	19	perf: andi

+53

-16

src/andi.c less more

24	24	#include <assert.h>
25	25	#include <errno.h>
26	26	#include <getopt.h>
	27	#include <limits.h>
27	28	#include <stdio.h>
28	29	#include <stdlib.h>
29	30	#include <string.h>

54	55	*
55	56	* The main function reads and parses the commandline arguments. Depending on
56	57	* the set flags it reads the input files and forwards the contained sequences
57		* to processing.
	58	* to processing. Also it verifies the input for correctness and issues warnings
	59	* and errors.
58	60	*/
59	61	int main(int argc, char *argv[]) {
60	62	int c;

133	135
134	136	if (threads > (long unsigned int)omp_get_num_procs()) {
135	137	warnx(
136		"The number of threads to be used, is greater then the "
	138	"The number of threads to be used, is greater than the "
137	139	"number of available processors; Ignoring -t %lu "
138	140	"argument.",
139	141	threads);

173	175	} else if (strcasecmp(optarg, "KIMURA") == 0) {
174	176	MODEL = M_KIMURA;
175	177	} else {
176		warnx("Ignoring argument for --model. Expected Raw JC or Kimura");
	178	warnx("Ignoring argument for --model. Expected Raw, JC or "
	179	"Kimura");
177	180	}
178	181	break;
179	182	}

195	198	}
196	199
197	200	dsa_t dsa;
198		if (dsa_init(&dsa)) {
199		errx(errno, "Out of memory.");
200		}
	201	dsa_init(&dsa);
201	202
202	203	const char *file_name;
203	204

222	223
223	224	size_t n = dsa_size(&dsa);
224	225
	226	if (n < 2) {
	227	errx(1,
	228	"I am truly sorry, but with less than two sequences (%zu given) "
	229	"there is nothing to compare.",
	230	n);
	231	}
	232
225	233	if (FLAGS & F_VERBOSE) {
226	234	fprintf(stderr, "Comparing %zu sequences\n", n);
227	235	fflush(stderr);

235	243	// seed the random number generator with the current time
236	244	gsl_rng_set(RNG, time(NULL));
237	245
238		seq_t *sequences = dsa_data(&dsa);
	246	// Warn about non ACGT residues.
	247	if (FLAGS & F_NON_ACGT) {
	248	warnx("The input sequences contained characters other than acgtACGT. "
	249	"These were automatically stripped to ensure correct results.");
	250	}
	251
	252	// validate sequence correctness
	253	const seq_t *seq = dsa_data(&dsa);
	254	for (size_t i = 0; i < n; ++i, seq++) {
	255	if (strlen(seq->name) > 10) {
	256	warnx("The sequence name '%s' is longer than ten characters. It "
	257	"will be truncated in the output to '%.10s'.",
	258	seq->name, seq->name);
	259	}
	260
	261	const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2;
	262	if (seq->len > LENGTH_LIMIT) {
	263	errx(1, "The sequence %s is too long. The technical limit is %zu.",
	264	seq->name, LENGTH_LIMIT);
	265	}
	266
	267	if (seq->len == 0) {
	268	errx(1, "The sequence %s is empty.", seq->name);
	269	}
	270
	271	if (seq->len < 1000) {
	272	FLAGS \|= F_SHORT;
	273	}
	274	}
	275
	276	if (FLAGS & F_SHORT) {
	277	warnx("One of the given input sequences is shorter than a thousand "
	278	"nucleotides. This may result in inaccurate distances. Try an "
	279	"alignment instead.");
	280	}
	281
239	282	// compute distance matrix
240		if (n >= 2) {
241		calculate_distances(sequences, n);
242		} else {
243		warnx("I am truly sorry, but with less than two sequences (%zu given) "
244		"there is nothing to compare.",
245		n);
246		}
	283	calculate_distances(dsa_data(&dsa), n);
247	284
248	285	dsa_free(&dsa);
249	286	gsl_rng_free(RNG);

288	325	void version(void) {
289	326	const char str[] = {
290	327	"andi " VERSION "\n"
291		"Copyright (C) 2014, 2015 Fabian Klötzl\n"
	328	"Copyright (C) 2014 - 2016 Fabian Klötzl\n"
292	329	"License GPLv3+: GNU GPL version 3 or later "
293	330	"<http://gnu.org/licenses/gpl.html>\n"
294	331	"This is free software: you are free to change and redistribute it.\n"

301	338	"Sequence Analysis, Genome Rearrangements, and Phylogenetic "
302	339	"Reconstruction. pp 118f.\n"
303	340	"3) SA construction: Mori, Y. (2005). Short description of improved "
304		"two-stage suffix sorting alorithm. "
	341	"two-stage suffix sorting algorithm. "
305	342	"http://homepage3.nifty.com/wpage/software/itssort.txt\n"};
306	343	printf("%s", str);
307	344	exit(EXIT_SUCCESS);

+7

-6

src/dist_hack.h less more

28	28	* @param n - The number of sequences
29	29	* @param M - A matrix for additional output data
30	30	*/
31		void NAME(struct model M, seq_t sequences, size_t n) {
	31	void NAME(struct model M, const seq_t sequences, size_t n) {
32	32	size_t i;
33	33
34	34	//#pragma
35	35	P_OUTER
36	36	for (i = 0; i < n; i++) {
37		seq_t *subject = &sequences[i];
	37	seq_subject subject;
38	38	esa_s E;
39	39
40		if (seq_subject_init(subject) \|\| esa_init(&E, subject)) {
41		errx(1, "Failed to create index for %s.", subject->name);
	40	if (seq_subject_init(&subject, &sequences[i]) \|\|
	41	esa_init(&E, &subject)) {
	42	errx(1, "Failed to create index for %s.", sequences[i].name);
42	43	}
43	44
44	45	// now compare every other sequence to i

59	60
60	61	size_t ql = sequences[j].len;
61	62
62		M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject->gc);
	63	M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.gc);
63	64	}
64	65
65	66	esa_free(&E);
66		seq_subject_free(subject);
	67	seq_subject_free(&subject);
67	68	}
68	69	}

+56

-65

src/esa.c less more

16	16	#include <string.h>
17	17	#include <assert.h>
18	18	#include "esa.h"
	19	#include "global.h"
19	20
20	21	static void esa_init_cache_dfs(esa_s , char str, size_t pos, lcp_inter_t in);
21	22	static void esa_init_cache_fill(esa_s , char str, size_t pos, lcp_inter_t in);

69	70	* @returns 0 iff successful
70	71	*/
71	72	int esa_init_cache(esa_s *self) {
72		lcp_inter_t *cache =
73		malloc((1 << (2 * CACHE_LENGTH)) * sizeof(lcp_inter_t));
74
75		if (!cache) {
76		return 1;
77		}
	73	lcp_inter_t cache = malloc((1 << (2 CACHE_LENGTH)) * sizeof(*cache));
	74	CHECK_MALLOC(cache);
78	75
79	76	self->cache = cache;
80	77

127	124	continue;
128	125	}
129	126
	127	if (ij.l <= (ssize_t)(pos + 1)) {
	128	// Continue one level deeper
	129	// This is the usual case
	130	esa_init_cache_dfs(C, str, pos + 1, ij);
	131	continue;
	132	}
	133
130	134	// The LCP-interval is deeper than expected
131		if (ij.l > (ssize_t)(pos + 1)) {
132
133		// Check if it still fits into the cache
134		if ((size_t)ij.l < CACHE_LENGTH) {
135
136		// fill with dummy value
137		esa_init_cache_fill(C, str, pos + 1, in);
138
139		char non_acgt = 0;
140
141		// fast forward
142		size_t k = pos + 1;
143		for (; k < (size_t)ij.l; k++) {
144		// In some very edgy edge cases the lcp-interval `ij`
145		// contains a `;` or another non-acgt character. Since we
146		// cannot cache those, break.
147		char c = C->S[C->SA[ij.i] + k];
148		if (char2code(c) < 0) {
149		non_acgt = 1;
150		break;
151		}
152
153		str[k] = c;
154		}
155
156		if (non_acgt) {
157		esa_init_cache_fill(C, str, k, ij);
158		} else {
159		esa_init_cache_dfs(C, str, k, ij);
160		}
161
162		continue;
163		}
164
	135	// Check if it still fits into the cache
	136	if ((size_t)ij.l >= CACHE_LENGTH) {
165	137	// If the lcp-interval exceeds the cache depth, stop here and fill
166	138	esa_init_cache_fill(C, str, pos + 1, in);
167	139	continue;
168	140	}
169	141
170		// Continue one level deeper
171		esa_init_cache_dfs(C, str, pos + 1, ij);
	142	/* At this point the prefix `str` of length `pos` has been found.
	143	* However, the call to `getInterval` above found an interval with
	144	* an LCP value bigger than `pos`. This means that not all elongations
	145	* (more precise: just one) of `str` appear in the subject. Thus fill
	146	* all values with the matched result to far and continue only with
	147	* the one special substring.
	148	*/
	149	esa_init_cache_fill(C, str, pos + 1, in);
	150
	151	char non_acgt = 0;
	152
	153	// fast forward
	154	size_t k = pos + 1;
	155	for (; k < (size_t)ij.l; k++) {
	156	// In some very edgy edge cases the lcp-interval `ij`
	157	// contains a `;` or another non-acgt character. Since we
	158	// cannot cache those, break.
	159	char c = C->S[C->SA[ij.i] + k];
	160	if (char2code(c) < 0) {
	161	non_acgt = 1;
	162	break;
	163	}
	164
	165	str[k] = c;
	166	}
	167
	168	if (non_acgt) {
	169	esa_init_cache_fill(C, str, k, ij);
	170	} else {
	171	esa_init_cache_dfs(C, str, k, ij);
	172	}
172	173	}
173	174	}
174	175

215	216	size_t len = self->len;
216	217
217	218	char *FVC = self->FVC = malloc(len);
218		if (!FVC) {
219		return 1;
220		}
	219	CHECK_MALLOC(FVC);
221	220
222	221	const char *S = self->S;
223	222	const int *SA = self->SA;

238	237	* @param S - The sequence
239	238	* @returns 0 iff successful
240	239	*/
241		int esa_init(esa_s C, const seq_t S) {
242		if (!C \|\| !S \|\| !S->S) return 1;
	240	int esa_init(esa_s C, const seq_subject S) {
	241	if (!C \|\| !S \|\| !S->RS) return 1;
243	242
244	243	*C = (esa_s){.S = S->RS, .len = S->RSlen};
245	244

284	283	return 1;
285	284	}
286	285
287		C->SA = malloc(C->len * sizeof(saidx_t));
288		if (!C->SA) {
289		return 2;
290		}
	286	C->SA = malloc(C->len * sizeof(*C->SA));
	287	CHECK_MALLOC(C->SA);
291	288
292	289	saidx_t result = 1;
293	290

310	307	if (!C \|\| !C->LCP) {
311	308	return 1;
312	309	}
313		saidx_t CLD = C->CLD = malloc((C->len + 1) sizeof(saidx_t));
314		if (!C->CLD) {
315		return 2;
316		}
	310	saidx_t CLD = C->CLD = malloc((C->len + 1) sizeof(*CLD));
	311	CHECK_MALLOC(CLD);
317	312
318	313	saidx_t *LCP = C->LCP;
319	314
320	315	typedef struct pair_s { saidx_t idx, lcp; } pair_t;
321	316
322		pair_t stack = malloc((C->len + 1) sizeof(pair_t));
	317	pair_t stack = malloc((C->len + 1) sizeof(*stack));
	318	CHECK_MALLOC(stack);
323	319	pair_t *top = stack; // points at the topmost filled element
324	320	pair_t last;
325	321

378	374
379	375	// Allocate new memory
380	376	// The LCP array is one element longer than S.
381		saidx_t LCP = C->LCP = malloc((len + 1) sizeof(saidx_t));
382		if (!LCP) {
383		return 3;
384		}
	377	saidx_t LCP = C->LCP = malloc((len + 1) sizeof(*LCP));
	378	CHECK_MALLOC(LCP);
385	379
386	380	LCP[0] = -1;
387	381	LCP[len] = -1;
388	382
389	383	// Allocate temporary arrays
390		saidx_t PHI = malloc(len sizeof(saidx_t));
	384	saidx_t PHI = malloc(len sizeof(*PHI));
391	385	saidx_t *PLCP = PHI;
392		if (!PHI) {
393		free(PHI);
394		return 2;
395		}
	386	CHECK_MALLOC(PHI);
396	387
397	388	PHI[SA[0]] = -1;
398	389	saidx_t k;

+2

-1

src/esa.h less more

5	5	#ifndef _ESA_H_
6	6	#define _ESA_H_
7	7
	8	#include <sys/types.h>
8	9	#include "sequence.h"
9	10	#include "config.h"
10	11

67	68
68	69	lcp_inter_t get_match_cached(const esa_s , const char query, size_t qlen);
69	70	lcp_inter_t get_match(const esa_s , const char query, size_t qlen);
70		int esa_init(esa_s , const seq_t S);
	71	int esa_init(esa_s , const seq_subject S);
71	72	void esa_free(esa_s *);
72	73
73	74	#ifdef DEBUG

+13

-0

src/global.h less more

8	8	#define _GLOBAL_H_
9	9	#include <gsl/gsl_rng.h>
10	10
	11	#include <err.h>
11	12	#include "config.h"
12	13
13	14	/**

61	62	F_SHORT = 64
62	63	};
63	64
	65	/**
	66	* @brief This macro is used to unify the checks for the return value of malloc.
	67	*
	68	* @param PTR - The pointer getting checked.
	69	*/
	70	#define CHECK_MALLOC(PTR) \
	71	do { \
	72	if (PTR == NULL) { \
	73	err(errno, "Out of memory"); \
	74	} \
	75	} while (0);
	76
64	77	#endif

+18

-13

src/io.c less more

4	4	#define _GNU_SOURCE
5	5	#include <fcntl.h>
6	6	#include <math.h>
	7	#include <limits.h>
7	8	#include <stdio.h>
8	9	#include <string.h>
9	10	#include <unistd.h>

23	24	* "I didn't learn joined up handwriting for nothing, you know."
24	25	* ~ Gilderoy Lockhart
25	26	*
26		* @param in - The file pointer to read from.
27		* @param dsa - An array that holds found sequences.
28		* @param name - The name of the file to be used for the name of the sequence.
	27	* @param file_name - The name of the file to be used for reading. The name is
	28	* also used to infer the sequence name.
	29	* @param dsa - (output parameter) An array that holds found sequences.
29	30	*/
30	31	void read_fasta_join(const char file_name, dsa_t dsa) {
31	32	if (!file_name \|\| !dsa) return;

46	47	*/
47	48
48	49	const char *left = strrchr(file_name, '/'); // find the last path separator
49		left = (left == NULL) ? file_name : left + 1; // left is the position one of
50		// to the right of the path
51		// separator
	50	left = (left == NULL) ? file_name : left + 1;
	51	// left is the position one of to the right of the path separator
52	52
53	53	const char *dot = strchrnul(left, '.'); // find the extension
54	54
55		joined.name = strndup(
56		left, dot - left); // copy only the file name, not its path or extension
	55	// copy only the file name, not its path or extension
	56	joined.name = strndup(left, dot - left);
	57	CHECK_MALLOC(joined.name);
	58
57	59	dsa_push(dsa, joined);
58	60	dsa_free(&single);
59	61	}
60	62
61	63	/**
62	64	* @brief This function reads sequences from a file.
63		* @param in - The file pointer to read from.
64		* @param dsa - An array that holds found sequences.
	65	* @param file_name - The file to read.
	66	* @param dsa - (output parameter) An array that holds found sequences.
65	67	*/
66	68	void read_fasta(const char file_name, dsa_t dsa) {
67	69	if (!file_name \|\| !dsa) return;

123	125	int use_scientific = 0;
124	126
125	127	double DD = malloc(n n * sizeof(*DD));
126		if (!DD) err(errno, "meh.");
	128	CHECK_MALLOC(DD);
127	129
128	130	#define DD(X, Y) (DD[(X)*n + (Y)])
129	131

132	134
133	135	switch (MODEL) {
134	136	case M_RAW: estimate = &estimate_RAW; break;
	137	default:
	138	/* intentional fall-through. This is just here to silence any
	139	* compiler warnings. The real default is set in andi.c.*/
135	140	case M_JC: estimate = &estimate_JC; break;
136	141	case M_KIMURA: estimate = &estimate_KIMURA; break;
137	142	}

167	172
168	173	printf("%zu\n", n);
169	174	for (i = 0; i < n; i++) {
170		// Print exactly nine characters of the name. Pad with spaces if
	175	// Print exactly ten characters of the name. Pad with spaces if
171	176	// necessary.
172		printf("%-9.9s", sequences[i].name);
	177	printf("%-10.10s", sequences[i].name);
173	178
174	179	for (j = 0; j < n; j++) {
175	180	// use scientific notation for small numbers

+22

-10

src/model.c less more

157	157	* @brief Given an anchor, classify nucleotides.
158	158	*
159	159	* For anchors we already know that the nucleotides of the subject and the query
160		* are equal. Thus only one sequence has to be analysed. See `model_count` for
161		* an explanation of the algorithm.
	160	* are equal. Thus only one sequence has to be analysed. Most models don't
	161	* actually care about the individual nucleotides as long as they are equal in
	162	* the two sequences. For these models, we just assume equal distribution.
162	163	*
163	164	* @param MM - The mutation matrix
164	165	* @param S - The subject
165	166	* @param len - The anchor length
166	167	*/
167	168	void model_count_equal(model MM, const char S, size_t len) {
	169	if (MODEL == M_RAW \|\| MODEL == M_JC \|\| MODEL == M_KIMURA) {
	170	size_t fourth = len / 4;
	171	MM->counts[AtoA] += fourth;
	172	MM->counts[CtoC] += fourth;
	173	MM->counts[GtoG] += fourth;
	174	MM->counts[TtoT] += fourth + (len & 3);
	175	return;
	176	}
	177
	178	// Fall-back algorithm for future models. Note, as this works on a
	179	// per-character basis it is slow.
	180
168	181	size_t local_counts[4] = {0};
169	182
170	183	for (; len--;) {
171	184	char s = *S++;
172	185
	186	// ';!#' are all smaller than 'A'
173	187	if (s < 'A') {
	188	// Technically, s can only be ';' at this point.
174	189	continue;
175	190	}
176	191
177		unsigned char nibble_s = s & 7;
178
179		static const unsigned int mm1 = 0x20031000;
180
181		unsigned char index = (mm1 >> (4 * nibble_s)) & 0x3;
182		local_counts[index]++;
	192	// The four canonical nucleotides can be uniquely identified by the bits
	193	// 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed.
	194	local_counts[(s >> 1) & 3]++;
183	195	}
184	196
185	197	MM->counts[AtoA] += local_counts[0];
186	198	MM->counts[CtoC] += local_counts[1];
187		MM->counts[GtoG] += local_counts[2];
188		MM->counts[TtoT] += local_counts[3];
	199	MM->counts[GtoG] += local_counts[3];
	200	MM->counts[TtoT] += local_counts[2];
189	201	}
190	202
191	203	/**

+21

-42

src/process.c less more

3	3	* @file
4	4	* @brief This file contains various distance methods.
5	5	*/
	6	#include <assert.h>
	7	#include <math.h>
	8	#include <stdint.h>
	9	#include <stdio.h>
6	10	#include <stdlib.h>
7	11	#include <string.h>
8		#include <assert.h>
9		#include <math.h>
10		#include <stdio.h>
11	12	#include "esa.h"
12	13	#include "global.h"
13	14	#include "io.h"

15	16	#include "process.h"
16	17	#include "sequence.h"
17	18
18		#include <time.h>
19	19	#include <gsl/gsl_rng.h>
20	20	#include <gsl/gsl_randist.h>
21	21

53	53	/**
54	54	* @brief Calculates the binomial coefficient of n and k.
55	55	*
56		* We used to use gsl_sf_lnchoose(xx,kk) for this functionality.
57		* After all, why implement something that has already been done?
58		* Well, the reason is simplicity: GSL is used for only this one
59		* function and the input (n<=20) is not even considered big.
60		* Hence its much easier to have our own implementation and ditch
61		* the GSL dependency even if that means our code is a tiny bit
62		* less optimized and slower.
	56	* We could (and probalby should) use gsl_sf_lnchoose(xx,kk) for this.
63	57	*
64	58	* @param n - The n part of the binomial coefficient.
65	59	* @param k - analog.

206	200	model_count_equal(&ret, query + last_pos_Q, last_length);
207	201
208	202	// Count the SNPs in between.
209		model_count(&ret, C->S + last_pos_S + last_length, query + last_pos_Q + last_length,
	203	model_count(&ret, C->S + last_pos_S + last_length,
	204	query + last_pos_Q + last_length,
210	205	this_pos_Q - last_pos_Q - last_length);
211	206	last_was_right_anchor = 1;
212	207	} else {

221	216	if (last_was_right_anchor) {
222	217	// If the last was a right anchor, but with the current one,
223	218	// we cannot extend, then add its length.
224		model_count_equal(&ret, C->S + last_pos_S, last_length);
	219	model_count_equal(&ret, query + last_pos_Q, last_length);
225	220	} else if ((last_length / 2) >= threshold) {
226	221	// The last anchor wasn't neither a left or right anchor.
227	222	// But, it was as long as an anchor pair. So still count it.
228		model_count_equal(&ret, C->S + last_pos_S, last_length);
	223	model_count_equal(&ret, query + last_pos_Q, last_length);
229	224	}
230	225
231	226	last_was_right_anchor = 0;

302	297	* @param sequences - An array of pointers to the sequences.
303	298	* @param n - The number of sequences.
304	299	*/
305		void calculate_distances(seq_t *sequences, int n) {
306		int i;
307
308		// check the sequences
309		for (i = 0; i < n; i++) {
310		if (sequences[i].S == NULL \|\| sequences[i].len == 0) {
311		errx(1, "Missing sequence: %s", sequences[i].name);
312		}
313
314		if (sequences[i].len < 1000) {
315		FLAGS \|= F_SHORT;
316		}
317		}
318
319		if (FLAGS & F_SHORT) {
320		warnx("One of the given input sequences is shorter than a thousand "
321		"nucleotides. This may result in inaccurate distances. Try an "
322		"alignment instead.");
323		}
324
325		// Warn about non ACGT residues.
326		if (FLAGS & F_NON_ACGT) {
327		warnx("The input sequences contained characters other than acgtACGT. "
328		"These were automatically stripped to ensure correct results.");
329		}
330
331		model M = malloc(n n * sizeof(*M));
	300	void calculate_distances(seq_t *sequences, size_t n) {
	301	struct model *M = NULL;
	302
	303	// The maximum number of sequences is near 457'845'052.
	304	size_t intermediate = SIZE_MAX / sizeof(*M) / n;
	305	if (intermediate < n) {
	306	size_t root = (size_t)sqrt(SIZE_MAX / sizeof(*M));
	307	err(1, "Comparison is limited to %zu sequences (%zu given).", root, n);
	308	}
	309
	310	M = malloc(n * n * sizeof(*M));
332	311	if (!M) {
333	312	err(errno, "Could not allocate enough memory for the comparison "
334	313	"matrix. Try using --join or --low-memory.");

385	364
386	365	// B is the new bootstrap matrix
387	366	struct model B = malloc(n n * sizeof(*B));
388		if (!B) return 2;
	367	CHECK_MALLOC(B);
389	368
390	369	// Compute a number of new distance matrices
391	370	while (BOOTSTRAP--) {

+1

-1

src/process.h less more

7	7
8	8	#include "sequence.h"
9	9
10		void calculate_distances(seq_t *sequences, int n);
	10	void calculate_distances(seq_t *sequences, size_t n);
11	11
12	12	#endif

+14

-32

src/sequence.c less more

18	18	int dsa_init(dsa_t *A) {
19	19	// allocate at least 4 slots so the growth by 1.5 below doesn't get stuck
20	20	// at 3 slots.
21		A->data = malloc(sizeof(seq_t) * 4);
22		if (!A->data) {
23		return 1;
24		}
	21	A->data = malloc(sizeof(A->data) 4);
	22	CHECK_MALLOC(A->data);
25	23
26	24	A->capacity = 4;
27	25	A->size = 0;

35	33	} else {
36	34	// use the near-optimal growth factor of 1.5
37	35	seq_t ptr = reallocarray(A->data, A->capacity / 2, sizeof(seq_t) 3);
38		if (ptr == NULL) {
39		err(errno, "out of memory?");
40		}
	36	CHECK_MALLOC(ptr);
41	37
42	38	A->capacity = (A->capacity / 2) * 3;
43	39	A->data = ptr;

100	96
101	97	// A single malloc for the whole new sequence
102	98	char *ptr = malloc(total);
103		if (ptr == NULL) {
104		return joined;
105		}
	99	CHECK_MALLOC(ptr);
106	100	char *next = ptr;
107	101
108	102	// Copy all old sequences and add a `!` in between

132	126	*/
133	127	void seq_free(seq_t *S) {
134	128	free(S->S);
135		free(S->RS);
136	129	free(S->name);
137	130	*S = (seq_t){};
138	131	}

144	137	* @return The reverse complement. The caller has to free it!
145	138	*/
146	139	char revcomp(const char str, size_t len) {
	140	if (!str) return NULL;
147	141	char *rev = malloc(len + 1);
148		if (!str \|\| !rev) return NULL;
	142	CHECK_MALLOC(rev);
149	143
150	144	char *r = rev;
151	145	const char *s = &str[len - 1];

182	176	char *rev = revcomp(s, len);
183	177
184	178	char temp = realloc(rev, 2 len + 2);
185		if (!temp) {
186		free(rev);
187		return NULL;
188		}
	179	CHECK_MALLOC(temp);
189	180
190	181	rev = temp;
191	182	rev[len] = '#';

214	205	}
215	206
216	207	/** @brief Prepares a sequences to be used as the subject in a comparison. */
217		int seq_subject_init(seq_t *S) {
218		S->gc = calc_gc(S);
219		S->RS = catcomp(S->S, S->len);
	208	int seq_subject_init(seq_subject S, const seq_t base) {
	209	S->gc = calc_gc(base);
	210	S->RS = catcomp(base->S, base->len);
220	211	if (!S->RS) return 1;
221		S->RSlen = 2 * S->len + 1;
	212	S->RSlen = 2 * base->len + 1;
222	213	return 0;
223	214	}
224	215
225	216	/** @brief Frees some memory unused for when a sequence is only used as query.
226	217	*/
227		void seq_subject_free(seq_t *S) {
	218	void seq_subject_free(seq_subject *S) {
228	219	free(S->RS);
229	220	S->RS = NULL;
230	221	S->RSlen = 0;

242	233
243	234	*S = (seq_t){.S = strdup(seq), .name = strdup(name)};
244	235
245		if (!S->S \|\| !S->name) {
246		seq_free(S);
247		return 2;
248		}
	236	CHECK_MALLOC(S->S);
	237	CHECK_MALLOC(S->name);
249	238
250	239	normalize(S);
251	240
252	241	// recalculate the length because `normalize` might have stripped some
253	242	// characters.
254	243	S->len = strlen(S->S);
255
256		const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2;
257		if (S->len > LENGTH_LIMIT) {
258		warnx("The input sequence %s is too long. The technical limit is %zu.",
259		S->name, LENGTH_LIMIT);
260		return 3;
261		}
262	244
263	245	return 0;
264	246	}

+14

-7

src/sequence.h less more

17	17	typedef struct seq_s {
18	18	/** This is the DNAs forward strand as a string. */
19	19	char *S;
	20	/** The length of the forward strand. */
	21	size_t len;
	22	/** A name for this sequence */
	23	char *name;
	24	} seq_t;
	25
	26	/**
	27	* @brief This structure enhances the usual sequence with its reverse
	28	* complement.
	29	*/
	30	typedef struct seq_subject {
20	31	/** This member contains first the reverse strand and then the
21	32	forward strand. */
22	33	char *RS;
23		/** The length of the forward strand. */
24		size_t len;
25	34	/** Corresponds to strlen(RS) */
26	35	size_t RSlen;
27		/** A name for this sequence */
28		char *name;
29	36	/**
30	37	* @brief GC-Content
31	38	*
32	39	* The relative amount of G or C in the DNA.
33	40	*/
34	41	double gc;
35		} seq_t;
	42	} seq_subject;
36	43
37	44	void seq_free(seq_t *S);
38		int seq_subject_init(seq_t *S);
39		void seq_subject_free(seq_t *S);
	45	int seq_subject_init(seq_subject S, const seq_t );
	46	void seq_subject_free(seq_subject *S);
40	47	int seq_init(seq_t S, const char seq, const char *name);
41	48
42	49	/**

+7

-6

test/test_esa.c less more

24	24	typedef struct {
25	25	esa_s *C;
26	26	seq_t *S;
	27	seq_subject subject;
27	28	} esa_fixture;
28	29
29	30	void assert_equal_lcp( const lcp_inter_t a, const lcp_inter_t b){

59	60	};
60	61
61	62	g_assert( seq_init( ef->S, seq, "S0" ) == 0);
62		seq_subject_init( ef->S);
63		g_assert( ef->S->RS != NULL);
64		int check = esa_init( ef->C, ef->S);
	63	seq_subject_init( &ef->subject, ef->S);
	64	g_assert( ef->subject.RS != NULL);
	65	int check = esa_init( ef->C, &ef->subject);
65	66	g_assert( check == 0);
66	67	}
67	68

85	86	};
86	87
87	88	g_assert( seq_init( ef->S, seq, "S0" ) == 0);
88		seq_subject_init( ef->S);
89		g_assert( ef->S->RS != NULL);
90		int check = esa_init( ef->C, ef->S);
	89	seq_subject_init( &ef->subject, ef->S);
	90	g_assert( ef->subject.RS != NULL);
	91	int check = esa_init( ef->C, &ef->subject);
91	92	g_assert( check == 0);
92	93	}
93	94

+16

-11

test/test_seq.c less more

21	21	void test_seq_full(){
22	22
23	23	seq_t S;
	24	seq_subject subject;
24	25
25	26	seq_init( &S, "ACGTTGCA", "name");
26		int check = seq_subject_init( &S);
	27	int check = seq_subject_init( &subject, &S);
27	28
28	29	g_assert_cmpint(check, ==, 0);
29	30
30		g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA");
31		g_assert_cmpuint(S.RSlen, ==, 8*2+1);
32		g_assert( S.gc == 0.5);
	31	g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA");
	32	g_assert_cmpuint(subject.RSlen, ==, 8*2+1);
	33	g_assert( subject.gc == 0.5);
33	34
	35	seq_subject_free( &subject);
34	36	seq_free( &S);
35	37	}
36	38
37	39	void test_seq_nonacgt(){
38	40	seq_t S;
	41	seq_subject subject;
39	42
40	43	seq_init( &S, "11ACGTNN7682394689NNTGCA11", "name");
41		seq_subject_init( &S);
	44	seq_subject_init( &subject, &S);
42	45
43	46	g_assert_cmpstr(S.S, ==, "ACGTTGCA");
44	47	g_assert_cmpuint(S.len, ==, 8 );
45	48	g_assert( FLAGS & F_NON_ACGT);
46	49
47		g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA");
48		g_assert_cmpuint(S.RSlen, ==, 8*2+1);
49		g_assert( S.gc == 0.5);
	50	g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA");
	51	g_assert_cmpuint(subject.RSlen, ==, 8*2+1);
	52	g_assert( subject.gc == 0.5);
50	53
	54	seq_subject_free( &subject);
51	55	seq_free( &S);
52	56
53	57	FLAGS = F_NONE;
54	58
55	59	seq_init( &S, "@ACGT_!0TGCA ", "name");
56		seq_subject_init( &S);
	60	seq_subject_init( &subject, &S);
57	61
58	62	g_assert_cmpstr(S.S, ==, "ACGT!TGCA");
59	63	g_assert_cmpuint(S.len, ==, 9 );
60	64	g_assert( FLAGS & F_NON_ACGT);
61	65
62		g_assert_cmpstr(S.RS, ==, "TGCA;ACGT#ACGT!TGCA");
63		g_assert_cmpuint(S.RSlen, ==, 9*2+1);
	66	g_assert_cmpstr(subject.RS, ==, "TGCA;ACGT#ACGT!TGCA");
	67	g_assert_cmpuint(subject.RSlen, ==, 9*2+1);
64	68
	69	seq_subject_free( &subject);
65	70	seq_free( &S);
66	71
67	72	FLAGS = F_NONE;