/*******************************************************************************
*
* This file is part of the General Hidden Markov Model Library,
* GHMM version __VERSION__, see http://ghmm.org
*
* Filename: ghmm/ghmm/sequence.h
* Authors: Bernd Wichern, Benjamin Georgi
*
* Copyright (C) 1998-2004 Alexander Schliep
* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
* Berlin
*
* Contact: schliep@ghmm.org
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* This file is version $Revision: 2277 $
* from $Date: 2009-04-28 08:44:31 -0400 (Tue, 28 Apr 2009) $
* last change by $Author: grunau $.
*
*******************************************************************************/
#ifndef GHMM_SEQUENCE_H
#define GHMM_SEQUENCE_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
#include "ghmmconfig.h"
/**@name sequences (double and int) */
/*@{ (Doc++-Group: sequence) */
/** Sequence structure for integer sequences.
    Contains an array of sequences and the corresponding per-sequence
    data such as sequence ID, sequence weight, etc. Sequences may have
    different lengths.
*/
typedef struct ghmm_dseq {
/** sequence array: seq[i][j] = j-th symbol of the i-th sequence */
int **seq;
/** matrix of state ids, can be used to save the Viterbi path during
    sequence generation.
    ATTENTION: is NOT allocated by ghmm_dseq_calloc */
int **states;
/** array of sequence lengths */
int *seq_len;
/** array of state path lengths */
int *states_len;
#ifdef GHMM_OBSOLETE
/** array of sequence labels.
    NOTE: only present when GHMM_OBSOLETE is defined, so the layout of
    this struct depends on that macro. */
long *seq_label;
#endif /* GHMM_OBSOLETE */
/** array of sequence IDs */
double *seq_id;
/** sequence weights; must be positive. Default is 1 = no weight */
double *seq_w;
/** total number of sequences */
long seq_number;
/** reserved space for sequences; always >= seq_number */
long capacity;
/** sum of sequence weights */
double total_w;
/** matrix of state labels corresponding to seq */
int **state_labels;
/** number of labels for each sequence */
int *state_labels_len;
/** flags (internal) */
unsigned int flags;
} ghmm_dseq;
/** Sequence structure for double sequences.
    Contains an array of sequences and the corresponding per-sequence
    data such as sequence ID, sequence weight, etc. Sequences may have
    different lengths. Multi-dimensional sequences are linearized.
*/
typedef struct ghmm_cseq {
/** sequence array: seq[i][j] = j-th symbol of the i-th sequence.
    For linearized multi-dimensional sequences, seq[i][D * j] is the first
    dimension of the j-th observation of the i-th sequence
    (D is presumably the value of dim -- TODO confirm).
*/
double **seq;
/** array of sequence lengths */
int *seq_len;
#ifdef GHMM_OBSOLETE
/** array of sequence labels.
    NOTE: only present when GHMM_OBSOLETE is defined, so the layout of
    this struct depends on that macro. */
long *seq_label;
#endif /* GHMM_OBSOLETE */
/** array of sequence IDs */
double *seq_id;
/** sequence weights; must be positive. Default is 1 = no weight */
double *seq_w;
/** total number of sequences */
long seq_number;
/** reserved space for sequences; always >= seq_number */
long capacity;
/** sum of sequence weights */
double total_w;
/** total number of dimensions */
int dim;
/** flags (internal) */
unsigned int flags;
} ghmm_cseq;
#ifdef __cplusplus
}
#endif
/* don't include model.h at the beginning of this file. struct ghmm_dseq has
to be known in model.h */
#include "model.h"
#include "smodel.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Truncate the double sequences in a given array of sequence arrays.
    Useful for testing.
    @param sqd_in       sequence arrays to truncate
    @param sqd_arrays   number of sequence arrays
    @param trunc_ratio  0 means no truncation, 1 means maximal truncation
    @param seed         rng seed
    @return truncated sequence arrays
*/
ghmm_cseq **ghmm_cseq_truncate (ghmm_cseq ** sqd_in, int sqd_arrays,
double trunc_ratio, int seed);
/**
    Extract a single sequence from a larger ghmm_dseq into a new struct.
    @param sq     source ghmm_dseq
    @param index  index of the sequence to extract
    @return ghmm_dseq struct containing a single sequence
*/
ghmm_dseq *ghmm_dseq_get_singlesequence(ghmm_dseq *sq, int index);
/**
    Extract a single double sequence from a larger ghmm_cseq into a new
    struct.
    @param sq     source ghmm_cseq
    @param index  index of the sequence to extract
    @return ghmm_cseq struct containing a single sequence
*/
ghmm_cseq *ghmm_cseq_get_singlesequence(ghmm_cseq *sq, int index);
/* XXX TEST: frees everything but the seq field */
/**
    Free a ghmm_dseq struct whose sequence is only a reference to a
    sequence held in a different sequence struct. The function deallocates
    everything but the referenced sequence.
    @param sq  sequence struct to free
    @return status code; NOTE(review): presumably 0 for success and -1 for
            error, like ghmm_dseq_free -- confirm against the implementation
*/
int ghmm_dseq_subseq_free (ghmm_dseq *sq);
/**
    Free a ghmm_cseq struct whose sequence is only a reference to a
    sequence held in a different sequence struct. The function deallocates
    everything but the referenced sequence.
    @param sqd  sequence struct to free
    @return status code; NOTE(review): presumably 0 for success and -1 for
            error, like ghmm_cseq_free -- confirm against the implementation
*/
int ghmm_cseq_subseq_free (ghmm_cseq *sqd);
/**
    Reads a FastA file and returns a ghmm_dseq object.
    @param filename  file name of the FastA file
    @param alphabet  alphabet
    @return ghmm_dseq holding the contents of the FastA file
*/
ghmm_dseq *ghmm_dseq_open_fasta(const char *filename, ghmm_alphabet *alphabet);
/** Generates all possible integer sequences of length n from an alphabet
    with M letters, in lexicographical order. Memory is allocated here.
    @param n  length of the sequences
    @param M  size of the alphabet
    @return sequence struct holding the generated integer sequences
*/
ghmm_dseq *ghmm_dseq_lexWords (int n, int M);
/**
    Determine the best model for a given integer sequence.
    Chooses, from the given set of models, the one with the highest
    likelihood for the given sequence.
    @param mo            array of models
    @param model_number  number of models
    @param sequence      sequence
    @param seq_len       length of the sequence
    @param log_p         log likelihood of the sequence given the best model
    @return index of the best model (between 0 and model_number - 1)
*/
int ghmm_dseq_best_model (ghmm_dmodel ** mo, int model_number, int *sequence,
int seq_len, double *log_p);
/**
    Make sure that the sequences contain only allowed symbols
    (values between 0 and max_symb - 1).
    @param sq        sequences
    @param max_symb  number of different symbols
    @return -1 for error, 0 for no errors
*/
int ghmm_dseq_check (ghmm_dseq * sq, int max_symb);
/**
    Copy one integer sequence. Memory for the target has to be allocated
    outside of this function.
    @param target  target sequence
    @param source  source sequence
    @param len     length of the source sequence
*/
void ghmm_dseq_copy (int *target, int *source, int len);
/**
    Copy one double sequence. Memory for the target has to be allocated
    outside of this function.
    @param target  target sequence
    @param source  source sequence
    @param len     length of the source sequence
*/
void ghmm_cseq_copy (double *target, double *source, int len);
/**
    Adds all integer sequences, sequence lengths, etc. from source to
    target. Memory allocation is done here.
    @param target  target sequence structure
    @param source  source sequence structure
    @return -1 for error, 0 for success
*/
int ghmm_dseq_add (ghmm_dseq * target, ghmm_dseq * source);
/**
    Adds all double sequences, sequence lengths, etc. from source to
    target. Memory allocation is done here.
    @param target  target sequence structure
    @param source  source sequence structure
    @return -1 for error, 0 for success
*/
int ghmm_cseq_add (ghmm_cseq * target, ghmm_cseq * source);
/**
    Prints one array of integer sequences to a file.
    @param sequence  array of sequences
    @param file      output file
*/
void ghmm_dseq_print (ghmm_dseq * sequence, FILE * file);
/**
    Prints one array of integer sequences to an XML file.
    @param sequence  array of sequences
    @param file      output file
*/
void ghmm_dseq_print_xml (ghmm_dseq * sequence, FILE * file);
/**
    Prints one array of integer sequences in Mathematica format
    (list of lists).
    @param sq    array of sequences
    @param file  output file
    @param name  arbitrary sequence name for usage in Mathematica
*/
void ghmm_dseq_mathematica_print (ghmm_dseq * sq, FILE * file, char *name);
/**
    Prints one array of double sequences to a file.
    @param sqd       array of sequences
    @param file      output file
    @param discrete  switch: 0 means double output for symbols,
                     1 means truncate symbols to integer
*/
void ghmm_cseq_print (ghmm_cseq * sqd, FILE * file, int discrete);
/**
    Prints one array of double sequences in Mathematica format
    (list of lists).
    @param sqd   array of sequences
    @param file  output file
    @param name  arbitrary sequence name for usage in Mathematica
*/
void ghmm_cseq_mathematica_print (ghmm_cseq * sqd, FILE * file,
char *name);
/** Output of double sequences suitable for gnuplot. One symbol per line,
    sequences separated by a double newline.
    @param sqd   array of double sequences
    @param file  output file
*/
void ghmm_cseq_gnu_print (ghmm_cseq * sqd, FILE * file);
/**
    Cleans the integer sequence pointers in a sequence struct and sets
    seq_number to zero.
    Differs from sequence_free (presumably today's ghmm_dseq_free -- TODO
    confirm) in that no memory is freed here.
    @param sq  sequence structure
*/
void ghmm_dseq_clean (ghmm_dseq * sq);
/**
    Cleans the double sequence pointers in a sequence struct and sets
    seq_number to zero.
    Differs from sequence_free (presumably today's ghmm_cseq_free -- TODO
    confirm) in that no memory is freed here.
    @param sqd  sequence structure
*/
void ghmm_cseq_clean (ghmm_cseq * sqd);
/**
    Frees all memory in a given array of integer sequences.
    @param sq  pointer to the sequence structure pointer
    @return 0 for success, -1 for error
*/
int ghmm_dseq_free (ghmm_dseq ** sq);
/**
    Frees all memory in a given array of double sequences.
    @param sq  pointer to the sequence structure pointer
    @return 0 for success, -1 for error
*/
int ghmm_cseq_free (ghmm_cseq ** sq);
/**
    Return the biggest symbol in an integer sequence struct.
    @param sq  sequence structure
    @return max value
*/
int ghmm_dseq_max_symbol (ghmm_dseq * sq);
/**
    Memory allocation for an integer sequence struct. Allocates arrays of
    length seq_number. NO allocation for the actual sequences, since their
    length is unknown.
    @param seq_number  number of sequences
    @return pointer to the sequence struct
*/
ghmm_dseq *ghmm_dseq_calloc (long seq_number);
/**
    Completes the memory allocation for an integer sequence struct.
    NO allocation for the actual sequences, since their length is unknown.
    @param sq  sequence structure
    @return status code; NOTE(review): presumably 0 for success and -1 for
            error -- confirm against the implementation
*/
int ghmm_dseq_calloc_state_labels (ghmm_dseq *sq);
/**
    Memory allocation for a double sequence struct. Allocates arrays of
    length seq_number. NO allocation for the actual sequences, since their
    length is unknown.
    @param seq_number  number of sequences
    @return pointer to the sequence struct
*/
ghmm_cseq *ghmm_cseq_calloc (long seq_number);
/**
    Copies an array of integer sequences to double sequences.
    @param sq  integer sequence struct (source)
    @return double sequence struct (target)
*/
ghmm_cseq *ghmm_cseq_create_from_dseq (const ghmm_dseq * sq);
/**
    Copies an array of double sequences into an array of integer
    sequences. Truncates the positions after the decimal point.
    @param sqd  double sequence struct (source)
    @return integer sequence struct (target)
*/
ghmm_dseq *ghmm_dseq_create_from_cseq (const ghmm_cseq * sqd);
/**
    Determines the maximal sequence length in a given integer sequence
    struct.
    @author Peter Pipenbacher
    @param sqd  sequence struct
    @return max sequence length
*/
int ghmm_dseq_max_len (const ghmm_dseq * sqd);
/**
    Determines the maximal sequence length in a given double sequence
    struct.
    @param sqd  sequence struct
    @return max sequence length
*/
int ghmm_cseq_max_len (const ghmm_cseq * sqd);
/**
    Calculates the mean sequence of a given array of double sequences.
    Missing values of shorter sequences are assumed to be zero.
    @param sqd  sequence struct
    @return pointer to a sequence struct containing the mean sequence
*/
ghmm_cseq *ghmm_cseq_mean (const ghmm_cseq * sqd);
/**
    Calculates the scatter matrix of an array of double sequences.
    Missing parts of short sequences are NOT taken into account.
    @param sqd  sequence struct
    @param dim  (calculated) dimension of the scatter matrix
    @return scatter matrix
*/
double **ghmm_cseq_scatter_matrix (const ghmm_cseq * sqd, int *dim);
/**
    Calculates the transition class for a given double sequence at a
    specified position. Very application specific!!! Currently only a
    dummy function is implemented: it always returns 0, which means that
    multiple transition classes are not used.
    @param O      double sequence
    @param index  position for class calculation
    @param osum   sum of symbols up to index
    @return currently always 0
*/
int ghmm_cseq_class (const double *O, int index, double *osum);
/*int ghmm_cseq_class(const ghmm_cseq *sqd, const int seq_number, int index, double *osum, );*/
/** Randomly divides a given array of double sequences into two sets.
    Useful if a training and a test set are needed. Memory allocation is
    done here.
    @param sqd          input sequence array
    @param sqd_train    training sequences
    @param sqd_test     test sequences
    @param train_ratio  ratio of number of train vs number of test sequences
    @return 0 for success, -1 for error
*/
int ghmm_cseq_partition (ghmm_cseq * sqd, ghmm_cseq * sqd_train,
ghmm_cseq * sqd_test, double train_ratio);
/**
    Copies all entries of one sequence in a source array to a target
    array. No memory allocation here.
    @param target  double sequence target
    @param t_num   position in the target array
    @param source  double sequence source
    @param s_num   position in the source array
*/
void ghmm_cseq_copy_all (ghmm_cseq * target, long t_num,
ghmm_cseq * source, long s_num);
/** Log-likelihood function in a mixture model:
    (math mode?)
    \f$\sum_k w^k \log( \sum_c (\alpha_c p(O^k | \lambda_c)))\f$
    @param smo         pointer to array of smodels
    @param smo_number  number of models
    @param sqd         sequence struct
    @param like        log likelihood
    @return status code; NOTE(review): presumably 0 for success and -1 for
            error -- confirm against the implementation
*/
int ghmm_cseq_mix_like (ghmm_cmodel ** smo, int smo_number, ghmm_cseq * sqd,
double *like);
#ifdef __cplusplus
}
#endif
#endif /* GHMM_SEQUENCE_H */
/*@} (Doc++-Group: sequence) */