Codebase list ghmm / HEAD ghmm / sequence.h
HEAD

Tree @HEAD (Download .tar.gz)

sequence.h @HEADraw · history · blame

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
/*******************************************************************************
*
*       This file is part of the General Hidden Markov Model Library,
*       GHMM version __VERSION__, see http://ghmm.org
*
*       Filename: ghmm/ghmm/sequence.h
*       Authors:  Bernd Wichern, Benjamin Georgi
*
*       Copyright (C) 1998-2004 Alexander Schliep 
*       Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
*	Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik, 
*                               Berlin
*                                   
*       Contact: schliep@ghmm.org             
*
*       This library is free software; you can redistribute it and/or
*       modify it under the terms of the GNU Library General Public
*       License as published by the Free Software Foundation; either
*       version 2 of the License, or (at your option) any later version.
*
*       This library is distributed in the hope that it will be useful,
*       but WITHOUT ANY WARRANTY; without even the implied warranty of
*       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
*       Library General Public License for more details.
*
*       You should have received a copy of the GNU Library General Public
*       License along with this library; if not, write to the Free
*       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
*       This file is version $Revision: 2277 $ 
*                       from $Date: 2009-04-28 08:44:31 -0400 (Tue, 28 Apr 2009) $
*             last change by $Author: grunau $.
*
*******************************************************************************/

#ifndef GHMM_SEQUENCE_H
#define GHMM_SEQUENCE_H

#ifdef __cplusplus
extern "C" {
#endif

#include <stdio.h>

#include "ghmmconfig.h"

/**@name sequences  (double and int) */
/*@{ (Doc++-Group: sequence) */
/** Sequence structure for integer sequences.

    Contains an array of sequences and corresponding
    data like sequence label, sequence weight, etc. Sequences may have
    different lengths.
 */

  typedef struct ghmm_dseq {
  /** sequence array. seq[i][j] = j-th symbol of i-th sequence.
   */
    int **seq;

  /** matrix of state ids, can be used to save the Viterbi path during
      sequence generation.
      ATTENTION: is NOT allocated by ghmm_dseq_calloc  */
    int **states;

  /** array of sequence lengths */
    int *seq_len;
  /** array of state path lengths */
    int *states_len;

#ifdef GHMM_OBSOLETE
  /** array of sequence labels (only present when GHMM_OBSOLETE is defined) */
    long *seq_label;
#endif /* GHMM_OBSOLETE */
  /** array of sequence IDs */
    double *seq_id;
  /** positive (!) sequence weights. default is 1 = no weight */
    double *seq_w;
  /** total number of sequences */
    long seq_number;
  /** reserved space for sequences is always >= seq_number */
    long capacity;
  /** sum of sequence weights */
    double total_w;

  /** matrix of state labels corresponding to seq */
    int **state_labels;
  /** number of labels for each sequence */
    int *state_labels_len;

  /** flags (internal) */
    unsigned int flags;
  } ghmm_dseq;

/** Sequence structure for double sequences.

    Contains an array of sequences and corresponding
    data like sequence label, sequence weight, etc. Sequences may have
    different lengths. Multi-dimensional sequences are linearized.
 */
  typedef struct ghmm_cseq {
  /** sequence array. seq[i][j] = j-th symbol of i-th sequence.
      seq[i][D * j] = first dimension of j-th observation of i-th sequence
      (D is presumably the value of the dim field -- TODO confirm)
  */
    double **seq;
  /** array of sequence lengths */
    int *seq_len;
#ifdef GHMM_OBSOLETE
  /** array of sequence labels (only present when GHMM_OBSOLETE is defined) */
    long *seq_label;
#endif /* GHMM_OBSOLETE */
  /** array of sequence IDs */
    double *seq_id;
  /** positive (!) sequence weights. default is 1 = no weight */
    double *seq_w;
  /** total number of sequences */
    long seq_number;
  /** reserved space for sequences is always >= seq_number */
    long capacity;
  /** sum of sequence weights */
    double total_w;
  /** total number of dimensions */
    int dim;

  /** flags (internal) */
    unsigned int flags;
  } ghmm_cseq;


#ifdef __cplusplus
}
#endif
/* don't include model.h at the beginning of this file. struct ghmm_dseq has
   to be known in model.h */
#include "model.h"
#include "smodel.h"
#ifdef __cplusplus
extern "C" {
#endif

/** Truncate double sequences in a given sequence array.
    Useful for testing.
   @return truncated sequence arrays
   @param sqd_in       sequence arrays for truncation
   @param sqd_arrays   number of sequence arrays
   @param trunc_ratio  0 means no truncation, 1 means maximal truncation
   @param seed         rng seed
*/

  ghmm_cseq **ghmm_cseq_truncate (ghmm_cseq ** sqd_in, int sqd_arrays,
                                      double trunc_ratio, int seed);



/**
  Extract a single sequence from a larger ghmm_dseq into a new struct.

  @return ghmm_dseq struct containing a single sequence
  @param sq      source ghmm_dseq
  @param index   index of the sequence to extract
*/
ghmm_dseq *ghmm_dseq_get_singlesequence(ghmm_dseq *sq, int index);

/**
  Extract a single sequence from a larger ghmm_cseq into a new struct.

  @return ghmm_cseq struct containing a single sequence
  @param sq      source ghmm_cseq
  @param index   index of the sequence to extract
*/
ghmm_cseq *ghmm_cseq_get_singlesequence(ghmm_cseq *sq, int index);

/*XXX TEST: frees everything but the seq field */

/**
  Free a ghmm_dseq struct whose sequence is only a reference to a sequence
  held by a different ghmm_dseq. The function deallocates everything but
  the referenced sequence.
  @param sq  sequence struct to free
  @return NOTE(review): return convention is not documented here; presumably
          0 for success and -1 for error -- confirm against the implementation
*/
int ghmm_dseq_subseq_free (ghmm_dseq *sq);

/**
  Free a ghmm_cseq struct whose sequence is only a reference to a sequence
  held by a different ghmm_cseq. The function deallocates everything but
  the referenced sequence.
  @param sqd  sequence struct to free
  @return NOTE(review): return convention is not documented here; presumably
          0 for success and -1 for error -- confirm against the implementation
*/
int ghmm_cseq_subseq_free (ghmm_cseq *sqd);


/**
   Reads a FASTA file and returns a ghmm_dseq object.
   @param filename  filename of the FASTA file
   @param alphabet  alphabet
   @return  ghmm_dseq built from the FASTA file
*/
ghmm_dseq *ghmm_dseq_open_fasta(const char *filename, ghmm_alphabet *alphabet);


/** Generates all possible integer sequences of length n from an alphabet with
    M letters. Uses lexicographical ordering. Memory allocation is done here.
    @param n     length of sequences
    @param M     size of alphabet
    @return array of generated integer sequences
*/
  ghmm_dseq *ghmm_dseq_lexWords (int n, int M);

/**
   Determine the best model for a given integer sequence.
   Chooses from the set of models the one with the highest likelihood
   for the given sequence.
   @param mo            array of models
   @param model_number  number of models
   @param sequence      sequence
   @param seq_len       length of sequence
   @param log_p         log likelihood of the sequence given the best model
   @return index of best model (between 0 and model_number - 1)
*/
  int ghmm_dseq_best_model (ghmm_dmodel ** mo, int model_number, int *sequence,
                           int seq_len, double *log_p);

/**
   Make sure that the sequences only contain allowed symbols
   (between 0 and max_symb - 1).
   @param sq          sequences
   @param max_symb    number of different symbols
   @return            -1 for error, 0 for no errors
*/
  int ghmm_dseq_check (ghmm_dseq * sq, int max_symb);

/**
  Copy one integer sequence. Memory for target has to be allocated by the
  caller.
  @param target  target sequence
  @param source  source sequence
  @param len     length of source sequence
  */
  void ghmm_dseq_copy (int *target, int *source, int len);

/**
  Copy one double sequence. Memory for target has to be allocated by the
  caller.
  @param target  target sequence
  @param source  source sequence
  @param len     length of source sequence
  */
  void ghmm_cseq_copy (double *target, double *source, int len);

/**
  Adds all integer sequences, sequence lengths, etc.
  from source to target. Memory allocation is done here.
  @param target  target sequence structure
  @param source  source sequence structure
  @return -1 for error, 0 for success
  */
  int ghmm_dseq_add (ghmm_dseq * target, ghmm_dseq * source);


/**
  Adds all double sequences, sequence lengths, etc.
  from source to target. Memory allocation is done here.
  @param target  target sequence structure
  @param source  source sequence structure
  @return -1 for error, 0 for success
  */
  int ghmm_cseq_add (ghmm_cseq * target, ghmm_cseq * source);

/**
  Prints one array of integer sequences to a file.
  @param sequence   array of sequences
  @param file       output file
  */
  void ghmm_dseq_print (ghmm_dseq * sequence, FILE * file);

/**
  Prints one array of integer sequences to an XML file.
  @param sequence   array of sequences
  @param file       output file
  */
  void ghmm_dseq_print_xml (ghmm_dseq * sequence, FILE * file);

/**
   Prints one array of integer sequences in Mathematica format
   (list of lists).
   @param sq    array of sequences
   @param file  output file
   @param name  arbitrary sequence name for usage in Mathematica.
 */
  void ghmm_dseq_mathematica_print (ghmm_dseq * sq, FILE * file, char *name);

/**
  Prints one array of double sequences to a file.
  @param sqd        array of sequences
  @param file       output file
  @param discrete   switch: 0 means double output for symbols,
                    1 means truncate symbols to integer
  */
  void ghmm_cseq_print (ghmm_cseq * sqd, FILE * file, int discrete);

/**
   Prints one array of double sequences in Mathematica format
   (list of lists).
   @param sqd   array of sequences
   @param file  output file
   @param name  arbitrary sequence name for usage in Mathematica.
 */
  void ghmm_cseq_mathematica_print (ghmm_cseq * sqd, FILE * file,
                                     char *name);

/** Output of double sequences suitable for gnuplot. One symbol per line,
    sequences separated by a double newline.
    @param sqd   array of double sequences
    @param file  output file
*/
  void ghmm_cseq_gnu_print (ghmm_cseq * sqd, FILE * file);

/**
   Cleans integer sequence pointers in the sequence struct and sets
   seq_number to zero.
   Differs from ghmm_dseq_free in that no memory is freed here.
   @param sq sequence structure
  */
  void ghmm_dseq_clean (ghmm_dseq * sq);

/**
   Cleans double sequence pointers in the sequence struct and sets
   seq_number to zero.
   Differs from ghmm_cseq_free in that no memory is freed here.
   @param sqd sequence structure
  */
  void ghmm_cseq_clean (ghmm_cseq * sqd);

/**
  Frees all memory in a given array of integer sequences.
  @param sq  address of the pointer to the sequence structure
  @return 0 for success, -1 for error
  */
  int ghmm_dseq_free (ghmm_dseq ** sq);

/**
  Frees all memory in a given array of double sequences.
  @param sq  address of the pointer to the sequence structure
  @return 0 for success, -1 for error
  */
  int ghmm_cseq_free (ghmm_cseq ** sq);

/**
   Return the biggest symbol in an integer sequence struct.
   @param sq sequence structure
   @return max value
 */
  int ghmm_dseq_max_symbol (ghmm_dseq * sq);

/**
   Memory allocation for an integer sequence struct. Allocates arrays of
   length seq_number. NO allocation for the actual sequences, since their
   lengths are unknown.
   @param seq_number  number of sequences
   @return            pointer to the sequence struct
*/
  ghmm_dseq *ghmm_dseq_calloc (long seq_number);

/**
   Completes memory allocation for an integer sequence struct.
   NO allocation for the actual sequences, since their lengths are
   unknown.
   @param sq  sequence struct
   @return NOTE(review): return convention is not documented here; presumably
           0 for success and -1 for error -- confirm against the implementation
*/
  int ghmm_dseq_calloc_state_labels (ghmm_dseq *sq);

/**
   Memory allocation for a double sequence struct. Allocates arrays of
   length seq_number. NO allocation for the actual sequences, since their
   lengths are unknown.
   @param seq_number  number of sequences
   @return            pointer to the sequence struct
*/
  ghmm_cseq *ghmm_cseq_calloc (long seq_number);

/**
   Copies an array of integer sequences to double sequences.
   @return       double sequence struct (target)
   @param sq     integer sequence struct (source)
   */
  ghmm_cseq *ghmm_cseq_create_from_dseq (const ghmm_dseq * sq);

/**
   Copies an array of double sequences into an array of integer
   sequences. Truncates the digits after the decimal point.
   @return       integer sequence struct (target)
   @param sqd    double sequence struct (source)
   */
  ghmm_dseq *ghmm_dseq_create_from_cseq (const ghmm_cseq * sqd);

/**
    Determines the maximum sequence length in a given integer sequence struct.
    @author Peter Pipenbacher
    @param sqd sequence struct
    @return max sequence length
 */
  int ghmm_dseq_max_len (const ghmm_dseq * sqd);

/**
    Determines the maximum sequence length in a given double sequence struct.
    @param sqd sequence struct
    @return max sequence length
 */
  int ghmm_cseq_max_len (const ghmm_cseq * sqd);

/**
  Calculates the mean sequence of a given array of double sequences.
  Missing values of shorter sequences are assumed to be zero.
  @param sqd sequence struct
  @return pointer to a sequence struct containing the mean sequence
  */
  ghmm_cseq *ghmm_cseq_mean (const ghmm_cseq * sqd);

/**
   Calculates the scatter matrix of an array of double sequences.
   Missing parts of short sequences are NOT taken into account.
   @return        scatter matrix
   @param sqd     sequence struct
   @param dim     (calculated) dimension of the scatter matrix
  */
  double **ghmm_cseq_scatter_matrix (const ghmm_cseq * sqd, int *dim);

/**
   Calculates the transition class for a given double sequence
   at a specified position. Very application specific!!! Currently
   only a dummy function is implemented: it always returns 0, which
   means no usage of multiple transition classes.
   @param O double sequence
   @param index position for class calculation
   @param osum sum of symbols up to index
   @return currently always 0
 */
  int ghmm_cseq_class (const double *O, int index, double *osum);
  /*int ghmm_cseq_class(const ghmm_cseq *sqd, const int seq_number, int index, double *osum, );*/

/** Randomly divides a given array of double sequences into two sets.
    Useful if a training and a test set are needed. Memory allocation is
    done here.
    @param sqd input sequence array
    @param sqd_train training sequences
    @param sqd_test test sequences
    @param train_ratio ratio of number of train vs number of test sequences
    @return 0 for success, -1 for error
*/
  int ghmm_cseq_partition (ghmm_cseq * sqd, ghmm_cseq * sqd_train,
                            ghmm_cseq * sqd_test, double train_ratio);


/**
    Copies all entries from one sequence in a source array to a target array.
    No memory allocation here.
    @param target double sequence target
    @param t_num position in target array
    @param source double sequence source
    @param s_num position in source array
*/
  void ghmm_cseq_copy_all (ghmm_cseq * target, long t_num,
                            ghmm_cseq * source, long s_num);

/** Log-likelihood function in a mixture model (formula in LaTeX math mode):
    \f$\sum_k w^k \log( \sum_c (\alpha_c p(O^k | \lambda_c)))\f$
    @param smo pointer to array of smodels
    @param smo_number number of models
    @param sqd sequence struct
    @param like log likelihood
    @return NOTE(review): return convention is not documented here; presumably
            0 for success and -1 for error -- confirm against the implementation
*/
  int ghmm_cseq_mix_like (ghmm_cmodel ** smo, int smo_number, ghmm_cseq * sqd,
                           double *like);

#ifdef __cplusplus
}
#endif

#endif /* GHMM_SEQUENCE_H */
/*@} (Doc++-Group: sequence) */