/*******************************************************************************
*
* This file is part of the General Hidden Markov Model Library,
* GHMM version __VERSION__, see http://ghmm.org
*
* Filename: ghmm/ghmm/kbest.c
* Authors: Anyess von Bock, Alexander Riemer, Janne Grunau
*
* Copyright (C) 1998-2004 Alexander Schliep
* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
* Berlin
*
* Contact: schliep@ghmm.org
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* This file is version $Revision: 2264 $
* from $Date: 2009-04-22 11:13:40 -0400 (Wed, 22 Apr 2009) $
* last change by $Author: grunau $.
*
*******************************************************************************/
/** \file kbest.c
*/
#ifdef HAVE_CONFIG_H
# include "../config.h"
#endif
#include <stdlib.h>
#include <math.h>
#include "mes.h"
#include "mprintf.h"
#include "model.h"
#include "kbest.h"
#include "ghmm_internals.h"
/** threshold probability (logarithmized) */
#define KBEST_THRESHOLD -3.50655789732
/** log(0.03) => threshold: 3% of most probable partial hypothesis */
#define KBEST_EPS 1E-15
/*============================================================================*/
/** Data type for single linked list of hypotheses.
*/
typedef struct hypo_List {
int hyp_c; /**< hypothesis */
int refcount; /**< counter of the links to this hypothesis */
int chosen;
int gamma_states;
double *gamma_a;
int *gamma_id;
struct hypo_List *next; /**< next list element */
struct hypo_List *parent; /**< parent hypothesis */
} hypoList;
/*============================================================================*/
/* inserts new hypothesis into list at position indicated by pointer plist */
static void ighmm_hlist_insert (hypoList ** plist, int newhyp,
hypoList * parlist);
/*============================================================================*/
/* removes hypothesis at position indicated by pointer plist from the list */
static void ighmm_hlist_remove (hypoList ** plist);
/*============================================================================*/
/**
Propagates list of hypotheses forward by extending each old hypothesis to
the possible new hypotheses depending on the states in which the old
hypothesis could end and the reachable labels
@return number of old hypotheses
@param mo: pointer to the ghmm_dmodel
@param h: pointer to list of hypotheses
@param hplus: address of a pointer to store the propagated hypotheses
@param labels: number of labels
@param nr_s: number states which have assigned the index aa label
@param max_out: maximum number of out_states over all states with the index aa label
*/
static int ighmm_hlist_prop_forward (ghmm_dmodel * mo, hypoList * h, hypoList ** hplus, int labels,
int *nr_s, int *max_out);
/*============================================================================*/
/**
Calculates the logarithm of sum(exp(log_a[j,a_pos])+exp(log_gamma[j,g_pos]))
which corresponds to the logarithm of the sum of a[j,a_pos]*gamma[j,g_pos]
@return log. sum for products of a row from gamma and a row from matrix A
@param log_a: transition matrix with logarithmic values (1.0 for log(0))
@param s: ghmm_dstate whose gamma-value is calculated
@param parent: a pointer to the parent hypothesis
*/
static double ighmm_log_gamma_sum (double *log_a, ghmm_dstate * s, hypoList * parent);
/*============================================================================*/
/**
Builds logarithmic transition matrix from the states' in_a values
the row for each state is the logarithmic version of the state's in_a
@return transition matrix with logarithmic values, 1.0 if a[i,j] = 0
@param s: array of all states of the model
@param N: number of states in the model
*/
static double **kbest_buildLogMatrix (ghmm_dstate * s, int N)
{
#define CUR_PROC "kbest_buildLogMatrix"
int i, j;
double **log_a; /* log(a(i,j)) => log_a[i*N+j] */
/* create & initialize matrix: */
ARRAY_MALLOC (log_a, N);
for (i = 0; i < N; i++) {
ARRAY_MALLOC (log_a[i], s[i].in_states);
for (j = 0; j < s[i].in_states; j++)
log_a[i][j] = log (s[i].in_a[j]);
}
return log_a;
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
GHMM_LOG(LCONVERTED, "kbest_buildLogMatrix failed\n");
exit (1);
#undef CUR_PROC
}
/*============================================================================*/
int *ghmm_dmodel_label_kbest (ghmm_dmodel * mo, int *o_seq, int seq_len, int k, double *log_p)
{
#define CUR_PROC "ghmm_dl_kbest"
int i, t, c, l, m; /* counters */
int no_oldHyps; /* number of hypotheses until position t-1 */
int b_index, i_id; /* index for addressing states' b arrays */
int no_labels = 0;
int exists, g_nr;
int *states_wlabel;
int *label_max_out;
char *str;
/* logarithmized transition matrix A, log(a(i,j)) => log_a[i*N+j],
1.0 for zero probability */
double **log_a;
/* matrix of hypotheses, holds for every position in the sequence a list
of hypotheses */
hypoList **h;
hypoList *hP;
/* vectors for rows in the matrices */
int *hypothesis;
/* pointer & prob. of the k most probable hypotheses for each state
- matrices of dimensions #states x k: argm(i,l) => argmaxs[i*k+l] */
double *maxima;
hypoList **argmaxs;
/* pointer to & probability of most probable hypothesis in a certain state */
hypoList *argmax;
double sum;
/* break if sequence empty or k<1: */
if (seq_len <= 0 || k <= 0)
return NULL;
ARRAY_CALLOC (h, seq_len);
/* 1. Initialization (extend empty hypothesis to #labels hypotheses of
length 1): */
/* get number of labels (= maximum label + 1)
and number of states with those labels */
ARRAY_CALLOC (states_wlabel, mo->N);
ARRAY_CALLOC (label_max_out, mo->N);
for (i = 0; i < mo->N; i++) {
c = mo->label[i];
states_wlabel[c]++;
if (c > no_labels)
no_labels = c;
if (mo->s[i].out_states > label_max_out[c])
label_max_out[c] = mo->s[i].out_states;
}
/* add one to the maximum label to get the number of labels */
no_labels++;
ARRAY_REALLOC (states_wlabel, no_labels);
ARRAY_REALLOC (label_max_out, no_labels);
/* initialize h: */
hP = h[0];
for (i = 0; i < mo->N; i++) {
if (mo->s[i].pi > KBEST_EPS) {
/* printf("Found State %d with initial probability %f\n", i, mo->s[i].pi); */
exists = 0;
while (hP != NULL) {
if (hP->hyp_c == mo->label[i]) {
/* add entry to the gamma list */
g_nr = hP->gamma_states;
hP->gamma_id[g_nr] = i;
hP->gamma_a[g_nr] =
log (mo->s[i].pi) +
log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);
hP->gamma_states = g_nr + 1;
exists = 1;
break;
}
else
hP = hP->next;
}
if (!exists) {
ighmm_hlist_insert (&(h[0]), mo->label[i], NULL);
/* initiallize gamma-array with safe size (number of states) and add the first entry */
ARRAY_MALLOC (h[0]->gamma_a, states_wlabel[mo->label[i]]);
ARRAY_MALLOC (h[0]->gamma_id, states_wlabel[mo->label[i]]);
h[0]->gamma_id[0] = i;
h[0]->gamma_a[0] =
log (mo->s[i].pi) +
log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);
h[0]->gamma_states = 1;
h[0]->chosen = 1;
}
hP = h[0];
}
}
/* reallocating the gamma list to the real size */
hP = h[0];
while (hP != NULL) {
ARRAY_REALLOC (hP->gamma_a, hP->gamma_states);
ARRAY_REALLOC (hP->gamma_id, hP->gamma_states);
hP = hP->next;
}
/* calculate transition matrix with logarithmic values: */
log_a = kbest_buildLogMatrix (mo->s, mo->N);
/* initialize temporary arrays: */
ARRAY_MALLOC (maxima, mo->N * k); /* for each state save k */
ARRAY_MALLOC (argmaxs, mo->N * k);
/*------ Main loop: Cycle through the sequence: ------*/
for (t = 1; t < seq_len; t++) {
/* put o_seq[t-1] in emission history: */
update_emission_history (mo, o_seq[t - 1]);
/* 2. Propagate hypotheses forward and update gamma: */
no_oldHyps =
ighmm_hlist_prop_forward (mo, h[t - 1], &(h[t]), no_labels, states_wlabel,
label_max_out);
/* printf("t = %d (%d), no of old hypotheses = %d\n", t, seq_len, no_oldHyps); */
/*-- calculate new gamma: --*/
hP = h[t];
/* cycle through list of hypotheses */
while (hP != NULL) {
for (i = 0; i < hP->gamma_states; i++) {
/* if hypothesis hP ends with label of state i:
gamma(i,c):= log(sum(exp(a(j,i)*exp(oldgamma(j,old_c)))))
+ log(b[i](o_seq[t]))
else: gamma(i,c):= -INF (represented by 1.0) */
i_id = hP->gamma_id[i];
hP->gamma_a[i] = ighmm_log_gamma_sum (log_a[i_id], &mo->s[i_id], hP->parent);
b_index = get_emission_index (mo, i_id, o_seq[t], t);
if (b_index < 0) {
hP->gamma_a[i] = 1.0;
if (mo->order[i_id] > t)
continue;
else {
str = ighmm_mprintf (NULL, 0,
"i_id: %d, o_seq[%d]=%d\ninvalid emission index!\n",
i_id, t, o_seq[t]);
GHMM_LOG(LCONVERTED, str);
m_free (str);
}
}
else
hP->gamma_a[i] += log (mo->s[i_id].b[b_index]);
/*printf("%g = %g\n", log(mo->s[i_id].b[b_index]), hP->gamma_a[i]); */
if (hP->gamma_a[i] > 0.0) {
GHMM_LOG(LCONVERTED, "gamma to large. ghmm_dl_kbest failed\n");
exit (1);
}
}
hP = hP->next;
}
/* 3. Choose the k most probable hypotheses for each state and discard all
hypotheses that were not chosen: */
/* initialize temporary arrays: */
for (i = 0; i < mo->N * k; i++) {
maxima[i] = 1.0;
argmaxs[i] = NULL;
}
/* cycle through hypotheses & calculate the k most probable hypotheses for
each state: */
hP = h[t];
while (hP != NULL) {
for (i = 0; i < hP->gamma_states; i++) {
i_id = hP->gamma_id[i];
if (hP->gamma_a[i] > KBEST_EPS)
continue;
/* find first best hypothesis that is worse than current hypothesis: */
for (l = 0;
l < k && maxima[i_id * k + l] < KBEST_EPS
&& maxima[i_id * k + l] > hP->gamma_a[i]; l++);
if (l < k) {
/* for each m>l: m'th best hypothesis becomes (m+1)'th best */
for (m = k - 1; m > l; m--) {
argmaxs[i_id * k + m] = argmaxs[i_id * k + m - 1];
maxima[i_id * k + m] = maxima[i_id * k + m - 1];
}
/* save new l'th best hypothesis: */
maxima[i_id * k + l] = hP->gamma_a[i];
argmaxs[i_id * k + l] = hP;
}
}
hP = hP->next;
}
/* set 'chosen' for all hypotheses from argmaxs array: */
for (i = 0; i < mo->N * k; i++)
/* only choose hypotheses whose prob. is at least threshold*max_prob */
if (maxima[i] != 1.0
&& maxima[i] >= KBEST_THRESHOLD + maxima[(i % mo->N) * k])
argmaxs[i]->chosen = 1;
/* remove hypotheses that were not chosen from the lists: */
/* remove all hypotheses till the first chosen one */
while (h[t] != NULL && 0 == h[t]->chosen)
ighmm_hlist_remove (&(h[t]));
/* remove all other not chosen hypotheses */
if (!h[t]) {
GHMM_LOG(LCONVERTED, "No chosen hypothesis. ghmm_dl_kbest failed\n");
exit (1);
}
hP = h[t];
while (hP->next != NULL) {
if (1 == hP->next->chosen)
hP = hP->next;
else
ighmm_hlist_remove (&(hP->next));
}
}
/* dispose of temporary arrays: */
m_free(states_wlabel);
m_free(label_max_out);
m_free(argmaxs);
m_free(maxima);
/* transition matrix is no longer needed from here on */
for (i=0; i<mo->N; i++)
m_free(log_a[i]);
m_free(log_a);
/* 4. Save the hypothesis with the highest probability over all states: */
hP = h[seq_len - 1];
argmax = NULL;
*log_p = 1.0; /* log_p will store log of maximum summed probability */
while (hP != NULL) {
/* sum probabilities for each hypothesis over all states: */
sum = ighmm_cvector_log_sum (hP->gamma_a, hP->gamma_states);
/* and select maximum sum */
if (sum < KBEST_EPS && (*log_p == 1.0 || sum > *log_p)) {
*log_p = sum;
argmax = hP;
}
hP = hP->next;
}
/* found a valid path? */
if (*log_p < KBEST_EPS) {
/* yes: extract chosen hypothesis: */
ARRAY_MALLOC (hypothesis, seq_len);
for (i = seq_len - 1; i >= 0; i--) {
hypothesis[i] = argmax->hyp_c;
argmax = argmax->parent;
}
}
else
/* no: return 1.0 representing -INF and an empty hypothesis */
hypothesis = NULL;
/* dispose of calculation matrices: */
hP = h[seq_len - 1];
while (hP != NULL)
ighmm_hlist_remove (&hP);
free (h);
return hypothesis;
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
GHMM_LOG(LCONVERTED, "ghmm_dl_kbest failed\n");
exit (1);
#undef CUR_PROC
}
/*================ utility functions ========================================*/
/* inserts new hypothesis into list at position indicated by pointer plist */
static void ighmm_hlist_insert (hypoList ** plist, int newhyp,
hypoList * parlist)
{
#define CUR_PROC "ighmm_hlist_insert"
hypoList *newlist;
ARRAY_CALLOC (newlist, 1);
newlist->hyp_c = newhyp;
if (parlist)
parlist->refcount += 1;
newlist->parent = parlist;
newlist->next = *plist;
*plist = newlist;
return;
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
GHMM_LOG(LCONVERTED, "ighmm_hlist_insert failed\n");
exit (1);
#undef CUR_PROC
}
/*============================================================================*/
/* removes hypothesis at position indicated by pointer plist from the list
removes recursively parent hypothesis with refcount==0 */
static void ighmm_hlist_remove (hypoList ** plist) {
#define CUR_PROC "ighmm_hlist_remove"
hypoList *tempPtr = (*plist)->next;
free ((*plist)->gamma_a);
free ((*plist)->gamma_id);
if ((*plist)->parent) {
(*plist)->parent->refcount -= 1;
if (0 == (*plist)->parent->refcount)
ighmm_hlist_remove (&((*plist)->parent));
}
free (*plist);
*plist = tempPtr;
#undef CUR_PROC
}
/*============================================================================*/
static int ighmm_hlist_prop_forward (ghmm_dmodel * mo, hypoList * h, hypoList ** hplus,
int labels, int *nr_s, int *max_out) {
#define CUR_PROC "ighmm_hlist_prop_forward"
int i, j, c, k;
int i_id, j_id, g_nr;
int no_oldHyps = 0, newHyps = 0;
hypoList *hP = h;
hypoList **created;
ARRAY_MALLOC (created, labels);
/* extend the all hypotheses with the labels of out_states
of all states in the hypotesis */
while (hP != NULL) {
/* lookup table for labels, created[i]!=0 iff the current hypotheses
was propagated forward with label i */
for (c = 0; c < labels; c++)
created[c] = NULL;
/* extend the current hypothesis and add all states which may have
probability greater null */
for (i = 0; i < hP->gamma_states; i++) {
/* skip impossible states */
if (hP->gamma_a[i] == 1.0)
continue;
i_id = hP->gamma_id[i];
for (j = 0; j < mo->s[i_id].out_states; j++) {
j_id = mo->s[i_id].out_id[j];
c = mo->label[j_id];
/* create a new hypothesis with label c */
if (!created[c]) {
ighmm_hlist_insert (hplus, c, hP);
created[c] = *hplus;
/* initiallize gamma-array with safe size (number of states */
ARRAY_MALLOC ((*hplus)->gamma_id, m_min (nr_s[c], hP->gamma_states * max_out[hP->hyp_c]));
(*hplus)->gamma_id[0] = j_id;
(*hplus)->gamma_states = 1;
newHyps++;
}
/* add a new gamma state to the existing hypothesis with c */
else {
g_nr = created[c]->gamma_states;
/* search for state j_id in the gamma list */
for (k = 0; k < g_nr; k++)
if (j_id == created[c]->gamma_id[k])
break;
/* add the state to the gamma list */
if (k == g_nr) {
created[c]->gamma_id[g_nr] = j_id;
created[c]->gamma_states = g_nr + 1;
}
}
}
}
/* reallocating gamma-array to the correct size */
for (c = 0; c < labels; c++) {
if (created[c]) {
ARRAY_CALLOC (created[c]->gamma_a, created[c]->gamma_states);
ARRAY_REALLOC (created[c]->gamma_id, created[c]->gamma_states);
created[c] = NULL;
}
}
hP = hP->next;
no_oldHyps++;
}
/* printf("Created %d new Hypotheses.\n", newHyps); */
free (created);
return (no_oldHyps);
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
GHMM_LOG(LCONVERTED, "ighmm_hlist_prop_forward failed\n");
exit (1);
#undef CUR_PROC
}
/*============================================================================*/
/**
Calculates the logarithm of sum(exp(log_a[j,a_pos])+exp(log_gamma[j,g_pos]))
which corresponds to the logarithm of the sum of a[j,a_pos]*gamma[j,g_pos]
@return ighmm_log_sum for products of a row from gamma and a row from matrix A
@param log_a: row of the transition matrix with logarithmic values (1.0 for log(0))
@param s: ghmm_dstate whose gamma-value is calculated
@param parent: a pointer to the parent hypothesis
*/
static double ighmm_log_gamma_sum (double *log_a, ghmm_dstate * s, hypoList * parent) {
#define CUR_PROC "ighmm_log_gamma_sum"
double result;
int j, j_id, k;
double max = 1.0;
int argmax = 0;
double *logP;
/* shortcut for the trivial case */
if (parent->gamma_states == 1)
for (j = 0; j < s->in_states; j++)
if (parent->gamma_id[0] == s->in_id[j])
return parent->gamma_a[0] + log_a[j];
ARRAY_MALLOC (logP, s->in_states);
/* calculate logs of a[k,l]*gamma[k,hi] as sums of logs and find maximum: */
for (j = 0; j < s->in_states; j++) {
j_id = s->in_id[j];
/* search for state j_id in the gamma list */
for (k = 0; k < parent->gamma_states; k++)
if (parent->gamma_id[k] == j_id)
break;
if (k == parent->gamma_states)
logP[j] = 1.0;
else {
logP[j] = log_a[j] + parent->gamma_a[k];
if (max == 1.0 || (logP[j] > max && logP[j] != 1.0)) {
max = logP[j];
argmax = j;
}
}
}
/* calculate max+log(1+sum[j!=argmax; exp(logP[j]-max)]) */
result = 1.0;
for (j = 0; j < s->in_states; j++)
if (j != argmax && logP[j] != 1.0)
result += exp (logP[j] - max);
result = log (result);
result += max;
free (logP);
return result;
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
GHMM_LOG(LCONVERTED, "ighmm_log_gamma_sum failed\n");
exit (1);
#undef CUR_PROC
}