ghmm/kbest.c - ghmm (HEAD)

Tree @HEAD (Download .tar.gz)

kbest.c @HEAD — raw · history · blame

/*******************************************************************************
*
*       This file is part of the General Hidden Markov Model Library,
*       GHMM version __VERSION__, see http://ghmm.org
*
*       Filename: ghmm/ghmm/kbest.c
*       Authors:  Anyess von Bock, Alexander Riemer, Janne Grunau
*
*       Copyright (C) 1998-2004 Alexander Schliep
*       Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
*       Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
*                               Berlin
*
*       Contact: schliep@ghmm.org
*
*       This library is free software; you can redistribute it and/or
*       modify it under the terms of the GNU Library General Public
*       License as published by the Free Software Foundation; either
*       version 2 of the License, or (at your option) any later version.
*
*       This library is distributed in the hope that it will be useful,
*       but WITHOUT ANY WARRANTY; without even the implied warranty of
*       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
*       Library General Public License for more details.
*
*       You should have received a copy of the GNU Library General Public
*       License along with this library; if not, write to the Free
*       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
*       This file is version $Revision: 2264 $
*                       from $Date: 2009-04-22 11:13:40 -0400 (Wed, 22 Apr 2009) $
*             last change by $Author: grunau $.
*
*******************************************************************************/

/** \file kbest.c
 */

#ifdef HAVE_CONFIG_H
#  include "../config.h"
#endif

#include <stdlib.h>
#include <math.h>

#include "mes.h"
#include "mprintf.h"
#include "model.h"
#include "kbest.h"
#include "ghmm_internals.h"


/** threshold probability (logarithmized) */
#define KBEST_THRESHOLD -3.50655789732
/** log(0.03) => threshold: 3% of most probable partial hypothesis */
#define KBEST_EPS 1E-15


/*============================================================================*/

/** Data type for single linked list of hypotheses.
 */
typedef struct hypo_List {
  int hyp_c;                /**< hypothesis */
  int refcount;             /**< counter of the links to this hypothesis */
  int chosen;
  int gamma_states;
  double *gamma_a;
  int *gamma_id;
  struct hypo_List *next;   /**< next list element */
  struct hypo_List *parent; /**< parent hypothesis */
} hypoList;

/*============================================================================*/
/* inserts new hypothesis into list at position indicated by pointer plist */
static void ighmm_hlist_insert (hypoList ** plist, int newhyp,
                                hypoList * parlist);

/*============================================================================*/
/* removes hypothesis at position indicated by pointer plist from the list */
static void ighmm_hlist_remove (hypoList ** plist);


/*============================================================================*/
/**
   Propagates list of hypotheses forward by extending each old hypothesis to
   the possible new hypotheses depending on the states in which the old
   hypothesis could end and the reachable labels
   @return number of old hypotheses
   @param mo:         pointer to the ghmm_dmodel
   @param h:          pointer to list of hypotheses
   @param hplus:      address of a pointer to store the propagated hypotheses
   @param labels:     number of labels
   @param nr_s:       number states which have assigned the index aa label
   @param max_out:    maximum number of out_states over all states with the index aa label
 */
static int ighmm_hlist_prop_forward (ghmm_dmodel * mo, hypoList * h, hypoList ** hplus, int labels,
                     int *nr_s, int *max_out);


/*============================================================================*/
/**
   Calculates the logarithm of sum(exp(log_a[j,a_pos])+exp(log_gamma[j,g_pos]))
   which corresponds to the logarithm of the sum of a[j,a_pos]*gamma[j,g_pos]
   @return log. sum for products of a row from gamma and a row from matrix A
   @param log_a:      transition matrix with logarithmic values (1.0 for log(0))
   @param s:          ghmm_dstate whose gamma-value is calculated
   @param parent:     a pointer to the parent hypothesis
*/
static double ighmm_log_gamma_sum (double *log_a, ghmm_dstate * s, hypoList * parent);


/*============================================================================*/
/**
  Builds logarithmic transition matrix from the states' in_a values
  the row for each state is the logarithmic version of the state's in_a
  @return transition matrix with logarithmic values, 1.0 if a[i,j] = 0
  @param s:           array of all states of the model
  @param N:           number of states in the model
 */
static double **kbest_buildLogMatrix (ghmm_dstate * s, int N)
{
#define CUR_PROC "kbest_buildLogMatrix"
  int i, j;
  double **log_a;               /* log(a(i,j)) => log_a[i*N+j] */

  /* create & initialize matrix: */
  ARRAY_MALLOC (log_a, N);
  for (i = 0; i < N; i++) {
    ARRAY_MALLOC (log_a[i], s[i].in_states);
    for (j = 0; j < s[i].in_states; j++)
      log_a[i][j] = log (s[i].in_a[j]);
  }
  return log_a;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "kbest_buildLogMatrix failed\n");
  exit (1);
#undef CUR_PROC
}


/*============================================================================*/
int *ghmm_dmodel_label_kbest (ghmm_dmodel * mo, int *o_seq, int seq_len, int k, double *log_p)
{
#define CUR_PROC "ghmm_dl_kbest"
  int i, t, c, l, m;            /* counters */
  int no_oldHyps;               /* number of hypotheses until position t-1 */
  int b_index, i_id;            /* index for addressing states' b arrays */
  int no_labels = 0;
  int exists, g_nr;
  int *states_wlabel;
  int *label_max_out;
  char *str;

  /* logarithmized transition matrix A, log(a(i,j)) => log_a[i*N+j],
     1.0 for zero probability */
  double **log_a;

  /* matrix of hypotheses, holds for every position in the sequence a list
     of hypotheses */
  hypoList **h;
  hypoList *hP;

  /* vectors for rows in the matrices */
  int *hypothesis;

  /* pointer & prob. of the k most probable hypotheses for each state
     - matrices of dimensions #states x k:  argm(i,l) => argmaxs[i*k+l] */
  double *maxima;
  hypoList **argmaxs;

  /* pointer to & probability of most probable hypothesis in a certain state */
  hypoList *argmax;
  double sum;

  /* break if sequence empty or k<1: */
  if (seq_len <= 0 || k <= 0)
    return NULL;

  ARRAY_CALLOC (h, seq_len);

  /* 1. Initialization (extend empty hypothesis to #labels hypotheses of
         length 1): */

  /* get number of labels (= maximum label + 1)
     and number of states with those labels */
  ARRAY_CALLOC (states_wlabel, mo->N);
  ARRAY_CALLOC (label_max_out, mo->N);
  for (i = 0; i < mo->N; i++) {
    c = mo->label[i];
    states_wlabel[c]++;
    if (c > no_labels)
      no_labels = c;
    if (mo->s[i].out_states > label_max_out[c])
      label_max_out[c] = mo->s[i].out_states;
  }
  /* add one to the maximum label to get the number of labels */
  no_labels++;
  ARRAY_REALLOC (states_wlabel, no_labels);
  ARRAY_REALLOC (label_max_out, no_labels);

  /* initialize h: */
  hP = h[0];
  for (i = 0; i < mo->N; i++) {
    if (mo->s[i].pi > KBEST_EPS) {
      /* printf("Found State %d with initial probability %f\n", i, mo->s[i].pi); */
      exists = 0;
      while (hP != NULL) {
        if (hP->hyp_c == mo->label[i]) {
          /* add entry to the gamma list */
          g_nr = hP->gamma_states;
          hP->gamma_id[g_nr] = i;
          hP->gamma_a[g_nr] =
            log (mo->s[i].pi) +
            log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);
          hP->gamma_states = g_nr + 1;
          exists = 1;
          break;
        }
        else
          hP = hP->next;
      }
      if (!exists) {
        ighmm_hlist_insert (&(h[0]), mo->label[i], NULL);
        /* initiallize gamma-array with safe size (number of states) and add the first entry */
        ARRAY_MALLOC (h[0]->gamma_a, states_wlabel[mo->label[i]]);
        ARRAY_MALLOC (h[0]->gamma_id, states_wlabel[mo->label[i]]);
        h[0]->gamma_id[0] = i;
        h[0]->gamma_a[0] =
          log (mo->s[i].pi) +
          log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);
        h[0]->gamma_states = 1;
        h[0]->chosen = 1;
      }
      hP = h[0];
    }
  }
  /* reallocating the gamma list to the real size */
  hP = h[0];
  while (hP != NULL) {
    ARRAY_REALLOC (hP->gamma_a, hP->gamma_states);
    ARRAY_REALLOC (hP->gamma_id, hP->gamma_states);
    hP = hP->next;
  }

  /* calculate transition matrix with logarithmic values: */
  log_a = kbest_buildLogMatrix (mo->s, mo->N);

  /* initialize temporary arrays: */
  ARRAY_MALLOC (maxima, mo->N * k);                             /* for each state save k */
  ARRAY_MALLOC (argmaxs, mo->N * k);


  /*------ Main loop: Cycle through the sequence: ------*/
  for (t = 1; t < seq_len; t++) {

    /* put o_seq[t-1] in emission history: */
    update_emission_history (mo, o_seq[t - 1]);

    /* 2. Propagate hypotheses forward and update gamma: */
    no_oldHyps =
      ighmm_hlist_prop_forward (mo, h[t - 1], &(h[t]), no_labels, states_wlabel,
                     label_max_out);
    /* printf("t = %d (%d), no of old hypotheses = %d\n", t, seq_len, no_oldHyps); */

    /*-- calculate new gamma: --*/
    hP = h[t];
    /* cycle through list of hypotheses */
    while (hP != NULL) {

      for (i = 0; i < hP->gamma_states; i++) {
        /* if hypothesis hP ends with label of state i:
           gamma(i,c):= log(sum(exp(a(j,i)*exp(oldgamma(j,old_c)))))
           + log(b[i](o_seq[t]))
           else: gamma(i,c):= -INF (represented by 1.0) */
        i_id = hP->gamma_id[i];
        hP->gamma_a[i] = ighmm_log_gamma_sum (log_a[i_id], &mo->s[i_id], hP->parent);
        b_index = get_emission_index (mo, i_id, o_seq[t], t);
        if (b_index < 0) {
          hP->gamma_a[i] = 1.0;
          if (mo->order[i_id] > t)
            continue;
          else {
            str = ighmm_mprintf (NULL, 0,
                           "i_id: %d, o_seq[%d]=%d\ninvalid emission index!\n",
                           i_id, t, o_seq[t]);
            GHMM_LOG(LCONVERTED, str);
            m_free (str);
          }
        }
        else
          hP->gamma_a[i] += log (mo->s[i_id].b[b_index]);
        /*printf("%g = %g\n", log(mo->s[i_id].b[b_index]), hP->gamma_a[i]); */
        if (hP->gamma_a[i] > 0.0) {
          GHMM_LOG(LCONVERTED, "gamma to large. ghmm_dl_kbest failed\n");
          exit (1);
        }
      }
      hP = hP->next;
    }

    /* 3. Choose the k most probable hypotheses for each state and discard all
	   hypotheses that were not chosen: */

    /* initialize temporary arrays: */
    for (i = 0; i < mo->N * k; i++) {
      maxima[i] = 1.0;
      argmaxs[i] = NULL;
    }

    /* cycle through hypotheses & calculate the k most probable hypotheses for
       each state: */
    hP = h[t];
    while (hP != NULL) {
      for (i = 0; i < hP->gamma_states; i++) {
        i_id = hP->gamma_id[i];
        if (hP->gamma_a[i] > KBEST_EPS)
          continue;
        /* find first best hypothesis that is worse than current hypothesis: */
        for (l = 0;
             l < k && maxima[i_id * k + l] < KBEST_EPS
             && maxima[i_id * k + l] > hP->gamma_a[i]; l++);
        if (l < k) {
          /* for each m>l: m'th best hypothesis becomes (m+1)'th best */
          for (m = k - 1; m > l; m--) {
            argmaxs[i_id * k + m] = argmaxs[i_id * k + m - 1];
            maxima[i_id * k + m] = maxima[i_id * k + m - 1];
          }
          /* save new l'th best hypothesis: */
          maxima[i_id * k + l] = hP->gamma_a[i];
          argmaxs[i_id * k + l] = hP;
        }
      }
      hP = hP->next;
    }

    /* set 'chosen' for all hypotheses from argmaxs array: */
    for (i = 0; i < mo->N * k; i++)
      /* only choose hypotheses whose prob. is at least threshold*max_prob */
      if (maxima[i] != 1.0
          && maxima[i] >= KBEST_THRESHOLD + maxima[(i % mo->N) * k])
        argmaxs[i]->chosen = 1;

    /* remove hypotheses that were not chosen from the lists: */
    /* remove all hypotheses till the first chosen one */
    while (h[t] != NULL && 0 == h[t]->chosen)
      ighmm_hlist_remove (&(h[t]));
    /* remove all other not chosen hypotheses */
    if (!h[t]) {
      GHMM_LOG(LCONVERTED, "No chosen hypothesis. ghmm_dl_kbest failed\n");
      exit (1);
    }
    hP = h[t];
    while (hP->next != NULL) {
      if (1 == hP->next->chosen)
        hP = hP->next;
      else
        ighmm_hlist_remove (&(hP->next));
    }
  }
  /* dispose of temporary arrays: */
  m_free(states_wlabel);
  m_free(label_max_out);
  m_free(argmaxs);
  m_free(maxima);
  /* transition matrix is no longer needed from here on */
  for (i=0; i<mo->N; i++)
    m_free(log_a[i]);
  m_free(log_a);

  /* 4. Save the hypothesis with the highest probability over all states: */
  hP = h[seq_len - 1];
  argmax = NULL;
  *log_p = 1.0;                 /* log_p will store log of maximum summed probability */
  while (hP != NULL) {
    /* sum probabilities for each hypothesis over all states: */
    sum = ighmm_cvector_log_sum (hP->gamma_a, hP->gamma_states);
    /* and select maximum sum */
    if (sum < KBEST_EPS && (*log_p == 1.0 || sum > *log_p)) {
      *log_p = sum;
      argmax = hP;
    }
    hP = hP->next;
  }

  /* found a valid path? */
  if (*log_p < KBEST_EPS) {
    /* yes: extract chosen hypothesis: */
    ARRAY_MALLOC (hypothesis, seq_len);
    for (i = seq_len - 1; i >= 0; i--) {
      hypothesis[i] = argmax->hyp_c;
      argmax = argmax->parent;
    }
  }
  else
    /* no: return 1.0 representing -INF and an empty hypothesis */
    hypothesis = NULL;

  /* dispose of calculation matrices: */
  hP = h[seq_len - 1];
  while (hP != NULL)
    ighmm_hlist_remove (&hP);
  free (h);
  return hypothesis;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "ghmm_dl_kbest failed\n");
  exit (1);
#undef CUR_PROC
}



/*================ utility functions ========================================*/
/* inserts new hypothesis into list at position indicated by pointer plist */
static void ighmm_hlist_insert (hypoList ** plist, int newhyp,
                              hypoList * parlist)
{
#define CUR_PROC "ighmm_hlist_insert"
  hypoList *newlist;

  ARRAY_CALLOC (newlist, 1);
  newlist->hyp_c = newhyp;
  if (parlist)
    parlist->refcount += 1;
  newlist->parent = parlist;
  newlist->next = *plist;

  *plist = newlist;
  return;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "ighmm_hlist_insert failed\n");
  exit (1);
#undef CUR_PROC
}

/*============================================================================*/
/* removes hypothesis at position indicated by pointer plist from the list
   removes recursively parent hypothesis with refcount==0 */
static void ighmm_hlist_remove (hypoList ** plist) {
#define CUR_PROC "ighmm_hlist_remove"
  hypoList *tempPtr = (*plist)->next;

  free ((*plist)->gamma_a);
  free ((*plist)->gamma_id);
  if ((*plist)->parent) {
    (*plist)->parent->refcount -= 1;
    if (0 == (*plist)->parent->refcount)
      ighmm_hlist_remove (&((*plist)->parent));
  }
  free (*plist);

  *plist = tempPtr;
#undef CUR_PROC
}

/*============================================================================*/
static int ighmm_hlist_prop_forward (ghmm_dmodel * mo, hypoList * h, hypoList ** hplus,
				     int labels, int *nr_s, int *max_out) {
#define CUR_PROC "ighmm_hlist_prop_forward"
  int i, j, c, k;
  int i_id, j_id, g_nr;
  int no_oldHyps = 0, newHyps = 0;
  hypoList *hP = h;
  hypoList **created;

  ARRAY_MALLOC (created, labels);

  /* extend the all hypotheses with the labels of out_states
     of all states in the hypotesis */
  while (hP != NULL) {

    /* lookup table for labels, created[i]!=0 iff the current hypotheses
       was propagated forward with label i */
    for (c = 0; c < labels; c++)
      created[c] = NULL;

    /* extend the current hypothesis and add all states which may have
       probability greater null */
    for (i = 0; i < hP->gamma_states; i++) {
      /* skip impossible states */
      if (hP->gamma_a[i] == 1.0)
        continue;
      i_id = hP->gamma_id[i];
      for (j = 0; j < mo->s[i_id].out_states; j++) {
        j_id = mo->s[i_id].out_id[j];
        c = mo->label[j_id];

        /* create a new hypothesis with label c */
        if (!created[c]) {
          ighmm_hlist_insert (hplus, c, hP);
          created[c] = *hplus;
          /* initiallize gamma-array with safe size (number of states */
          ARRAY_MALLOC ((*hplus)->gamma_id, m_min (nr_s[c], hP->gamma_states * max_out[hP->hyp_c]));
          (*hplus)->gamma_id[0] = j_id;
          (*hplus)->gamma_states = 1;
          newHyps++;
        }
        /* add a new gamma state to the existing hypothesis with c */
        else {
          g_nr = created[c]->gamma_states;
          /* search for state j_id in the gamma list */
          for (k = 0; k < g_nr; k++)
            if (j_id == created[c]->gamma_id[k])
              break;
          /* add the state to the gamma list */
          if (k == g_nr) {
            created[c]->gamma_id[g_nr] = j_id;
            created[c]->gamma_states = g_nr + 1;
          }
        }
      }
    }
    /* reallocating gamma-array to the correct size */
    for (c = 0; c < labels; c++) {
      if (created[c]) {
        ARRAY_CALLOC (created[c]->gamma_a, created[c]->gamma_states);
        ARRAY_REALLOC (created[c]->gamma_id, created[c]->gamma_states);
        created[c] = NULL;
      }
    }
    hP = hP->next;
    no_oldHyps++;
  }

  /* printf("Created %d new Hypotheses.\n", newHyps); */
  free (created);
  return (no_oldHyps);
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "ighmm_hlist_prop_forward failed\n");
  exit (1);
#undef CUR_PROC
}


/*============================================================================*/
/**
   Calculates the logarithm of sum(exp(log_a[j,a_pos])+exp(log_gamma[j,g_pos]))
   which corresponds to the logarithm of the sum of a[j,a_pos]*gamma[j,g_pos]
   @return ighmm_log_sum for products of a row from gamma and a row from matrix A
   @param log_a:      row of the transition matrix with logarithmic values (1.0 for log(0))
   @param s:          ghmm_dstate whose gamma-value is calculated
   @param parent:     a pointer to the parent hypothesis
*/
static double ighmm_log_gamma_sum (double *log_a, ghmm_dstate * s, hypoList * parent) {
#define CUR_PROC "ighmm_log_gamma_sum"
  double result;
  int j, j_id, k;
  double max = 1.0;
  int argmax = 0;
  double *logP;

  /* shortcut for the trivial case */
  if (parent->gamma_states == 1)
    for (j = 0; j < s->in_states; j++)
      if (parent->gamma_id[0] == s->in_id[j])
        return parent->gamma_a[0] + log_a[j];

  ARRAY_MALLOC (logP, s->in_states);

  /* calculate logs of a[k,l]*gamma[k,hi] as sums of logs and find maximum: */
  for (j = 0; j < s->in_states; j++) {
    j_id = s->in_id[j];
    /* search for state j_id in the gamma list */
    for (k = 0; k < parent->gamma_states; k++)
      if (parent->gamma_id[k] == j_id)
        break;
    if (k == parent->gamma_states)
      logP[j] = 1.0;
    else {
      logP[j] = log_a[j] + parent->gamma_a[k];
      if (max == 1.0 || (logP[j] > max && logP[j] != 1.0)) {
        max = logP[j];
        argmax = j;
      }
    }
  }

  /* calculate max+log(1+sum[j!=argmax; exp(logP[j]-max)])  */
  result = 1.0;
  for (j = 0; j < s->in_states; j++)
    if (j != argmax && logP[j] != 1.0)
      result += exp (logP[j] - max);

  result = log (result);
  result += max;

  free (logP);
  return result;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "ighmm_log_gamma_sum failed\n");
  exit (1);
#undef CUR_PROC
}