/*******************************************************************************
*
* This file is part of the General Hidden Markov Model Library,
* GHMM version __VERSION__, see http://ghmm.org
*
* Filename: ghmm/ghmm/gradescent.c
* Authors: Janne Grunau, Alexander Riemer
*
* Copyright (C) 1998-2004 Alexander Schliep
* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
* Berlin
*
* Contact: schliep@ghmm.org
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* This file is version $Revision: 2262 $
* from $Date: 2009-04-22 09:44:25 -0400 (Wed, 22 Apr 2009) $
* last change by $Author: grunau $.
*
*******************************************************************************/
#ifdef HAVE_CONFIG_H
# include "../config.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "ghmm.h"
#include "mes.h"
#include "mprintf.h"
#include "matrix.h"
#include "model.h"
#include "foba.h"
#include "gradescent.h"
#include "ghmm_internals.h"
static double compute_performance (ghmm_dmodel * mo, ghmm_dseq * sq);
/*----------------------------------------------------------------------------*/
static void gradient_descent_gfree (double **matrix_b, double *matrix_a,
double *matrix_pi, int N)
{
#define CUR_PROC "gradient_descent_gfree"
int i;
if (matrix_b)
for (i = 0; i < N; i++) {
m_free (matrix_b[i]);
}
m_free (matrix_b);
m_free (matrix_a);
m_free (matrix_pi);
#undef CUR_PROC
}
/*----------------------------------------------------------------------------*/
/** allocates one set of expectation matrices (B, A and Pi vector), as used
    for both the m and the n matrices: */
static int gradient_descent_galloc (double ***matrix_b, double **matrix_a,
double **matrix_pi, ghmm_dmodel * mo)
{
#define CUR_PROC "gradient_descent_galloc"
  int i;
  /* ensure a defined state for the error path below */
  *matrix_b = NULL;
  *matrix_a = NULL;
  *matrix_pi = NULL;
  /* first allocate memory for matrix_b */
ARRAY_MALLOC (*matrix_b, mo->N);
for (i = 0; i < mo->N; i++)
ARRAY_CALLOC ((*matrix_b)[i], ghmm_ipow (mo, mo->M, mo->order[i] + 1));
/* matrix_a(i,j) = matrix_a[i*mo->N+j] */
ARRAY_CALLOC (*matrix_a, mo->N * mo->N);
/* allocate memory for matrix_pi */
ARRAY_CALLOC (*matrix_pi, mo->N);
return 0;
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
gradient_descent_gfree (*matrix_b, *matrix_a, *matrix_pi, mo->N);
return -1;
#undef CUR_PROC
}
/*----------------------------------------------------------------------------*/
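/**
   Accumulates the expected usage of the model parameters (Pi, A and B) from
   one scaled forward/backward pass; alpha, beta and scale may come from
   either the plain or the label-constrained forward/backward algorithm.
   @return 0/-1 success/error
   @param mo:       pointer to a ghmm_dmodel
   @param alpha:    scaled forward variables
   @param beta:     scaled backward variables
   @param scale:    scaling vector of the forward pass
   @param seq:      observation sequence
   @param seq_len:  length of the observation sequence
   @param matrix_b: expected emission usage, indexed as matrix_b[i][e_index]
   @param matrix_a: expected transition usage, indexed as matrix_a[i*mo->N+j]
   @param vec_pi:   expected initial state usage
   */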
int ghmm_dmodel_label_gradient_expectations (ghmm_dmodel* mo, double **alpha,
double **beta, double *scale, int *seq,
int seq_len, double **matrix_b,
double *matrix_a, double *vec_pi)
{
#define CUR_PROC "ghmm_dmodel_label_gradient_expectations"
int h, i, j, t;
int size, j_id, e_index;
double gamma, xi;
double foba_sum;
/* initialise matrices with zeros */
for (i = 0; i < mo->N; i++) {
for (j = 0; j < mo->N; j++)
matrix_a[i * mo->N + j] = 0;
size = ghmm_ipow (mo, mo->M, mo->order[i] + 1);
for (h = 0; h < size; h++)
matrix_b[i][h] = 0;
}
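  /* Identities used below (with scaled forward/backward variables):
   *   gamma_t(i) = alpha_t(i)*beta_t(i) / sum_j alpha_t(j)*beta_t(j)
   *   xi_t(i,j)  = alpha_t(i) * a_ij * b_j(o_{t+1}) * beta_{t+1}(j)
   *                / (scale_{t+1} * sum_j alpha_t(j)*beta_t(j))
   * so gamma and xi share the denominator foba_sum computed once per t. */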
for (t = 0; t < seq_len; t++) {
/* sum products of forward and backward variables over all states: */
foba_sum = 0.0;
for (i = 0; i < mo->N; i++)
foba_sum += alpha[t][i] * beta[t][i];
    if (GHMM_EPS_PREC > fabs (foba_sum)) {
      printf
        ("%s: foba_sum (%g) smaller than GHMM_EPS_PREC (%g) at t = %d.\n",
         CUR_PROC, foba_sum, GHMM_EPS_PREC, t);
      return -1;
    }
for (i = 0; i < mo->N; i++) {
/* compute gamma implicit */
gamma = alpha[t][i] * beta[t][i] / foba_sum;
/* n_pi is easiest: n_pi(i) = gamma(0,i) */
if (0 == t)
vec_pi[i] = gamma;
/* n_b(i,c) = sum[t, hist(t)=c | gamma(t,i)] / sum[t | gamma(t,i)] */
e_index = get_emission_index (mo, i, seq[t], t);
if (-1 != e_index)
matrix_b[i][e_index] += gamma;
}
    /* update the emission history; xi needs the correct e_index for the
       next observation */
update_emission_history (mo, seq[t]);
for (i = 0; i < mo->N; i++) {
/* n_a(i,j) = sum[t=0..T-2 | xi(t,i,j)] / sum[t=0..T-2 | gamma(t,i)] */
/* compute xi only till the state before the last */
for (j = 0; (j < mo->s[i].out_states) && (t < seq_len - 1); j++) {
j_id = mo->s[i].out_id[j];
/* compute xi implicit */
xi = 0;
e_index = get_emission_index (mo, j_id, seq[t + 1], t + 1);
if (e_index != -1)
xi = alpha[t][i] * beta[t + 1][j_id] * mo->s[i].out_a[j]
* mo->s[j_id].b[e_index] / (scale[t + 1] * foba_sum);
matrix_a[i * mo->N + j_id] += xi;
}
}
}
return 0;
#undef CUR_PROC
}
/*----------------------------------------------------------------------------*/
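/**
   Measures the training performance of the model on a set of annotated
   sequences: the sum over all sequences of
     log P(O, labels | lambda) - log P(O | lambda) = log P(labels | O, lambda),
   which is <= 0 for a valid model.
   */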
static double compute_performance (ghmm_dmodel * mo, ghmm_dseq * sq)
{
#define CUR_PROC "compute_performance"
int k, seq_len, success = 1;
/* log P[O | lambda, labeling] as computed by the forward algorithm */
double log_p;
/* sum over log P (calculated by forward_label) for all sequences
used to compute the performance of the training */
double log_p_sum = 0.0;
/* loop over all sequences */
for (k = 0; k < sq->seq_number && success; k++) {
success = 0;
seq_len = sq->seq_len[k];
if (-1 !=
ghmm_dmodel_label_logp (mo, sq->seq[k], sq->state_labels[k], seq_len,
&log_p)) {
success = 1;
log_p_sum += log_p;
if (-1 != ghmm_dmodel_logp (mo, sq->seq[k], seq_len, &log_p))
log_p_sum -= log_p;
else {
printf ("ghmm_dmodel_logp error in sequence %d, length: %d\n", k,
seq_len);
success = 0;
}
}
else
printf ("ghmm_dl_logp error in sequence %d, length: %d\n", k,
seq_len);
}
  /* return log_p_sum on success, or +1.0 (an impossible log probability)
     on error */
if (success)
return log_p_sum;
else
return 1.0;
#undef CUR_PROC
}
/*----------------------------------------------------------------------------*/
/**
   Performs one gradient descent training step on the model with a set of
   annotated sequences. The model must not have silent states.
   @return 0/-1 success/error
   @param mo:  pointer to a ghmm_dmodel
   @param sq:  struct of annotated sequences
   @param eta: learning rate for the gradient descent step
   */
static int gradient_descent_onestep (ghmm_dmodel * mo, ghmm_dseq * sq, double eta)
{
#define CUR_PROC "gradient_descent_onestep"
int g, h, i, j, k;
int seq_len, j_id, hist, size;
/* expected usage of model parameters (A and B entries) */
double *m_pi, *n_pi, *m_a, *n_a;
double **m_b, **n_b;
/* forward & backward variables w/ their scaling vector */
double **alpha, **beta, *scale;
/* variables to store values associated with the algorithm */
double pi_sum, a_row_sum, b_block_sum, gradient;
/* log P[O | lambda, labeling] as computed by the forward algorithm */
double log_p;
/* allocate memory for the parameters used for reestimation */
if (-1 == gradient_descent_galloc (&m_b, &m_a, &m_pi, mo))
return -1;
if (-1 == gradient_descent_galloc (&n_b, &n_a, &n_pi, mo))
return -1;
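  /* Per sequence, two forward/backward passes are run: an unconstrained one
     yielding the n expectations and a label-constrained one yielding the m
     expectations; the parameter updates below follow eta * (m - n). */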
/* loop over all sequences */
for (k = 0; k < sq->seq_number; k++) {
seq_len = sq->seq_len[k];
if (-1 == ighmm_reestimate_alloc_matvek (&alpha, &beta, &scale, seq_len, mo->N))
continue;
/* calculate forward and backward variables without labels: */
if (-1 == ghmm_dmodel_forward (mo, sq->seq[k], seq_len, alpha, scale, &log_p)) {
printf ("forward error!\n");
goto FREE;
}
if (-1 == ghmm_dmodel_backward (mo, sq->seq[k], seq_len, beta, scale)) {
printf ("backward error!\n");
goto FREE;
}
/* compute n matrices (no labels): */
    if (-1 ==
        ghmm_dmodel_label_gradient_expectations (mo, alpha, beta, scale, sq->seq[k],
                                      seq_len, n_b, n_a, n_pi))
printf ("Error in sequence %d, length %d (no labels)\n", k, seq_len);
/* calculate forward and backward variables with labels: */
if (-1 ==
ghmm_dmodel_label_forward (mo, sq->seq[k], sq->state_labels[k], seq_len,
alpha, scale, &log_p)) {
printf ("forward labels error!\n");
goto FREE;
}
if (-1 ==
ghmm_dmodel_label_backward (mo, sq->seq[k], sq->state_labels[k], seq_len,
beta, scale, &log_p)) {
printf ("backward labels error!\n");
goto FREE;
}
/* compute m matrices (labels): */
if (-1 ==
ghmm_dmodel_label_gradient_expectations (mo, alpha, beta, scale, sq->seq[k],
seq_len, m_b, m_a, m_pi))
printf ("Error in sequence %d, length %d (with labels)\n", k, seq_len);
/* reestimate model parameters: */
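    /* Each parameter theta ascends the gradient of
         log P(O, labels | lambda) - log P(O | lambda):
           theta += eta * (m - n),
       clipped from below at GHMM_EPS_PREC to keep the model stochastic,
       then renormalised. */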
/* PI */
pi_sum = 0;
/* update */
for (i = 0; i < mo->N; i++) {
if (mo->s[i].pi > 0.0) {
gradient = eta * (m_pi[i] - n_pi[i]);
if (mo->s[i].pi + gradient > GHMM_EPS_PREC)
mo->s[i].pi += gradient;
else
mo->s[i].pi = GHMM_EPS_PREC;
}
/* sum over new PI vector */
pi_sum += mo->s[i].pi;
}
if (pi_sum < GHMM_EPS_PREC) {
      /* should not happen: every pi entry was clipped to GHMM_EPS_PREC above */
fprintf (stderr, "Training ruined the model. You lose.\n");
k = sq->seq_number;
goto FREE;
}
/* normalise */
for (i = 0; i < mo->N; i++)
mo->s[i].pi /= pi_sum;
/* A */
for (i = 0; i < mo->N; i++) {
a_row_sum = 0;
/* update */
for (j = 0; j < mo->s[i].out_states; j++) {
j_id = mo->s[i].out_id[j];
        gradient = eta * (m_a[i * mo->N + j_id] - n_a[i * mo->N + j_id])
                       / (seq_len - 1);
if (mo->s[i].out_a[j] + gradient > GHMM_EPS_PREC)
mo->s[i].out_a[j] += gradient;
else
mo->s[i].out_a[j] = GHMM_EPS_PREC;
/* sum over rows of new A matrix */
a_row_sum += mo->s[i].out_a[j];
}
if (a_row_sum < GHMM_EPS_PREC) {
        /* should not happen: every row entry was clipped to GHMM_EPS_PREC above */
fprintf (stderr, "Training ruined the model. You lose.\n");
k = sq->seq_number;
goto FREE;
}
/* normalise */
for (j = 0; j < mo->s[i].out_states; j++) {
mo->s[i].out_a[j] /= a_row_sum;
/* mirror out_a to corresponding in_a */
j_id = mo->s[i].out_id[j];
for (g = 0; g < mo->s[j_id].in_states; g++)
if (i == mo->s[j_id].in_id[g]) {
mo->s[j_id].in_a[g] = mo->s[i].out_a[j];
break;
}
}
}
/* B */
for (i = 0; i < mo->N; i++) {
/* don't update fix states */
if (mo->s[i].fix)
continue;
/* update */
size = ghmm_ipow (mo, mo->M, mo->order[i]);
for (h = 0; h < size; h++) {
b_block_sum = 0;
for (g = 0; g < mo->M; g++) {
hist = h * mo->M + g;
gradient = eta * (m_b[i][hist] - n_b[i][hist]) / seq_len;
/* printf("gradient[%d][%d] = %g, m_b = %g, n_b = %g\n"
, i, hist, gradient, m_b[i][hist], n_b[i][hist]); */
if (gradient + mo->s[i].b[hist] > GHMM_EPS_PREC)
mo->s[i].b[hist] += gradient;
else
mo->s[i].b[hist] = GHMM_EPS_PREC;
/* sum over M-length blocks of new B matrix */
b_block_sum += mo->s[i].b[hist];
}
if (b_block_sum < GHMM_EPS_PREC) {
          /* should not happen: every block entry was clipped to GHMM_EPS_PREC above */
fprintf (stderr, "Training ruined the model. You lose.\n");
k = sq->seq_number;
goto FREE;
}
/* normalise */
for (g = 0; g < mo->M; g++) {
hist = h * mo->M + g;
mo->s[i].b[hist] /= b_block_sum;
}
}
}
/* restore "tied_to" property */
if (mo->model_type & GHMM_kTiedEmissions)
ghmm_dmodel_update_tie_groups (mo);
FREE:
ighmm_reestimate_free_matvek (alpha, beta, scale, seq_len);
}
gradient_descent_gfree (m_b, m_a, m_pi, mo->N);
gradient_descent_gfree (n_b, n_a, n_pi, mo->N);
return 0;
#undef CUR_PROC
}
/*----------------------------------------------------------------------------*/
/**
   Trains the ghmm_dmodel with a set of annotated sequences until convergence,
   using gradient descent.
   Model must not have silent states (checked in the Python wrapper).
   Note that the model passed in may be freed and replaced by a copy during
   training; callers must continue with the returned pointer.
   @return trained model on success, NULL pointer on error
   @param mo: pointer to a ghmm_dmodel
   @param sq: struct of annotated sequences
   @param eta: initial learning rate
   @param no_steps: maximum number of training steps
   */
ghmm_dmodel* ghmm_dmodel_label_gradient_descent (ghmm_dmodel* mo, ghmm_dseq * sq, double eta, int no_steps)
{
#define CUR_PROC "ghmm_dmodel_label_gradient_descent"
char * str;
int runs = 0;
double cur_perf, last_perf;
ghmm_dmodel *last;
last = ghmm_dmodel_copy(mo);
last_perf = compute_performance (last, sq);
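  /* Adaptive learning rate: eta grows by 7% after each accepted step, is
     halved when the model degenerates, and shrinks (x0.85, then x0.9) when a
     step fails to improve the performance. Training stops after no_steps
     steps or once eta drops below GHMM_EPS_PREC. */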
while (eta > GHMM_EPS_PREC && runs < no_steps) {
runs++;
if (-1 == gradient_descent_onestep(mo, sq, eta)) {
ghmm_dmodel_free(&last);
return NULL;
}
cur_perf = compute_performance(mo, sq);
if (last_perf < cur_perf) {
/* if model is degenerated, lower eta and try again */
if (cur_perf > 0.0) {
str = ighmm_mprintf(NULL, 0, "current performance = %g", cur_perf);
GHMM_LOG(LINFO, str);
m_free(str);
ghmm_dmodel_free(&mo);
mo = ghmm_dmodel_copy(last);
eta *= .5;
}
else {
/* Improvement insignificant, assume convergence */
if (fabs (last_perf - cur_perf) < cur_perf * (-1e-8)) {
ghmm_dmodel_free(&last);
str = ighmm_mprintf(NULL, 0, "convergence after %d steps.", runs);
GHMM_LOG(LINFO, str);
m_free(str);
          return mo;
}
if (runs < 175 || 0 == runs % 50) {
str = ighmm_mprintf(NULL, 0, "Performance: %g\t improvement: %g\t step %d", cur_perf,
cur_perf - last_perf, runs);
GHMM_LOG(LINFO, str);
m_free(str);
}
/* significant improvement, next iteration */
ghmm_dmodel_free(&last);
last = ghmm_dmodel_copy(mo);
last_perf = cur_perf;
eta *= 1.07;
}
}
/* no improvement */
else {
if (runs < 175 || 0 == runs % 50) {
str = ighmm_mprintf(NULL, 0, "Performance: %g\t !IMPROVEMENT: %g\t step %d", cur_perf,
cur_perf - last_perf, runs);
GHMM_LOG(LINFO, str);
m_free(str);
}
/* try another training step */
runs++;
eta *= .85;
if (-1 == gradient_descent_onestep(mo, sq, eta)) {
ghmm_dmodel_free(&last);
return NULL;
}
cur_perf = compute_performance (mo, sq);
str = ighmm_mprintf(NULL, 0, "Performance: %g\t ?Improvement: %g\t step %d", cur_perf,
cur_perf - last_perf, runs);
GHMM_LOG(LINFO, str);
m_free(str);
/* improvement, save and proceed with next iteration */
if (last_perf < cur_perf && cur_perf < 0.0) {
ghmm_dmodel_free (&last);
last = ghmm_dmodel_copy(mo);
last_perf = cur_perf;
}
/* still no improvement, revert to saved model */
else {
runs--;
ghmm_dmodel_free(&mo);
mo = ghmm_dmodel_copy(last);
eta *= .9;
}
}
}
ghmm_dmodel_free(&last);
return mo;
#undef CUR_PROC
}
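/*----------------------------------------------------------------------------*/
/* Minimal usage sketch (hypothetical driver code; assumes a discrete model
 * `mo` and matching annotated sequences `sq` were created elsewhere):
 *
 *   ghmm_dmodel *trained;
 *   double eta = 0.5;    initial learning rate (hypothetical value)
 *   int steps = 100;     maximum number of training steps (hypothetical)
 *
 *   trained = ghmm_dmodel_label_gradient_descent (mo, sq, eta, steps);
 *   if (NULL == trained)
 *     handle the error; `mo` may already have been freed internally
 *   else
 *     continue with `trained`; do not reuse the original `mo` pointer
 */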