/*******************************************************************************
*
* This file is part of the General Hidden Markov Model Library,
* GHMM version __VERSION__, see http://ghmm.org
*
* Filename: ghmm/ghmm/scluster.c
* Authors: Bernhard Knab, Benjamin Georgi
*
* Copyright (C) 1998-2004 Alexander Schliep
* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
* Berlin
*
* Contact: schliep@ghmm.org
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* This file is version $Revision: 2267 $
* from $Date: 2009-04-24 11:01:58 -0400 (Fri, 24 Apr 2009) $
* last change by $Author: grunau $.
*
*******************************************************************************/
#ifdef HAVE_CONFIG_H
# include "../config.h"
#endif
#include "ghmmconfig.h"
#ifdef GHMM_OBSOLETE
#include <stdio.h>
#undef HAVE_LIBPTHREAD
#ifdef HAVE_LIBPTHREAD
# include <pthread.h>
#endif /* HAVE_LIBPTHREAD */
#include <float.h>
#include <math.h>
#include <time.h>
#include "ghmm.h"
#include "mprintf.h"
#include "mes.h"
#include "scluster.h"
#include "smodel.h"
#include "sreestimate.h"
#include "sfoba.h"
#include "rng.h"
#include "sequence.h"
#include "matrix.h"
#include "vector.h"
#include "ghmm_internals.h"
#include "obsolete.h"
#include "smap_classify.h"
#ifdef HAVE_LIBPTHREAD
/* switch for parallel mode: (1) = sequential, (0) = parallel */
# define POUT 0
/* number of parallel threads */
# define THREADS 4
#else /* */
# define POUT 1
#endif /* HAVE_LIBPTHREAD */
#define POUT 1
/*============================================================================*/
int ghmm_scluster_hmm (char *argv[])
{
# define CUR_PROC "ghmm_scluster_hmm"
char *seq_file = argv[1], *smo_file = argv[2], *out_filename = argv[3];
int labels = atoi (argv[4]);
int res = -1, i, iter = 0, sqd_number, idummy;
ghmm_cseq * sqd = NULL;
ghmm_cseq ** sqd_vec = NULL; /* only temp. pointer */
long j, changes = 1;
long *oldlabel, *smo_changed;
double log_p, log_apo;
double **all_log_p = NULL; /* for storing all log_p */
FILE * outfile = NULL;
char *str;
char filename[1024];
scluster_t cl;
double eps_bw, q;
int max_iter_bw;
/* ghmm_cmodel_baum_welch needs this structure (introduced for parallel mode) */
ghmm_cmodel_baum_welch_context * cs;
#if POUT == 0
int *return_value;
pthread_t * tid;
int perror;
pthread_attr_t Attribute;
#endif /* POUT */
cl.smo = NULL;
cl.smo_seq = NULL;
cl.seq_counter = NULL;
cl.smo_Z_MD = NULL;
cl.smo_Z_MAW = NULL;
/* -----------------initialization ------------------------- */
fprintf (stdout, "Clustering seqs. \"%s\"\nwith ", seq_file);
if (CLASSIFY == 0)
fprintf (stdout, "MD-Classification\n");
else
fprintf (stdout, "MAW-Classification\n");
fflush (stdout);
/*-----------open output file----------------------------------*/
sprintf (filename, "%s%s", out_filename, ".cl");
if (!(outfile = ighmm_mes_fopen (filename, "wt"))) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
/*--------------Memory allocation and data reading----------------------------*/
/* 1 sequence array and initial models */
ghmm_scluster_print_header (outfile, argv);
/*--- memory alloc and read data ----------------------------*/
sqd_vec = ghmm_cseq_read (seq_file, &sqd_number);
if (!sqd_vec) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
sqd = sqd_vec[0];
cl.smo = ghmm_cmodel_read (smo_file, &cl.smo_number);
if (!cl.smo) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
/* if labels == 0 || labels == 1: no startlabels needed. */
/* if `labels` = 3: random start labels */
if (labels == 3) {
fprintf (outfile, "(random start partition)n");
fprintf (stdout, "(Random start partition...)\n");
ghmm_scluster_random_labels (sqd, cl.smo_number);
}
ARRAY_CALLOC (oldlabel, sqd->seq_number);
for (i = 0; i < sqd->seq_number; i++)
oldlabel[i] = (-1);
ARRAY_CALLOC (cl.smo_seq, cl.smo_number);
ARRAY_CALLOC (cl.seq_counter, cl.smo_number);
ARRAY_CALLOC (cl.smo_Z_MD, cl.smo_number);
ARRAY_CALLOC (cl.smo_Z_MAW, cl.smo_number);
all_log_p = ighmm_cmatrix_alloc (cl.smo_number, (int) sqd->seq_number);
if (!all_log_p) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
/*if (ghmm_cmodel_check_compatibility(cl.smo, cl.smo_number)) {
GHMM_LOG_QUEUED(LCONVERTED); goto STOP;
} */
ARRAY_CALLOC (smo_changed, cl.smo_number);
for (i = 0; i < cl.smo_number; i++) {
cl.smo_seq[i] = NULL;
smo_changed[i] = 1;
}
/* uninformative model prior */
if (CLASSIFY == 1)
for (i = 0; i < cl.smo_number; i++)
cl.smo[i]->prior = 1 / (double) cl.smo_number;
/*--------for parallel mode --------------------*/
#if POUT == 0
/* id for threads */
ARRAY_CALLOC (tid, cl.smo_number);
#endif /* POUT */
/* data structure for threads */
ARRAY_CALLOC (cs, cl.smo_number);
for (i = 0; i < cl.smo_number; i++)
cs[i].smo = cl.smo[i];
/* return values for each thread */
#if POUT == 0
ARRAY_CALLOC (return_value, cl.smo_number);
#endif /* POUT */
#ifdef HAVE_LIBPTHREAD
pthread_attr_init (&Attribute);
pthread_attr_setscope (&Attribute, PTHREAD_SCOPE_SYSTEM);
#endif /* HAVE_LIBPTHREAD */
/* eps_bw: stopping criterion in baum-welch
max_iter_bw: max. number of baum-welch iterations */
eps_bw = 0.0; /* not used */
q = 0.1; /* ??? */
max_iter_bw = 20; /*MAX_ITER_BW; */
/*------------------------main loop-------------------------------------------*/
/* do it until no sequence changes model;
attention: error function values are not used directly as a stopping criterion */
while (changes > 0 || eps_bw > GHMM_EPS_ITER_BW || max_iter_bw < GHMM_MAX_ITER_BW) {
iter++;
/* reset error functions and counters */
for (i = 0; i < cl.smo_number; i++) {
cl.seq_counter[i] = 0;
cl.smo_Z_MD[i] = 0.0;
cl.smo_Z_MAW[i] = 0.0;
}
/* set pointer of cs to sqd-field and matrix of all_logp values */
for (i = 0; i < cl.smo_number; i++) {
cs[i].sqd = sqd; /* all seqs. ! */
cs[i].logp = all_log_p[i];
}
/* ------------calculate logp for all seqs. and all models -------------- */
#if POUT
/* sequential version */
for (i = 0; i < cl.smo_number; i++) {
if (!smo_changed[i])
continue;
ghmm_scluster_prob (&cs[i]);
}
#else /* */
/* parallel version */
i = 0;
while (i < cl.smo_number) {
for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) {
if (!smo_changed[j])
continue;
if ((perror = pthread_create (&tid[j], &Attribute,
(void *(*)(void *)) scluster_prob,
(void *) &cs[j]))) {
str =
ighmm_mprintf (NULL, 0,
"pthread_create returns false (step %d, smo[%d])\n",
iter, j);
GHMM_LOG(LCONVERTED, str);
m_free (str);
goto STOP;
}
}
for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) {
if (smo_changed[j])
pthread_join (tid[j], NULL);
}
i = j;
}
#endif /* */
if (iter == 1 && labels > 1) {
/* use sequence labels as start labels */
fprintf (outfile, "(sequences with start labels!)\n");
fprintf (stdout, "(sequences with start labels!)\n");
}
/*----------- classification, sequence labeling and error function -----------*/
if (iter == 1 && labels == 1) {
for (i = 0; i < cl.smo_number; i++) {
cl.seq_counter[i] = sqd->seq_number;
}
}
else {
for (j = 0; j < sqd->seq_number; j++) {
if (iter > 1 || labels == 0)
/* classification: set seq_label to ID of best_model */
sqd->seq_label[j] =
ghmm_scluster_best_model (&cl, j, all_log_p, &log_p);
if (sqd->seq_label[j] == -1 || sqd->seq_label[j] >= cl.smo_number) {
/* no model fits! What to do? hack: use arbitrary model ! */
str = ighmm_mprintf (NULL, 0,
"Warning: seq. %ld, ID %.0f: ghmm_scluster_best_model returns %d\n",
j, sqd->seq_id[j], sqd->seq_label[j]);
GHMM_LOG(LCONVERTED, str);
m_free (str);
sqd->seq_label[j] = j % cl.smo_number;
/* goto STOP; */
}
cl.seq_counter[sqd->seq_label[j]]++;
/* add to error function value with seq. weights */
/* 1. Z_MD */
cl.smo_Z_MD[sqd->seq_label[j]] +=
sqd->seq_w[j] * all_log_p[sqd->seq_label[j]][j];
/* 2. Z_MAW */
if (CLASSIFY == 1) {
idummy = ghmm_scluster_log_aposteriori (&cl, sqd, j, &log_apo);
if (idummy == -1) {
str = ighmm_mprintf (NULL, 0,
"Warn: no model fits to Seq %10.0f, use PENALTY_LOGP\n",
sqd->seq_id[j]);
GHMM_LOG(LCONVERTED, str);
m_free (str);
cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * GHMM_PENALTY_LOGP;
continue;
}
if (idummy != sqd->seq_label[j]) {
printf ("Warn: best_model = %ld; best_bayes(logapo) = %d\n",
sqd->seq_label[j], idummy);
}
cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * log_apo;
}
} /* for (j = 0; j < sqd->seq_number; j++) */
} /* else */
if (!(iter == 1 && labels == 1)) {
if (ghmm_scluster_avoid_empty_smodel (sqd, &cl) == -1) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
for (i = 0; i < cl.smo_number; i++)
smo_changed[i] = 0;
changes =
ghmm_scluster_update_label (oldlabel, sqd->seq_label, sqd->seq_number,
smo_changed);
}
fprintf (outfile, "%ld changes in iteration %d \n", changes, iter);
fprintf (stdout, "\n*** %ld changes in iteration %d ***\n", changes,
iter);
/* NEW: If no changes anymore: increase max_iter
( Alternative: change eps ) */
if (changes == 0 && max_iter_bw < GHMM_MAX_ITER_BW) {
max_iter_bw = GHMM_MAX_ITER_BW;
changes = 1; /* so that reestimated and assigned again */
for (i = 0; i < cl.smo_number; i++)
smo_changed[i] = 1;
fprintf (outfile, "max_iter_bw := %d\n", max_iter_bw);
fprintf (stdout, "*** max_iter_bw := %d ***\n", max_iter_bw);
}
/* -------Reestimate all models with assigned sequences---------------- */
if (changes > 0) {
if (!(iter == 1 && labels == 1))
if (ghmm_scluster_update (&cl, sqd)) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
/* TEST: Write out k-Means-Start-Clusterung */
if (iter == 1 && labels == 2) {
for (i = 0; i < cl.smo_number; i++) {
if (cl.smo_seq[i] != NULL)
ghmm_cseq_print (stdout, cl.smo_seq[i], 0);
}
}
/*if (labels == -1) {
fprintf(outfile, "(only determining the best model for each seq.!)\n");
fprintf(stdout, "(only determining the best model for each seq.!)\n");
break;
}
*/
fprintf (outfile, "\ntotal Prob. before %d.reestimate:\n", iter);
ghmm_scluster_print_likelihood (outfile, &cl);
for (i = 0; i < cl.smo_number; i++) {
cs[i].smo = cl.smo[i];
if (!(iter == 1 && labels == 1))
cs[i].sqd = cl.smo_seq[i];
cs[i].logp = &cl.smo_Z_MD[i];
cs[i].eps = eps_bw;
cs[i].max_iter = max_iter_bw;
}
#if POUT
/* sequential version */
for (i = 0; i < cl.smo_number; i++) {
printf ("SMO %d\n", i);
if (!smo_changed[i])
continue;
if (cs[i].sqd == NULL)
ighmm_mes (MES_WIN, "cluster %d empty, no reestimate!\n", i);
else if (ghmm_cmodel_baum_welch (&cs[i]) == -1) {
str = ighmm_mprintf (NULL, 0, "%d.reestimate false, smo[%d]\n", iter, i);
GHMM_LOG(LCONVERTED, str);
m_free (str);
/* ghmm_dmodel_print(stdout, cl.mo[i]); */
goto STOP;
}
}
#else /* */
/* parallel version */
i = 0;
while (i < cl.smo_number) {
for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) {
if (!smo_changed[j])
continue;
if (cs[j].sqd == NULL)
ighmm_mes (MES_WIN, "cluster %d empty, no reestimate!\n", j);
else if ((perror = pthread_create (&tid[j], &Attribute,
(void *(*)(void *))
sreestimate_baum_welch,
(void *) &cs[j]))) {
str =
ighmm_mprintf (NULL, 0,
"pthread_create returns false (step %d, smo[%d])\n",
iter, j);
GHMM_LOG(LCONVERTED, str);
m_free (str);
goto STOP;
}
}
for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) {
if (!smo_changed[j])
continue;
if (cs[j].sqd != NULL) {
pthread_join (tid[j], (void **) &return_value[j]);
if (return_value[j] == -1) {
str =
ighmm_mprintf (NULL, 0, "%d.reestimate false, smo[%d]\n", iter, j);
GHMM_LOG(LCONVERTED, str);
m_free (str);
goto STOP;
}
}
}
i = j;
}
#endif /* */
/* update model priors */
if (CLASSIFY == 1) {
if (sqd->total_w == 0) {
GHMM_LOG(LCONVERTED, "weights = 0!\n");
goto STOP;
}
for (i = 0; i < cl.smo_number; i++)
cl.smo[i]->prior = cl.smo_seq[i]->total_w / sqd->total_w;
}
fprintf (outfile, "\ntotal Prob. after %d.reestimate:\n", iter);
ghmm_scluster_print_likelihood (outfile, &cl);
} /* if changes ... end reestimate */
} /* while */
/*------------------- OUTPUT -----------------------------------------------*/
if (ghmm_scluster_out (&cl, sqd, outfile, argv) == -1)
{
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
/*--------------------------------------------------------------------------*/
res = 0;
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
#if POUT == 0
pthread_attr_destroy (&Attribute);
#endif /* POUT */
/* XXX ...still div. free! XXX */
if (outfile)
fclose (outfile);
return (res);
# undef CUR_PROC
} /* ghmm_scluster_hmm */
/*============================================================================*/
int ghmm_scluster_t_free (scluster_t * scl)
{
int i;
#define CUR_PROC "ghmm_scluster_t_free"
mes_check_ptr (scl, return (-1));
if (!scl)
return (0);
for (i = 0; i <= scl->smo_number; i++) {
ghmm_cmodel_free (&(scl->smo[i]));
ghmm_cseq_free (&(scl->smo_seq[i]));
}
printf ("hier1\n");
m_free (scl->smo);
m_free (scl->smo_seq);
m_free (scl->seq_counter);
m_free (scl->smo_Z_MD);
m_free (scl->smo_Z_MAW);
return 0;
# undef CUR_PROC
}
/*============================================================================*/
int ghmm_scluster_out (scluster_t * cl, ghmm_cseq * sqd, FILE * outfile,
char *argv[])
{
#define CUR_PROC "ghmm_scluster_out"
int res = -1, i;
char filename[128];
char *out_filename = argv[3];
FILE * out_model = NULL;
/*fprintf(outfile, "\nFinal Models:\n");
for (i = 0; i < cl->smo_number; i++) {
fprintf(outfile, "smodel[%d]:\n", i);
ghmm_cmodel_print(outfile, cl->smo[i]);
fprintf(outfile, "Sequences of smodel[%d] (# %ld, total weight %.0f)\n",
i, cl->smo_seq[i]->seq_number,
cl->smo_seq[i]->total_w);*/
/*for (k = 0; k < sqd->seq_number; k++) {*/
/*if (sqd->seq_label[k] == i) {*/
/* fuer Wetterdaten: Numerierung von 1 - .. */
/*fprintf(outfile, "\t%4d\t|%.0f|\t", k+1, sqd->seq_w[k]);*/
/*ighmm_cvector_print(outfile, sqd->seq[k], sqd->seq_len[k]," "," ","");*/
/*} */
/*}*/
/*
fprintf(outfile, "(%ld sequences)\n\n", cl->seq_counter[i]);
}*/
sprintf (filename, "%s.smo", out_filename);
if (!(out_model = ighmm_mes_fopen (filename, "wt"))) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
ghmm_scluster_print_header (out_model, argv);
for (i = 0; i < cl->smo_number; i++) {
fprintf (out_model, "#trained smodel[%d]:\n", i);
ghmm_cmodel_print (out_model, cl->smo[i]);
}
fclose (out_model);
/* Output from all sequences in HMM-format; different clusters
make seperate lists */
fclose (out_model);
sprintf (filename, "%s.sqd", out_filename);
if (!(out_model = ighmm_mes_fopen (filename, "wt"))) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
ghmm_scluster_print_header (out_model, argv);
for (i = 0; i < cl->smo_number; i++) {
if (cl->smo_seq[i] != NULL)
ghmm_cseq_print (out_model, cl->smo_seq[i], 0);
}
/* Output from all sequences in one row with clusterlabel */
/* ghmm_cseq_print(out_model, sqd, 0); */
/* Output: The number of sequences and sequence weights pro cluster,
respektively (for the generator) */
fclose (out_model);
sprintf (filename, "%s.numbers", out_filename);
if (!(out_model = ighmm_mes_fopen (filename, "wt"))) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
ghmm_scluster_print_header (out_model, argv);
fprintf (out_model, "numbers = {\n");
fprintf (out_model,
"# Clusterung mit Gewichten --> in BS/10, sonst Anzahl Seqs.\n");
/* Weights */
if (cl->smo_seq[0]->total_w > cl->smo_seq[0]->seq_number) {
for (i = 0; i < cl->smo_number - 1; i++)
fprintf (out_model, "%.0f,\n", 0.1 * cl->smo_seq[i]->total_w);
fprintf (out_model, "%.0f;\n};",
0.1 * cl->smo_seq[cl->smo_number - 1]->total_w);
}
/* Number */
else {
for (i = 0; i < cl->smo_number - 1; i++)
fprintf (out_model, "%ld,\n", cl->seq_counter[i]);
fprintf (out_model, "%ld;\n};", cl->seq_counter[cl->smo_number - 1]);
}
res = 0;
STOP:if (out_model)
fclose (out_model);
return (res);
#undef CUR_PROC
} /* ghmm_scluster_out */
/*============================================================================*/
/* Memory for sequences for each model is allocated only once and not done with
realloc for each sequence as before. */
int ghmm_scluster_update (scluster_t * cl, ghmm_cseq * sqd)
{
#define CUR_PROC "ghmm_scluster_update"
int i;
ghmm_cseq * seq_ptr;
/* Allocate memoery block by block */
for (i = 0; i < cl->smo_number; i++) {
if (cl->smo_seq[i]) {
/* Important: No ghmm_dseq_free here, because then the originals will be lost. */
ghmm_cseq_clean (cl->smo_seq[i]);
m_free (cl->smo_seq[i]);
}
if (cl->seq_counter[i] != 0) {
cl->smo_seq[i] = ghmm_cseq_calloc (cl->seq_counter[i]);
cl->smo_seq[i]->seq_number = 0; /* counted below */
}
else
cl->smo_seq[i] = NULL;
}
/* Set entries */
for (i = 0; i < sqd->seq_number; i++) {
seq_ptr = cl->smo_seq[sqd->seq_label[i]];
seq_ptr->seq_len[seq_ptr->seq_number] = sqd->seq_len[i];
seq_ptr->seq_id[seq_ptr->seq_number] = sqd->seq_id[i];
seq_ptr->seq[seq_ptr->seq_number] = sqd->seq[i]; /* Pointer!!! */
seq_ptr->seq_label[seq_ptr->seq_number] = sqd->seq_label[i];
seq_ptr->seq_w[seq_ptr->seq_number] = sqd->seq_w[i];
seq_ptr->seq_number++;
seq_ptr->total_w += sqd->seq_w[i];
}
return (0);
# undef CUR_PROC
} /* ghmm_scluster_update */
/*============================================================================*/
void ghmm_scluster_print_likelihood (FILE * outfile, scluster_t * cl)
{
double total_Z_MD = 0.0, total_Z_MAW = 0.0;
int i;
for (i = 0; i < cl->smo_number; i++) {
total_Z_MD += cl->smo_Z_MD[i];
total_Z_MAW += cl->smo_Z_MAW[i];
fprintf (outfile, "smo %d\t(#Seq. %4ld):\t", i, cl->seq_counter[i]);
if (CLASSIFY == 0)
fprintf (outfile, "ZMD:%8.2f", cl->smo_Z_MD[i]);
else
fprintf (outfile, "ZMAW:%8.2f", cl->smo_Z_MAW[i]);
fprintf (outfile, "\n");
}
fprintf (outfile, "sum\t");
if (CLASSIFY == 0)
fprintf (outfile, "ZMD: %12.5f", total_Z_MD);
else
fprintf (outfile, "ZMAW: %12.5f", total_Z_MAW);
fprintf (outfile, "\n\n");
if (CLASSIFY == 0)
printf ("total error function (ZMD): %15.4f\n", total_Z_MD);
else
printf ("total error function (ZMAW): %15.4f\n", total_Z_MAW);
} /* ghmm_scluster_print_likelihood */
/*============================================================================*/
/* Avoids empty models going out as outputs by assigning them a random
sequence. This may lead to a produce of a new empty model - therefore
change out sequences until a non empty model is found. (Quit after 100
iterations to avoid a infinite loop). */
int ghmm_scluster_avoid_empty_smodel (ghmm_cseq * sqd, scluster_t * cl)
{
#define CUR_PROC "ghmm_scluster_avoid_empty_smodel"
int i, best_smo;
long i_old, j = 0;
char error = 1, change = 0;
int iter = 0;
double log_p_minus, log_p_plus;
double *result = NULL;
/* If only one Model or one sequencet: Nothing to be done */
if (sqd->seq_number < 2 || cl->smo_number < 2)
return (0);
if (CLASSIFY == 1)
ARRAY_CALLOC (result, cl->smo_number);
while (error && iter < 100) {
iter++;
error = 0;
for (i = 0; i < cl->smo_number; i++) {
if (cl->seq_counter[i] <= 1) {
change = 1;
/* Choose a random sequence for an empty model */
j = m_int (GHMM_RNG_UNIFORM (RNG) * (sqd->seq_number - 1));
/* If it's not possible to produce a sequence for an empty model: Quit */
if (ghmm_cmodel_logp
(cl->smo[i], sqd->seq[j], sqd->seq_len[j], &log_p_plus) == -1)
continue;
if (CLASSIFY == 1) {
best_smo =
ghmm_smap_bayes (cl->smo, result, cl->smo_number, sqd->seq[j],
sqd->seq_len[j]);
if (best_smo == -1)
continue;
}
i_old = sqd->seq_label[j];
/* updates */
printf ("!!\"avoid_empty_model\": move seq. %ld: %ld --> %d !!\n",
j, i_old, i);
cl->seq_counter[i_old]--;
cl->seq_counter[i]++;
sqd->seq_label[j] = i;
/* smo_Z_MD update */
if (ghmm_cmodel_logp
(cl->smo[i_old], sqd->seq[j], sqd->seq_len[j],
&log_p_minus) == -1)
{
GHMM_LOG(LCONVERTED, "ghmm_cmodel_logp returns -1!\n");
goto STOP;
};
cl->smo_Z_MD[i_old] -= sqd->seq_w[j] * log_p_minus;
cl->smo_Z_MD[i] += sqd->seq_w[j] * log_p_plus;
/* smo_Z_MAW update */
if (CLASSIFY == 1) {
cl->smo_Z_MAW[i_old] -= sqd->seq_w[j] * result[i_old];
cl->smo_Z_MAW[i] += sqd->seq_w[j] * result[i];
}
}
}
/* Is now everything OK ? */
if (change) {
for (i = 0; i < cl->smo_number; i++) {
if (cl->seq_counter[i] <= 1) {
error = 1;
break;
}
}
}
} /* while */
if (!error)
return (0);
STOP:
if (result){
m_free (result);
}
return (-1);
#undef CUR_PROC
} /* ghmm_scluster_avoid_empty_smodel */
/*============================================================================*/
long ghmm_scluster_update_label (long *oldlabel, long *seq_label, long seq_number,
long *smo_changed)
{
long i, changes = 0;
for (i = 0; i < seq_number; i++)
if (oldlabel[i] != seq_label[i]) {
changes++;
smo_changed[oldlabel[i]] = smo_changed[seq_label[i]] = 1;
oldlabel[i] = seq_label[i];
}
return changes;
} /* ghmm_scluster_update_label */
/*============================================================================*/
/* Determines from an already calculated probability matrix, which model
fits best to the sequence with the ID seq_id. */
int ghmm_scluster_best_model (scluster_t * cl, long seq_id, double **all_log_p,
double *log_p)
{
#define CUR_PROC "ghmm_scluster_best_model"
int i, smo_id;
double save = -DBL_MAX;
*log_p = -DBL_MAX;
smo_id = -1;
/* MD-Classify: argmax_i (log(p(O | \lambda_i ))) */
if (CLASSIFY == 0) {
for (i = 0; i < cl->smo_number; i++) {
if (all_log_p[i][seq_id] == GHMM_PENALTY_LOGP)
continue; /* model and seq. don't fit */
if (*log_p < all_log_p[i][seq_id]) {
*log_p = all_log_p[i][seq_id];
smo_id = i;
}
}
}
/* MAW-Classify: argmax_i (log(p(O | \lambda_i )) + log(p( \lambda_i))) */
else {
for (i = 0; i < cl->smo_number; i++) {
if (cl->smo[i]->prior == -1) {
GHMM_LOG(LCONVERTED, "Error! Need model prior for MAW-classification\n");
}
if (cl->smo[i]->prior != 0) {
if (all_log_p[i][seq_id] == GHMM_PENALTY_LOGP)
continue; /* model and seq. don't fit */
if (save < all_log_p[i][seq_id] + log (cl->smo[i]->prior)) {
save = all_log_p[i][seq_id] + log (cl->smo[i]->prior);
*log_p = all_log_p[i][seq_id];
smo_id = i;
}
}
}
}
return (smo_id);
#undef CUR_PROC
} /* ghmm_scluster_best_model */
/*============================================================================*/
void ghmm_scluster_prob (ghmm_cmodel_baum_welch_context * cs)
{
int i;
/*printf("seq_num = %d\n", cs->sqd->seq_number);*/
for (i = 0; i < cs->sqd->seq_number; i++)
if (ghmm_cmodel_logp
(cs->smo, cs->sqd->seq[i], cs->sqd->seq_len[i],
&(cs->logp[i])) == -1)
cs->logp[i] = (double) GHMM_PENALTY_LOGP; /* Penalty costs */
} /* ghmm_scluster_prob */
/*============================================================================*/
int ghmm_scluster_random_labels (ghmm_cseq * sqd, int smo_number)
{
#define CUR_PROC "ghmm_scluster_random_labels"
int j, label;
for (j = 0; j < sqd->seq_number; j++) {
label = m_int (GHMM_RNG_UNIFORM (RNG) * (smo_number - 1));
sqd->seq_label[j] = label;
}
return (0);
# undef CUR_PROC
} /* ghmm_scluster_random_labels */
/*============================================================================*/
int ghmm_scluster_log_aposteriori (scluster_t * cl, ghmm_cseq * sqd,
int seq_id, double *log_apo)
{
#define CUR_PROC "ghmm_scluster_log_aposteriori"
double *result = NULL;
int best_smo = -1;
ARRAY_CALLOC (result, cl->smo_number);
best_smo =
ghmm_smap_bayes (cl->smo, result, cl->smo_number, sqd->seq[seq_id],
sqd->seq_len[seq_id]);
if (best_smo == -1) {
GHMM_LOG(LCONVERTED, "best_smo == -1 !\n");
goto STOP;
}
else
*log_apo = result[best_smo];
/* TEST */
/* printf("result: ");
for (i = 0; i< cl->smo_number; i++)
printf("%.2f ", result[i]); printf("\n");
*/
STOP:m_free (result);
return best_smo;
# undef CUR_PROC
} /* ghmm_scluster_log_aposteriori */
/*============================================================================*/
void ghmm_scluster_print_header (FILE * file, char *argv[])
{
time_t zeit;
time (&zeit);
fprintf (file, "# Date: %s# scluster:\n", ctime ((const time_t *) &zeit));
fprintf (file, "# Sequence File: %s\n", argv[1]);
fprintf (file, "# Model File: %s\n", argv[2]);
fprintf (file, "# Start Partion Label: ");
switch (atoi (argv[4])) {
case 0:
fprintf (file, "SP_BEST (best model)\n\n");
break;
case 1:
fprintf (file, "NO_SP (no start partition)\n\n");
break;
case 2:
fprintf (file, "SP_KM (partition from k-means)\n\n");
break;
case 3:
fprintf (file, "SP_ZUF (random start partition)\n\n");
break;
default:
fprintf (file, "???\n\n");
}
}
#if POUT == 0
#undef THREADS
#endif /* HAVE_LIBPTHREAD */
#undef POUT
#endif /* GHMM_OBSOLETE */