/*******************************************************************************
*
* This file is part of the General Hidden Markov Model Library,
* GHMM version __VERSION__, see http://ghmm.org
*
* Filename: ghmm/ghmm/cluster.c
* Authors: Bernd Wichern
*
* Copyright (C) 1998-2004 Alexander Schliep
* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
* Berlin
*
* Contact: schliep@ghmm.org
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* This file is version $Revision: 2267 $
* from $Date: 2009-04-24 11:01:58 -0400 (Fri, 24 Apr 2009) $
* last change by $Author: grunau $.
*
*******************************************************************************/
#ifdef HAVE_CONFIG_H
# include "../config.h"
#endif
#include "ghmmconfig.h"
#ifdef GHMM_OBSOLETE
#include <stdio.h>
#include "ghmm.h"
#include "mprintf.h"
#include "mes.h"
#include "cluster.h"
#include "rng.h"
#include "model.h"
#include "sequence.h"
#include "reestimate.h"
#include "foba.h"
#include "matrix.h"
#include "ghmm_internals.h"
#include "obsolete.h"
/*============================================================================*/
int ghmm_cluster_hmm (char *seq_file, char *mo_file, char *out_filename)
{
# define CUR_PROC "ghmm_cluster_hmm"
int res = -1, i, iter = 0, sq_number;
ghmm_dseq *sq = NULL, **sq_vec = NULL;
long j, changes = 1;
long *oldlabel;
double log_p;
char * str;
FILE *outfile = NULL;
cluster_t cl;
cl.mo = NULL;
cl.mo_seq = NULL;
if (!(outfile = ighmm_mes_fopen (out_filename, "wt"))) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
/*----------------------------------------------------------------------------*/
/* Allocate memory and read in data: Sequences and initial model */
sq_vec = ghmm_dseq_read (seq_file, &sq_number);
if (!sq_vec[0]) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
if (sq_number > 1)
GHMM_LOG(LCONVERTED, "Warning: seq. file contains multiple seq. arrays. \
Only first array is used for clustering\n");
sq = sq_vec[0];
fprintf (outfile, "Cluster Sequences\n");
ghmm_dseq_print (outfile, sq);
cl.mo = ghmm_dmodel_read (mo_file, &cl.mo_number);
if (!cl.mo) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
ARRAY_CALLOC (oldlabel, sq->seq_number);
for (i = 0; i < sq->seq_number; i++)
oldlabel[i] = (-1);
ARRAY_CALLOC (cl.mo_seq, cl.mo_number);
for (i = 0; i < cl.mo_number; i++)
cl.mo_seq[i] = NULL;
if (ghmm_dmodel_check_compatibility (cl.mo, cl.mo_number)) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
/*----------------------------------------------------------------------------*/
fprintf (outfile, "\nInitial Models:\n");
for (i = 0; i < cl.mo_number; i++)
ghmm_dmodel_print (outfile, cl.mo[i]);
/*----------------------------------------------------------------------------*/
while (changes > 0) {
iter++;
/* Associate the sequences */
fprintf (outfile, "\nSequence, Best Model, logP of generating Seq.:\n");
for (j = 0; j < sq->seq_number; j++) {
sq->seq_label[j] =
ghmm_dseq_best_model (cl.mo, cl.mo_number, sq->seq[j], sq->seq_len[j],
&log_p);
fprintf (outfile, "seq %ld, mo %ld, log p %.4f\n", j,
sq->seq_label[j], log_p);
if (sq->seq_label[j] == -1 || sq->seq_label[j] >= cl.mo_number) {
/* No model fits! */
str = ighmm_mprintf (NULL, 0, "Seq. %ld: ghmm_dseq_best_model gives %d\n",
j, sq->seq_label[j]);
GHMM_LOG(LCONVERTED, str);
m_free (str);
goto STOP;
}
}
if (ghmm_cluster_avoid_empty_model
(sq->seq_label, sq->seq_number, cl.mo_number)) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
changes = ghmm_cluster_update_label (oldlabel, sq->seq_label, sq->seq_number);
fprintf (outfile, "%ld changes\n", changes);
fprintf (stdout, "\n*** %ld changes in iteration %d ***\n\n", changes,
iter);
/* Reestimate models with the associated sequences */
if (changes > 0) {
if (ghmm_cluster_update (&cl, sq)) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
fprintf (outfile, "\nGes. WS VOR %d.Reestimate:\n", iter);
ghmm_cluster_print_likelihood (outfile, &cl);
for (i = 0; i < cl.mo_number; i++) {
if (ghmm_dmodel_baum_welch (cl.mo[i], cl.mo_seq[i])) {
str = ighmm_mprintf (NULL, 0, "%d.reestimate false, mo[%d]\n", iter, i);
GHMM_LOG(LCONVERTED, str);
m_free (str);
/* ghmm_dmodel_print(stdout, cl.mo[i]); */
goto STOP;
}
}
fprintf (outfile, "\nGes. WS NACH %d.Reestimate:\n", iter);
ghmm_cluster_print_likelihood (outfile, &cl);
} /* if changes */
} /* while */
/*----------------------------------------------------------------------------*/
if (!ghmm_cluster_out (&cl, sq, outfile, out_filename)) {
GHMM_LOG_QUEUED(LCONVERTED);
goto STOP;
}
res = 0;
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
/* ...noch div. free! */
if (outfile)
fclose (outfile);
return (res);
# undef CUR_PROC
} /* ghmm_cluster_hmm */
/*============================================================================*/
int ghmm_cluster_update (cluster_t * cl, ghmm_dseq * sq)
{
#define CUR_PROC "ghmm_cluster_update"
int i, res = -1;
long *seq_counter;
ghmm_dseq *seq_t;
ARRAY_CALLOC (seq_counter, cl->mo_number);
/* Fix the number of associated sequences */
for (i = 0; i < sq->seq_number; i++)
seq_counter[sq->seq_label[i]]++;
/* allocate memory block for block */
for (i = 0; i < cl->mo_number; i++) {
if (cl->mo_seq[i]) {
/* Important: No ghmm_dseq_free here, otherwise are the original
sequences gone! */
ghmm_dseq_clean (cl->mo_seq[i]);
m_free (cl->mo_seq[i]);
}
cl->mo_seq[i] = ghmm_dseq_calloc (seq_counter[i]);
cl->mo_seq[i]->seq_number = 0; /* Counted later */
}
/* Set entries */
for (i = 0; i < sq->seq_number; i++) {
seq_t = cl->mo_seq[sq->seq_label[i]];
seq_t->seq_len[seq_t->seq_number] = sq->seq_len[i];
seq_t->seq[seq_t->seq_number] = sq->seq[i]; /* Pointer!!! */
seq_t->seq_label[seq_t->seq_number] = sq->seq_label[i];
seq_t->seq_number++;
}
res = 0;
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
m_free (seq_counter);
return (res);
# undef CUR_PROC
} /* ghmm_cluster_update */
/*============================================================================*/
void ghmm_cluster_print_likelihood (FILE * outfile, cluster_t * cl)
{
double ges_prob = 0.0, mo_prob;
int i;
for (i = 0; i < cl->mo_number; i++) {
mo_prob = ghmm_dmodel_likelihood (cl->mo[i], cl->mo_seq[i]);
ges_prob += mo_prob;
fprintf (outfile, "mo %d (#Seq. %ld): %.4f\n", i,
cl->mo_seq[i]->seq_number, mo_prob);
}
fprintf (outfile, "Summe: %.4f\n\n", ges_prob);
} /* ghmm_cluster_print_likelihood */
/*============================================================================*/
/* Prevents that empty models are sent out (no associated seqences) by
associating a random sequence. Since it's possible to produce an empty model
in this way, we swap the sequences until a nonempty model is produced. (This
could be a never-ending process and therefore it's only done 100 times). */
int ghmm_cluster_avoid_empty_model (long *seq_label, long seq_number,
int mo_number)
{
#define CUR_PROC "ghmm_cluster_avoid_empty_model"
int i;
long j = 0;
long *counter;
char error = 1, change = 0;
int iter = 0;
/* Initialization */
ARRAY_CALLOC (counter, mo_number);
for (i = 0; i < mo_number; i++)
counter[i] = 0;
for (i = 0; i < seq_number; i++)
counter[seq_label[i]]++;
while (error && iter < 100) {
iter++;
error = 0;
/* Do we have empty models ? */
for (i = 0; i < mo_number; i++) {
if (counter[i] == 0) {
change = 1;
/* Choose a random sequence for an empty model */
j = m_int (GHMM_RNG_UNIFORM (RNG) * (seq_number - 1));
/* The orginal model loses one sequence */
printf
("!!\"avoid_empty_model\":Verschiebe Seq. %ld: %ld --> %d !!\n", j,
seq_label[j], i);
counter[seq_label[j]]--;
counter[i] = 1;
seq_label[j] = i;
}
}
/* Is now everything OK ? */
if (change) {
for (i = 0; i < mo_number; i++) {
if (counter[i] <= 0) {
error = 1;
break;
}
}
}
} /* while */
STOP: /* Label STOP from ARRAY_[CM]ALLOC */
if (error)
return (-1);
else
return 0;
#undef CUR_PROC
} /* ghmm_cluster_avoid_empty_model */
/*============================================================================*/
int ghmm_cluster_out (cluster_t * cl, ghmm_dseq * sq, FILE * outfile,
char *out_filename)
{
#define CUR_PROC "ghmm_cluster_out"
int res = -1;
int i;
ghmm_cseq *sqd = NULL;
fprintf (outfile, "\nFinal Models:\n");
for (i = 0; i < cl->mo_number; i++)
ghmm_dmodel_print (outfile, cl->mo[i]);
res = 0;
ghmm_cseq_free (&sqd);
return (res);
#undef CUR_PROC
} /* ghmm_cluster_out */
/*============================================================================*/
long ghmm_cluster_update_label (long *oldlabel, long *seq_label, long seq_number)
{
long i, changes = 0;
for (i = 0; i < seq_number; i++)
if (oldlabel[i] != seq_label[i]) {
changes++;
oldlabel[i] = seq_label[i];
}
return changes;
} /* ghmm_cluster_update_label */
#endif /* GHMM_OBSOLETE */