/*******************************************************************************
*
* This file is part of the General Hidden Markov Model Library,
* GHMM version __VERSION__, see http://ghmm.org
*
* Filename: ghmm/ghmm/xmlreader.c
* Authors: Janne Grunau
*
* Copyright (C) 1998-2006 Alexander Schliep
* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
* Copyright (C) 2002-2006 Max-Planck-Institut fuer Molekulare Genetik,
* Berlin
*
* Contact: schliep@ghmm.org
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* This file is version $Revision: 2306 $
* from $Date: 2013-06-03 12:35:43 -0400 (Mon, 03 Jun 2013) $
* last change by $Author: ejb177 $
*
*******************************************************************************/
#ifdef HAVE_CONFIG_H
# include "../config.h"
#endif
#include <stdio.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <limits.h>
#include <libxml/xmlmemory.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include "ghmm.h"
#include "ghmm_internals.h"
#include "mes.h"
#include "mprintf.h"
#include "xmlreader.h"
#include "matrixop.h"
/* we should not need more than two alphabets, no plan to implement triple HMMs */
#define MAX_ALPHABETS 2
/* Bitmask to test the modeltype against to choose the type of the model pointer
we use in the union */
#define PTR_TYPE_MASK (GHMM_kDiscreteHMM + GHMM_kTransitionClasses + GHMM_kPairHMM + GHMM_kContinuousHMM)
/* holds all valid modeltypes sorted */
static int validModelTypes[] = {
(GHMM_kDiscreteHMM),
(GHMM_kDiscreteHMM + GHMM_kLeftRight),
(GHMM_kDiscreteHMM + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kTiedEmissions),
(GHMM_kDiscreteHMM + GHMM_kTiedEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kHigherOrderEmissions),
(GHMM_kDiscreteHMM + GHMM_kHigherOrderEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kHigherOrderEmissions + GHMM_kTiedEmissions),
(GHMM_kDiscreteHMM + GHMM_kHigherOrderEmissions + GHMM_kTiedEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kBackgroundDistributions),
(GHMM_kDiscreteHMM + GHMM_kBackgroundDistributions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kBackgroundDistributions + GHMM_kTiedEmissions),
(GHMM_kDiscreteHMM + GHMM_kBackgroundDistributions + GHMM_kTiedEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kBackgroundDistributions + GHMM_kHigherOrderEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kBackgroundDistributions + GHMM_kHigherOrderEmissions + GHMM_kTiedEmissions),
(GHMM_kDiscreteHMM + GHMM_kBackgroundDistributions + GHMM_kHigherOrderEmissions + GHMM_kTiedEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kTiedEmissions),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kTiedEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kHigherOrderEmissions),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kHigherOrderEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kHigherOrderEmissions + GHMM_kTiedEmissions),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kHigherOrderEmissions + GHMM_kTiedEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kBackgroundDistributions),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kBackgroundDistributions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kBackgroundDistributions + GHMM_kTiedEmissions),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kBackgroundDistributions + GHMM_kTiedEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kBackgroundDistributions + GHMM_kHigherOrderEmissions + GHMM_kTiedEmissions),
(GHMM_kDiscreteHMM + GHMM_kLabeledStates + GHMM_kBackgroundDistributions + GHMM_kHigherOrderEmissions + GHMM_kTiedEmissions + GHMM_kSilentStates),
(GHMM_kDiscreteHMM + GHMM_kTransitionClasses),
(GHMM_kContinuousHMM),
(GHMM_kContinuousHMM + GHMM_kTransitionClasses),
(GHMM_kContinuousHMM + GHMM_kMultivariate),
(GHMM_kContinuousHMM + GHMM_kMultivariate + GHMM_kTransitionClasses),
(GHMM_kPairHMM + GHMM_kDiscreteHMM),
(GHMM_kPairHMM + GHMM_kDiscreteHMM + GHMM_kTransitionClasses)
};
/*===========================================================================*/
static int getIntAttribute(xmlNodePtr node, const char *name, int *error) {
xmlChar *attr;
int value = -3894;
if ((attr = xmlGetProp(node, BAD_CAST name)) != NULL) {
value = atoi((char *)attr);
xmlFree(attr);
*error = 0;
} else {
*error = 1;
}
return value;
}
/*===========================================================================*/
static double getDoubleAttribute(xmlNodePtr node, const char *name,
int *error) {
xmlChar *attr;
double value = 0.0;
if ((attr = xmlGetProp(node, BAD_CAST name)) != NULL) {
value = atof((char *)attr);
xmlFree(attr);
*error = 0;
} else {
*error = 1;
}
return value;
}
/*===========================================================================*/
/* Caller owns return value */
static char * getXMLCharAttribute(xmlNodePtr node, const char *name,
int *error) {
xmlChar *attr;
if ((attr = xmlGetProp(node, BAD_CAST name)) != NULL) {
*error = 0;
return (char *)attr;
} else {
*error = 1;
return NULL;
}
}
/*===========================================================================*/
static int parseCSVList(const char * data, unsigned int size, double * array, int reverse) {
#define CUR_PROC "parseCSVList"
int retval=0;
int i;
char * * next, * estr;
double tmp;
ARRAY_CALLOC(next, 1);
for (i=0; i<size; i++) {
array[i] = strtod(data, next);
if (data == *next) {
estr = ighmm_mprintf(NULL, 0, "error in parsing CSV. entry %d of %d. (%s)", i, size, *next);
GHMM_LOG(LERROR, estr);
m_free(estr);
retval=-1;
break;
}
if (next)
data = *next+1;
else
break;
}
if (i != size) {
retval=-1;
estr = ighmm_mprintf(NULL, 0, "error in parsing CSV. sizes do not match (%d != %d)", i, size);
GHMM_LOG(LERROR, estr);
m_free(estr);
}
if (reverse) {
for (i=0; i<size/2; i++) {
tmp = array[i];
array[i] = array[size-i-1];
array[size-i-1] = tmp;
}
}
STOP:
m_free(next);
return retval;
#undef CUR_PROC
}
/*===========================================================================*/
static int matchModelType(const char * data, unsigned int size) {
#define CUR_PROC "matchModelType"
if (!strncmp(data, "left-right", size))
return GHMM_kLeftRight;
if (!strncmp(data, "silent", size))
return GHMM_kSilentStates;
if (!strncmp(data, "tied", size))
return GHMM_kTiedEmissions;
if (!strncmp(data, "higher-order", size))
return GHMM_kHigherOrderEmissions;
if (!strncmp(data, "background", size))
return GHMM_kBackgroundDistributions;
if (!strncmp(data, "labeled", size))
return GHMM_kLabeledStates;
if (!strncmp(data, "transition-classes", size))
return GHMM_kTransitionClasses;
if (!strncmp(data, "discrete", size))
return GHMM_kDiscreteHMM;
if (!strncmp(data, "continuous", size))
return GHMM_kContinuousHMM;
if (!strncmp(data, "pair", size))
return GHMM_kPairHMM;
if (!strncmp(data, "multivariate", size))
return GHMM_kMultivariate;
return INT_MIN;
#undef CUR_PROC
}
/*===========================================================================*/
static int parseModelType(const char * data, unsigned int size) {
#define CUR_PROC "parseModelType"
int i, noValidMo, modelType=0;
const char * end = data;
char * str;
while ((end = strchr(data, ' '))) {
modelType += matchModelType(data, end-data);
size -= (end-data)+1;
data = end+1;
}
modelType += matchModelType(data, size);
noValidMo = sizeof(validModelTypes)/sizeof(validModelTypes[0]);
for (i=0; i<noValidMo; i++) {
if (modelType == validModelTypes[i])
break;
}
if (i == noValidMo) {
str = ighmm_mprintf(NULL, 0, "%d is no known valid model type", modelType);
GHMM_LOG(LERROR, str);
m_free(str);
return -1;
}
return modelType;
#undef CUR_PROC
}
/*===========================================================================*/
static ghmm_alphabet * parseAlphabet(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f) {
#define CUR_PROC "parseAlphabet"
char * str;
int M, code, error;
xmlNodePtr symbol;
ghmm_alphabet * alfa;
ARRAY_CALLOC(alfa, 1);
symbol = cur->children;
M=0;
while (symbol!=NULL) {
if ((!xmlStrcmp(symbol->name, BAD_CAST "symbol"))) {
code = getIntAttribute(symbol, "code", &error);
if (error || code!=M) {
str = ighmm_mprintf(NULL, 0, "non consecutive code %d == %d", code, M);
GHMM_LOG(LERROR, str);
m_free(str);
goto STOP;
} else
M++;
}
symbol=symbol->next;
}
alfa->size = M;
/*printf("Parsing alphabet with %d symbols\n", alfa->size);*/
ARRAY_MALLOC(alfa->symbols, M);
symbol = cur->children;
M=0;
while (symbol!=NULL) {
if ((!xmlStrcmp(symbol->name, BAD_CAST "symbol"))) {
alfa->symbols[M++] = (char *)xmlNodeGetContent(symbol);
/*printf("%d. symbol: %s\n", M, alfa->symbols[M-1]);*/
}
symbol=symbol->next;
}
return alfa;
STOP:
m_free(alfa->symbols);
m_free(alfa)
return NULL;
#undef CUR_PROC
}
/*===========================================================================*/
static int parseBackground(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f, int modelNo) {
#define CUR_PROC "parseBackground"
int error, order;
int bgNr, rev;
double *b = NULL;
char *s = NULL;
assert(f->modelType & GHMM_kDiscreteHMM);
bgNr = f->model.d[modelNo]->bp->n++;
/* get order */
order = getIntAttribute(cur, "order", &error);
if (error)
order=0;
else if (order && !(f->modelType & GHMM_kHigherOrderEmissions)) {
GHMM_LOG(LERROR, "background distribution has order > 0, but model is not higher order");
goto STOP;
}
f->model.d[modelNo]->bp->order[bgNr] = order;
/* get name */
s = (char *)getXMLCharAttribute(cur, "key", &error);
f->model.d[modelNo]->bp->name[bgNr] = s;
rev = getIntAttribute(cur, "rev", &error);
if (error)
rev = 0;
/* get distribution */
s = (char *)xmlNodeGetContent(cur);
ARRAY_MALLOC(b, pow(f->model.d[modelNo]->bp->m, order+1));
if (-1 != parseCSVList(s, pow(f->model.d[modelNo]->bp->m, order+1), b, rev))
f->model.d[modelNo]->bp->b[bgNr] = b;
else {
GHMM_LOG(LERROR, "Can not parse background CSV list.");
goto STOP;
}
free(s);
return 0;
STOP:
m_free(b);
free(s);
return -1;
#undef CUR_PROC
}
/*===========================================================================*/
static int parseState(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f, int * inDegree, int * outDegree, int modelNo) {
#define CUR_PROC "parseState"
int i, error, order=0, state=-1442, fixed=-985, tied=-9354, M, aprox, label;
int curX=0, curY=0;
double pi, prior;
double *emissions = NULL;
char *desc = NULL;
char *s = NULL, *estr;
int rev, stateFixed=1;
ghmm_cstate *newcstate;
ghmm_c_emission *emission;
xmlNodePtr elem, child, multichild;
state = getIntAttribute(cur, "id", &error);
pi = getDoubleAttribute(cur, "initial", &error);
if (error) {
estr = ighmm_mprintf(NULL, 0, "can't read required intial probability for"
"state %d", state);
GHMM_LOG(LERROR, estr);
goto STOP;
} else
desc = xmlGetProp(cur, BAD_CAST "desc");
elem = cur->children;
while (elem!=NULL) {
/* ======== silent state ============================================== */
if ((!xmlStrcmp(elem->name, BAD_CAST "silent"))) {
switch (f->modelType & PTR_TYPE_MASK) {
case (GHMM_kDiscreteHMM):
f->model.d[modelNo]->silent[state] = 1;
break;
case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
f->model.ds[modelNo]->silent[state] = 1;
break;
case (GHMM_kDiscreteHMM+GHMM_kPairHMM):
case (GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses):
f->model.dp[modelNo]->silent[state] = 1;
break;
default:
GHMM_LOG(LERROR, "invalid modelType");
goto STOP;
}
}
/* ======== discrete state (possible higher order) ==================== */
if ((!xmlStrcmp(elem->name, BAD_CAST "discrete"))) {
assert((f->modelType & GHMM_kDiscreteHMM) && ((f->modelType & GHMM_kPairHMM) == 0));
/* fixed is a propety of the distribution and optional */
fixed = getIntAttribute(elem, "fixed", &error);
if (error)
fixed = 0;
/* order is optional for discrete */
if (f->modelType & GHMM_kHigherOrderEmissions) {
order = getIntAttribute(elem, "order", &error);
if (error)
order = 0;
}
rev = getIntAttribute(cur, "rev", &error);
if (error)
rev = 0;
/* parsing emission probabilities */
s = (char *)xmlNodeGetContent(elem);
switch (f->modelType & PTR_TYPE_MASK) {
case (GHMM_kDiscreteHMM):
f->model.d[modelNo]->s[state].desc = desc;
f->model.d[modelNo]->s[state].pi = pi;
f->model.d[modelNo]->s[state].fix = fixed;
if (f->modelType & GHMM_kHigherOrderEmissions) {
f->model.d[modelNo]->order[state] = order;
if (f->model.d[modelNo]->maxorder < order) {
f->model.d[modelNo]->maxorder = order;
estr = ighmm_mprintf(NULL, 0, "Updated maxorder to %d\n",
f->model.d[modelNo]->maxorder);
GHMM_LOG(LDEBUG, estr);
m_free(estr);
}
}
ARRAY_MALLOC(emissions, pow(f->model.d[modelNo]->M, order+1));
parseCSVList(s, pow(f->model.d[modelNo]->M, order+1), emissions, rev);
free(f->model.d[modelNo]->s[state].b);
f->model.d[modelNo]->s[state].b = emissions;
break;
case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
f->model.ds[modelNo]->s[state].desc = desc;
f->model.ds[modelNo]->s[state].pi = pi;
f->model.ds[modelNo]->s[state].fix = fixed;
if (f->modelType & GHMM_kHigherOrderEmissions)
f->model.ds[modelNo]->order[state] = order;
ARRAY_MALLOC(emissions, pow(f->model.ds[modelNo]->M, order+1));
parseCSVList(s, pow(f->model.ds[modelNo]->M, order+1), emissions, rev);
f->model.ds[modelNo]->s[state].b = emissions;
break;
default:
GHMM_LOG(LERROR, "invalid modelType");
goto STOP;
}
m_free(s);
}
/* ======== continuous state ========================================== */
if ((!xmlStrcmp(elem->name, BAD_CAST "mixture"))) {
assert(f->modelType & GHMM_kContinuousHMM);
M = 0;
child = elem->children;
while (child != NULL) {
if ((!xmlStrcmp(child->name, BAD_CAST "normal")) ||
(!xmlStrcmp(child->name, BAD_CAST "normalTruncatedLeft")) ||
(!xmlStrcmp(child->name, BAD_CAST "normalTruncatedRight")) ||
(!xmlStrcmp(child->name, BAD_CAST "multinormal")) ||
(!xmlStrcmp(child->name, BAD_CAST "uniform"))){
M ++;
}
child = child->next;
}
ghmm_cstate_alloc(f->model.c[modelNo]->s + state, M, inDegree[state], outDegree[state], f->model.c[modelNo]->cos);
newcstate = f->model.c[modelNo]->s + state;
newcstate->desc = desc;
newcstate->M = M;
newcstate->pi = pi;
if( f->model.c[modelNo]->M < M)
f->model.c[modelNo]->M = M;
child = elem->children;
i = 0;
while (child != NULL) {
emission = newcstate->e+i;
/* common attributes */
if ((!xmlStrcmp(child->name, BAD_CAST "normal")) ||
(!xmlStrcmp(child->name, BAD_CAST "normalTruncatedLeft")) ||
(!xmlStrcmp(child->name, BAD_CAST "normalTruncatedRight")) ||
(!xmlStrcmp(child->name, BAD_CAST "multinormal")) ||
(!xmlStrcmp(child->name, BAD_CAST "uniform"))){
fixed = getIntAttribute(child, "fixed", &error);
if (error)
fixed = 0;
stateFixed = fixed && stateFixed;
/* allocate emission */
emission->fixed = fixed;
prior = getDoubleAttribute(child, "prior", &error);
if (error)
prior = 1.0;
newcstate->c[i] = prior;
}
/* child is not a density, continue with the next child */
else {
child = child->next;
continue;
}
/* density type dependent attributes */
if ((!xmlStrcmp(child->name, BAD_CAST "normal"))) {
emission->mean.val = getDoubleAttribute(child, "mean", &error);
emission->variance.val = getDoubleAttribute(child, "variance", &error);
/* should the normal distribution be approximated? */
aprox = getIntAttribute(child, "approx", &error);
if (error)
aprox = 0;
emission->type = aprox ? normal_approx : normal;
emission->dimension = 1;
if (f->model.c[modelNo]->dim > 1) {
GHMM_LOG(LERROR, "All emissions must have same dimension.");
goto STOP;
}
}
if ((!xmlStrcmp(child->name, BAD_CAST "normalLeftTail"))) {
emission->mean.val = getDoubleAttribute(child, "mean", &error);
emission->variance.val = getDoubleAttribute(child, "variance", &error);
emission->min = getDoubleAttribute(child, "max", &error);
emission->type = normal_left;
emission->dimension = 1;
if (f->model.c[modelNo]->dim > 1) {
GHMM_LOG(LERROR, "All emissions must have same dimension.");
goto STOP;
}
}
if ((!xmlStrcmp(child->name, BAD_CAST "normalRightTail"))) {
emission->mean.val = getDoubleAttribute(child, "mean", &error);
emission->variance.val = getDoubleAttribute(child, "variance", &error);
emission->max = getDoubleAttribute(child, "min", &error);
emission->type = normal_right;
emission->dimension = 1;
if (f->model.c[modelNo]->dim > 1) {
GHMM_LOG(LERROR, "All emissions must have same dimension.");
goto STOP;
}
}
if ((!xmlStrcmp(child->name, BAD_CAST "uniform"))) {
emission->max = getDoubleAttribute(child, "max", &error);
emission->min = getDoubleAttribute(child, "min", &error);
emission->type = uniform;
emission->dimension = 1;
if (f->model.c[modelNo]->dim > 1) {
GHMM_LOG(LERROR, "All emissions must have same dimension.");
goto STOP;
}
}
if ((!xmlStrcmp(child->name, BAD_CAST "multinormal"))) {
emission->type = multinormal;
emission->dimension = getIntAttribute(child, "dimension", &error);
/* check that all emissions in all states have same dimension or
set when first emission is read*/
if (f->model.c[modelNo]->dim <= 1)
f->model.c[modelNo]->dim = emission->dimension;
else if (f->model.c[modelNo]->dim != emission->dimension) {
GHMM_LOG(LERROR, "All emissions must have same dimension.");
goto STOP;
}
if (0 != ghmm_c_emission_alloc(emission, emission->dimension)) {
GHMM_LOG(LERROR, "Can not allocate multinormal emission.");
goto STOP;
}
multichild = child->children;
while (multichild != NULL) {
if ((!xmlStrcmp(multichild->name, BAD_CAST "mean"))) {
s = (char *)xmlNodeGetContent(multichild);
if (-1 == parseCSVList(s, emission->dimension, emission->mean.vec, 0)) {
GHMM_LOG(LERROR, "Can not parse mean CSV list.");
goto STOP;
}
}
if ((!xmlStrcmp(multichild->name, BAD_CAST "variance"))) {
s = (char *)xmlNodeGetContent(multichild);
if (-1 == parseCSVList(s, emission->dimension * emission->dimension,
emission->variance.mat, 0)) {
GHMM_LOG(LERROR, "Can not parse variance CSV list.");
goto STOP;
}
if (0 != ighmm_invert_det(emission->sigmainv, &emission->det,
emission->dimension, emission->variance.mat))
{
GHMM_LOG(LERROR, "Can not calculate inverse of covariance matrix.");
goto STOP;
}
if (0 != ighmm_cholesky_decomposition(emission->sigmacd,
emission->dimension,
emission->variance.mat))
{
GHMM_LOG(LERROR, "Can not calculate cholesky decomposition of covariance matrix.");
goto STOP;
}
}
multichild = multichild->next;
}
}
i++;
child = child->next;
}
newcstate->fix = stateFixed;
}
/* ======== pair hmm state ============================================ */
if ((!xmlStrcmp(elem->name, BAD_CAST "pair"))) {
}
/* -------- background name ------------------------------------------ */
if ((!xmlStrcmp(elem->name, BAD_CAST "backgroundKey"))) {
assert(f->modelType & GHMM_kBackgroundDistributions);
s = (char *)xmlNodeGetContent(elem);
for (i=0; i<f->model.d[modelNo]->bp->n; i++) {
if (0 == strcmp(s, f->model.d[modelNo]->bp->name[i])) {
if (order != f->model.d[modelNo]->bp->order[i]) {
estr = ighmm_mprintf(NULL, 0, "order of background %s and state %d"
" does not match",
f->model.d[modelNo]->bp->name[i], state);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
} else {
f->model.d[modelNo]->background_id[state] = i;
break;
}
}
}
if (i == f->model.d[modelNo]->bp->n) {
estr = ighmm_mprintf(NULL, 0, "can't find background with name %s in"
" state %d", s, state);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
}
m_free(s);
}
/* -------- tied to --------------------------------------------------- */
if ((!xmlStrcmp(elem->name, BAD_CAST "class"))) {
assert(f->modelType & GHMM_kLabeledStates);
s = (char *)xmlNodeGetContent(elem);
label = atoi(s);
m_free(s);
if ((f->modelType & PTR_TYPE_MASK) == GHMM_kDiscreteHMM) {
if (f->model.d[modelNo]->label_alphabet->size > label)
f->model.d[modelNo]->label[state] = label;
else
GHMM_LOG(LWARN, "Invalid label");
}
}
/* -------- tied to --------------------------------------------------- */
if ((!xmlStrcmp(elem->name, BAD_CAST "tiedTo"))) {
assert(f->modelType & GHMM_kTiedEmissions);
s = (char *)xmlNodeGetContent(elem);
tied = atoi(s);
if (state>=tied) {
f->model.d[modelNo]->tied_to[state] = tied;
if (f->model.d[modelNo]->tied_to[tied] != tied) {
estr = ighmm_mprintf(NULL, 0, "state %d not tied to tie group leader", state);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
}
} else {
estr = ighmm_mprintf(NULL, 0, "state %d tiedTo (%d) is invalid", state, tied);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
}
m_free(s);
}
/* -------- position for graphical editing ---------------------------- */
if ((!xmlStrcmp(elem->name, BAD_CAST "position"))) {
curX = getIntAttribute(elem, "x", &error);
if (error)
GHMM_LOG(LWARN, "failed to read x position");
curY = getIntAttribute(elem, "y", &error);
if (error)
GHMM_LOG(LWARN, "failed to read y position");
switch (f->modelType & PTR_TYPE_MASK) {
case GHMM_kDiscreteHMM:
f->model.d[modelNo]->s[state].xPosition = curX;
f->model.d[modelNo]->s[state].yPosition = curY;
break;
case GHMM_kDiscreteHMM+GHMM_kTransitionClasses:
f->model.ds[modelNo]->s[state].xPosition = curX;
f->model.ds[modelNo]->s[state].yPosition = curY;
break;
case GHMM_kDiscreteHMM+GHMM_kPairHMM:
case GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses:
f->model.dp[modelNo]->s[state].xPosition = curX;
f->model.dp[modelNo]->s[state].yPosition = curY;
break;
case GHMM_kContinuousHMM:
case GHMM_kContinuousHMM+GHMM_kTransitionClasses:
case (GHMM_kContinuousHMM+GHMM_kMultivariate):
case (GHMM_kContinuousHMM+GHMM_kMultivariate+GHMM_kTransitionClasses):
f->model.c[modelNo]->s[state].xPosition = curX;
f->model.c[modelNo]->s[state].yPosition = curY;
break;
default:
GHMM_LOG(LERROR, "invalid modelType");
goto STOP;
}
}
elem = elem->next;
}
return 0;
STOP:
m_free(s);
m_free(desc);
m_free(emissions)
return -1;
#undef CUR_PROC
}
/*===========================================================================*/
static int parseSingleTransition(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f,
int modelNo) {
#define CUR_PROC "parseTransition"
int retval=-1;
int source, target, error;
int in_state, out_state;
double p;
char * s;
xmlNodePtr elem;
assert((f->modelType & GHMM_kTransitionClasses) == 0);
source = getIntAttribute(cur, "source", &error);
target = getIntAttribute(cur, "target", &error);
elem = cur->children;
while (elem!=NULL) {
if ((!xmlStrcmp(elem->name, BAD_CAST "probability"))) {
s = (char *)xmlNodeGetContent(elem);
p = atof(s);
m_free(s)
break;
}
elem = elem->next;
}
switch (f->modelType & PTR_TYPE_MASK) {
case GHMM_kDiscreteHMM:
out_state = f->model.d[modelNo]->s[source].out_states++;
in_state = f->model.d[modelNo]->s[target].in_states++;
f->model.d[modelNo]->s[source].out_id[out_state] = target;
f->model.d[modelNo]->s[source].out_a[out_state] = p;
f->model.d[modelNo]->s[target].in_id[in_state] = source;
f->model.d[modelNo]->s[target].in_a[in_state] = p;
break;
case (GHMM_kDiscreteHMM+GHMM_kPairHMM):
out_state = f->model.dp[modelNo]->s[source].out_states++;
in_state = f->model.dp[modelNo]->s[target].in_states++;
f->model.dp[modelNo]->s[source].out_id[out_state] = target;
f->model.dp[modelNo]->s[source].out_a[out_state][0] = p;
f->model.dp[modelNo]->s[target].in_id[in_state] = source;
f->model.dp[modelNo]->s[target].in_a[in_state][0] = p;
break;
case GHMM_kContinuousHMM:
case (GHMM_kContinuousHMM+GHMM_kMultivariate):
out_state = f->model.c[modelNo]->s[source].out_states++;
in_state = f->model.c[modelNo]->s[target].in_states++;
f->model.c[modelNo]->s[source].out_id[out_state] = target;
f->model.c[modelNo]->s[source].out_a[0][out_state] = p;
f->model.c[modelNo]->s[target].in_id[in_state] = source;
f->model.c[modelNo]->s[target].in_a[0][in_state] = p;
break;
default:
GHMM_LOG(LERROR, "invalid modelType");
goto STOP;
}
retval = 0;
STOP:
return retval;
#undef CUR_PROC
}
/*===========================================================================*/
static int parseMultipleTransition(xmlDocPtr doc, xmlNodePtr cur,
ghmm_xmlfile* f, int modelNo,
int nrTransitionClasses) {
#define CUR_PROC "parseMultipleTransition"
int i, retval=-1;
int source, target, error;
int in_state, out_state;
double * probs;
char * s;
xmlNodePtr elem;
assert(f->modelType & GHMM_kTransitionClasses);
source = getIntAttribute(cur, "source", &error);
target = getIntAttribute(cur, "target", &error);
elem = cur->children;
while (elem!=NULL) {
if ((!xmlStrcmp(elem->name, BAD_CAST "probability"))) {
s = (char *)xmlNodeGetContent(elem);
ARRAY_MALLOC(probs, nrTransitionClasses);
parseCSVList(s, nrTransitionClasses, probs, 0);
m_free(s);
break;
}
elem = elem->next;
}
switch (f->modelType & PTR_TYPE_MASK) {
case (GHMM_kDiscreteHMM + GHMM_kTransitionClasses):
out_state = f->model.ds[modelNo]->s[source].out_states++;
in_state = f->model.ds[modelNo]->s[target].in_states++;
f->model.ds[modelNo]->s[source].out_id[out_state] = target;
/* f->model.ds[modelNo]->s[source].out_a[out_state] = probs; */
f->model.ds[modelNo]->s[target].in_id[in_state] = source;
/* f->model.ds[modelNo]->s[target].in_a[in_state] = probs; */
break;
case (GHMM_kDiscreteHMM + GHMM_kPairHMM + GHMM_kTransitionClasses):
out_state = f->model.dp[modelNo]->s[source].out_states++;
in_state = f->model.dp[modelNo]->s[target].in_states++;
f->model.dp[modelNo]->s[source].out_id[out_state] = target;
f->model.dp[modelNo]->s[source].out_a[out_state] = probs;
f->model.dp[modelNo]->s[target].in_id[in_state] = source;
f->model.dp[modelNo]->s[target].in_a[in_state] = probs;
break;
case (GHMM_kContinuousHMM + GHMM_kTransitionClasses):
case (GHMM_kContinuousHMM+GHMM_kMultivariate+GHMM_kTransitionClasses):
out_state = f->model.c[modelNo]->s[source].out_states++;
in_state = f->model.c[modelNo]->s[target].in_states++;
f->model.c[modelNo]->s[source].out_id[out_state] = target;
f->model.c[modelNo]->s[target].in_id[in_state] = source;
for (i=0; i<nrTransitionClasses; i++) {
f->model.c[modelNo]->s[source].out_a[i][out_state] = probs[i];
f->model.c[modelNo]->s[target].in_a[i][in_state] = probs[i];
}
break;
default:
GHMM_LOG(LERROR, "invalid modelType");
goto STOP;
}
retval = 0;
STOP:
m_free(probs);
return retval;
#undef CUR_PROC
}
/*===========================================================================*/
static int parseHMM(ghmm_xmlfile* f, xmlDocPtr doc, xmlNodePtr cur, int modelNo) {
#define CUR_PROC "parseHMM"
char * estr;
xmlNodePtr child;
int i, id, error;
int source, target;
int N = 0;
int nrBackgrounds=0, M=-1;
int * inDegree = NULL;
int * outDegree = NULL;
int cos;
float prior;
int modeltype=0;
char * mt;
char *modelname;
int * bg_orders = NULL;
double * * bg_ptr = NULL;
ghmm_alphabet * alfa;
ghmm_alphabet * * alphabets=NULL;
int nrAlphabets=0;
child = cur->children;
/*xmlElemDump(stdout, doc, cur);*/
/* parse HMM for counting */
GHMM_LOG(LINFO, "parseHMM to count ");
while (child != NULL) {
/* ========== ALPHABETS ================================================ */
if ((!xmlStrcmp(child->name, BAD_CAST "alphabet"))) {
if (!alphabets)
ARRAY_MALLOC(alphabets, MAX_ALPHABETS);
alfa = parseAlphabet(doc, child, f);
if (alfa && nrAlphabets<MAX_ALPHABETS) {
alphabets[nrAlphabets++] = alfa;
} else {
GHMM_LOG(LERROR, "Error in parsing alphabets.");
goto STOP;
}
}
/* ========== NODES ================================================== */
if ((!xmlStrcmp(child->name, BAD_CAST "state"))) {
id = getIntAttribute(child, "id", &error);
if (error || id!=N) {
GHMM_LOG(LERROR, "non consecutive node ids");
goto STOP;
}
N++;
}
/* ========== EDGES ================================================== */
if ((!xmlStrcmp(child->name, BAD_CAST "transition"))) {
if (inDegree == NULL) {
ARRAY_CALLOC(inDegree, N);
ARRAY_CALLOC(outDegree, N);
}
source = getIntAttribute(child, "source", &error);
if (error || source<0 || source>N) {
estr = ighmm_mprintf(NULL, 0, "source (%d) node not existing (%d)",
source, error);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
}
target = getIntAttribute(child, "target", &error);
if (error || target<0 || target>N) {
estr = ighmm_mprintf(NULL, 0, "target (%d) node not existing (%d)",
target, error);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
}
inDegree[target]++;
outDegree[source]++;
}
/* ========== BACKGROUND DISTRIBUTIONS ================================ */
if ((!xmlStrcmp(child->name, BAD_CAST "background")))
nrBackgrounds++;
child = child->next;
}
/* allocate zero degree count in the case of a HMM without transitions */
if (inDegree == NULL) {
ARRAY_CALLOC(inDegree, N);
ARRAY_CALLOC(outDegree, N);
}
estr = ighmm_mprintf(NULL, 0, "Found HMM with %d states\n", N);
GHMM_LOG(LDEBUG, estr);
m_free(estr);
for (i=0; i<N; i++) {
estr = ighmm_mprintf(NULL, 0, " %d\t%d\n", inDegree[i], outDegree[i]);
GHMM_LOG(LDEBUG, estr);
m_free(estr);
}
/* starting real parsing */
modelname = xmlGetProp(cur, BAD_CAST "name");
mt = getXMLCharAttribute(cur, "type", &error);
modeltype = parseModelType(mt, strlen(mt));
free(mt);
f->modelType = modeltype;
/* reading common optional atribute prior, 1.0 if not defined */
prior = getDoubleAttribute(cur, "prior", &error);
if (error)
prior = 1.0;
/* reading common optional atribute cos, 1 if not defined */
cos = getIntAttribute(cur, "transitionClasses", &error);
if (error)
cos = 1;
/* if first model, initialize model structures */
if (modelNo == 0){
switch (f->modelType & PTR_TYPE_MASK) {
case GHMM_kDiscreteHMM:
ARRAY_CALLOC(f->model.d, f->noModels);
break;
case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
ARRAY_CALLOC(f->model.ds, f->noModels);
break;
case (GHMM_kDiscreteHMM+GHMM_kPairHMM):
case (GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses):
ARRAY_CALLOC(f->model.dp, f->noModels);
break;
case GHMM_kContinuousHMM:
case (GHMM_kContinuousHMM+GHMM_kMultivariate):
case (GHMM_kContinuousHMM+GHMM_kTransitionClasses):
case (GHMM_kContinuousHMM+GHMM_kMultivariate+GHMM_kTransitionClasses):
ARRAY_CALLOC(f->model.c, f->noModels);
break;
break;
default:
GHMM_LOG(LERROR, "invalid modelType");
goto STOP;
}
}
/* allocating the different models */
switch (f->modelType & PTR_TYPE_MASK) {
case GHMM_kDiscreteHMM:
assert(nrAlphabets == 1);
M = alphabets[0]->size;
f->model.d[modelNo] = ghmm_dmodel_calloc(M, N, modeltype, inDegree,
outDegree);
f->model.d[modelNo]->alphabet = alphabets[0];
f->model.d[modelNo]->prior = prior;
f->model.d[modelNo]->name = modelname;
break;
case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
assert(nrAlphabets == 1);
M = alphabets[0]->size;
f->model.ds[modelNo] = ghmm_dsmodel_calloc(M, N, modeltype, cos, inDegree,
outDegree);
f->model.ds[modelNo]->alphabet = alphabets[0];
f->model.ds[modelNo]->prior = prior;
f->model.ds[modelNo]->name = modelname;
f->model.ds[modelNo]->cos = cos;
break;
/* XXX case (GHMM_kDiscreteHMM+GHMM_kPairHMM):
case (GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses):
f->model.dp[modelNo] = NULL;
break; */
case GHMM_kContinuousHMM:
case (GHMM_kContinuousHMM+GHMM_kMultivariate):
case (GHMM_kContinuousHMM+GHMM_kTransitionClasses):
case (GHMM_kContinuousHMM+GHMM_kMultivariate+GHMM_kTransitionClasses):
f->model.c[modelNo] = ghmm_cmodel_calloc(N,modeltype);
f->model.c[modelNo]->prior = prior;
f->model.c[modelNo]->name = modelname;
f->model.c[modelNo]->cos = cos;
f->model.c[modelNo]->dim = 1;
break;
default:
GHMM_LOG(LERROR, "invalid or unimplemented model type");
goto STOP;
}
free(alphabets);
/* allocating background distributions for approtiate models */
if (modeltype & GHMM_kBackgroundDistributions) {
switch (f->modelType & PTR_TYPE_MASK) {
case GHMM_kDiscreteHMM:
ARRAY_CALLOC(bg_orders, nrBackgrounds);
ARRAY_CALLOC(bg_ptr, nrBackgrounds);
f->model.d[modelNo]->bp = ghmm_dbackground_alloc(nrBackgrounds, M,
bg_orders, bg_ptr);
f->model.d[modelNo]->bp->n = 0;
break;
default:
GHMM_LOG(LERROR, "invalid modelType");
goto STOP;
}
}
child = cur->xmlChildrenNode;
/* parse HMM for real */
while (child != NULL) {
/* ========== LABEL ALPHABETS ========================================== */
if ((!xmlStrcmp(child->name, BAD_CAST "classAlphabet"))) {
alfa = parseAlphabet(doc, child, f);
if (alfa) {
f->model.d[modelNo]->label_alphabet = alfa;
} else {
GHMM_LOG(LERROR, "Error in parsing alphabets.");
goto STOP;
}
}
if ((!xmlStrcmp(child->name, BAD_CAST "background"))) {
if (modeltype & GHMM_kBackgroundDistributions) {
parseBackground(doc, child, f, modelNo);
} else {
GHMM_LOG(LWARN, "Ignoring background distribution.");
}
}
if ((!xmlStrcmp(child->name, BAD_CAST "state"))) {
parseState(doc, child, f, inDegree, outDegree, modelNo);
}
if ((!xmlStrcmp(child->name, BAD_CAST "transition"))) {
if (modeltype & GHMM_kTransitionClasses)
parseMultipleTransition(doc, child, f, modelNo, cos);
else
parseSingleTransition(doc, child, f, modelNo);
}
child = child->next;
}
if (modeltype & GHMM_kHigherOrderEmissions) {
ARRAY_MALLOC(f->model.d[modelNo]->pow_lookup, f->model.d[modelNo]->maxorder+2);
f->model.d[modelNo]->pow_lookup[0] = 1;
for (i=1; i < f->model.d[modelNo]->maxorder+2; ++i)
f->model.d[modelNo]->pow_lookup[i] = f->model.d[modelNo]->M * f->model.d[modelNo]->pow_lookup[i-1];
}
/* freeing temporary data */
m_free(inDegree);
m_free(outDegree);
return 0;
STOP:
free(inDegree);
free(outDegree);
free(modelname);
free(bg_orders);
free(bg_ptr);
free(alphabets);
free(f);
return -1;
#undef CUR_PROC
}
/*===========================================================================*/
ghmm_xmlfile* ghmm_xmlfile_parse(const char *filename) {
#define CUR_PROC "ghmm_xmlfile_parse"
xmlParserCtxtPtr ctxt; /* the parser context */
xmlDocPtr doc; /* the resulting document tree */
xmlNodePtr cur, child;
int modelNo = 0;
int error;
char * estr;
ghmm_xmlfile* filedata = NULL;
/* validate the document */
if (!ghmm_xmlfile_validate(filename)) {
estr = ighmm_mprintf(NULL, 0, "Failed to validate document %s", filename);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
}
/* create a parser context */
ctxt = xmlNewParserCtxt();
if (ctxt == NULL) {
GHMM_LOG(LERROR, "Failed to allocate parser context");
goto STOP;
}
/* parse the file, activating the DTD validation option */
doc = xmlCtxtReadFile(ctxt, filename, NULL, 0);
/* check if parsing suceeded */
if (doc == NULL) {
estr = ighmm_mprintf(NULL, 0, "Failed to parse %s", filename);
GHMM_LOG(LERROR, estr);
m_free(estr);
} else {
/* checking the root node, creating the file structure and
iteration over all HMMs */
cur = xmlDocGetRootElement(doc);
/* file contains a mixture of HMMs */
if ((!xmlStrcmp(cur->name, BAD_CAST "mixture"))) {
ARRAY_CALLOC(filedata, 1);
filedata->noModels = getIntAttribute(cur, "noComponents", &error);
child = cur->children;
while (child!=NULL) {
if ((!xmlStrcmp(child->name, BAD_CAST "HMM"))) {
if (modelNo >= filedata->noModels) {
estr = ighmm_mprintf(NULL, 0, "The mixture has more models than"
" defined, ignoring all following HMMs (%d/%d)",
modelNo, filedata->noModels);
GHMM_LOG(LWARN, estr);
m_free(estr);
break;
} else {
if (parseHMM(filedata, doc, child, modelNo)) {
estr = ighmm_mprintf(NULL, 0, "could not parse model no. %d", modelNo);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
}
modelNo++;
}
}
child=child->next;
}
if (modelNo < filedata->noModels){
GHMM_LOG(LERROR, "The mixture has less models than defined");
goto STOP;
}
/* only single hmm in file */
} else if (!xmlStrcmp(cur->name, BAD_CAST "HMM")) {
ARRAY_CALLOC(filedata, 1);
filedata->noModels = 1;
if (parseHMM(filedata, doc, cur, 0)) {
GHMM_LOG(LERROR, "could not parse the hidden markov model");
goto STOP;
}
/* invalid root entry */
} else {
estr = ighmm_mprintf(NULL, 0, "The file does not contains the appropriate root %s", filename);
GHMM_LOG(LERROR, estr);
m_free(estr);
}
}
/* free up the resulting document */
xmlFreeDoc(doc);
/* free up the parser context */
xmlFreeParserCtxt(ctxt);
return filedata;
STOP:
return NULL;
#undef CUR_PROC
}
/*===========================================================================*/
static void silence(void* x, const char* y, ...) {return;}
/*===========================================================================*/
static int validateFixedDTD(const char* filename) {
#define CUR_PROC "validateFixedDTD"
#ifndef DTD_LOC
#define DTD_LOC "/usr/share/ghmm/ghmm.dtd.1.0"
#endif
const char fileDTD[] = DTD_LOC;
char * estr;
int retval = 0;
xmlDtdPtr dtd = NULL;
xmlDocPtr doc = NULL;
xmlValidCtxtPtr cvp = NULL;
if (filename != NULL && fileDTD != NULL) {
dtd = xmlParseDTD(NULL, (const xmlChar *)fileDTD);
if (dtd == NULL) {
estr = ighmm_mprintf(NULL, 0, "Could not parse DTD %s.", fileDTD);
GHMM_LOG(LDEBUG, estr);
m_free(estr);
goto STOP;
}
doc = xmlReadFile(filename, NULL, 0);
if (doc == NULL) {
estr = ighmm_mprintf(NULL, 0, "Could not parse document %s.", filename);
GHMM_LOG(LERROR, estr);
m_free(estr);
goto STOP;
}
if ((cvp = xmlNewValidCtxt()) == NULL) {
GHMM_LOG(LERROR, "Couldn't allocate validation context\n");
goto STOP;
}
/* set error and warning functions to NULL to make validation silent */
cvp->error = (xmlValidityErrorFunc) silence;
cvp->warning = (xmlValidityWarningFunc) silence;
/* check if validation suceeded */
if (xmlValidateDtd(cvp, doc, dtd)) {
retval = 1;
} else {
estr = ighmm_mprintf(NULL, 0, "Failed to validate document %s against %s",
filename, fileDTD);
GHMM_LOG(LDEBUG, estr);
m_free(estr);
}
}
STOP:
if (cvp != NULL)
xmlFreeValidCtxt(cvp);
if (doc != NULL)
xmlFreeDoc(doc);
if (dtd != NULL)
xmlFreeDtd(dtd);
return retval;
#undef CUR_PROC
}
/*===========================================================================*/
static int validateDynamicDTD(const char* filename) {
#define CUR_PROC "validateDynamicDTD"
int retval = 0;
char *estr;
xmlParserCtxtPtr ctxt = NULL;
xmlDocPtr doc = NULL;
ctxt = xmlNewParserCtxt();
/* silencing the validation errors */
ctxt->vctxt.error = (xmlValidityErrorFunc) silence;
ctxt->vctxt.warning = (xmlValidityWarningFunc) silence;
doc = xmlCtxtReadFile(ctxt, filename, NULL, XML_PARSE_DTDVALID);
/* check if parsing suceeded */
if (doc == NULL) {
estr = ighmm_mprintf(NULL, 0, "Failed to parse %s", filename);
GHMM_LOG(LDEBUG, estr);
m_free(estr);
} else {
if (ctxt->valid == 0) {
estr = ighmm_mprintf(NULL, 0, "Failed to validate %s", filename);
GHMM_LOG(LDEBUG, estr);
m_free(estr);
}
else
retval = 1;
}
xmlFreeDoc(doc);
xmlFreeParserCtxt(ctxt);
return retval;
#undef CUR_PROC
}
/*===========================================================================*/
int ghmm_xmlfile_validate(const char *filename) {
#define CUR_PROC "ghmm_xmlfile_validate"
return (validateFixedDTD(filename) || validateDynamicDTD(filename));
#undef CUR_PROC
}