Codebase list libocas / HEAD load_svmlight_file.c
HEAD

Tree @HEAD (Download .tar.gz)

load_svmlight_file.c @HEADraw · history · blame

/*=================================================================
 *  [feat,labels] = load_svmlight_format(file_name)
 *  [feat,labels] = load_svmlight_format(file_name,verb)
 *
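 *  This function reads examples from a file complying with the
 *  SVM^light format.
 *
 *  Input:
 *   file_name [string] path to the input file.
 *   verb [scalar] if non-zero, progress is printed (default 1).
 *
 *  Output:
 *   feat [nDims x nData sparse] examples stored as columns.
 *   labels [nData x 1 double] label of each example.
 *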
 *=================================================================*/

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <mex.h>
#include <sys/time.h>
#include <time.h>
#include <errno.h>

/* Compatibility shim: when MX_API_VER is undefined or lower than
 * 0x07040000, the index/size types mwSize and mwIndex (and the local
 * alias INDEX_TYPE_T) are defined as plain int. Otherwise INDEX_TYPE_T
 * is an alias for mwSize. */
#if !defined(MX_API_VER) || MX_API_VER<0x07040000
#define mwSize int
#define INDEX_TYPE_T int
#define mwIndex int
#else
#define INDEX_TYPE_T mwSize
#endif

#include "lib_svmlight_format.h"

/* NOTE(review): MaxExamples is not referenced in the visible part of this
 * file; presumably an upper bound on the number of examples - confirm
 * before relying on it. */
#define MaxExamples   50000000

/* Generic helpers. MIN, MAX and ABS expand their arguments more than once,
 * so do not pass expressions with side effects (e.g. i++). */
#define MIN(A,B) ((A) > (B) ? (B) : (A))
#define MAX(A,B) ((A) < (B) ? (B) : (A))
#define ABS(A) ((A) < 0 ? -(A) : (A))
/* Linear offset of element (ROW,COL) in an array stored column by column
 * with NUM_ROWS rows per column. */
#define INDEX2(ROW,COL,NUM_ROWS) ((COL)*(NUM_ROWS)+(ROW))


/*======================================================================
  Main code plus interface to Matlab.
========================================================================*/

void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[] )
{
  char fname[1000];
  FILE *fid;
  char *line;
  double *feat_val;
  uint32_t *feat_idx;
  long nnzf;
  mxArray *W;
  long nDims = 0;
  long nData;
  long i, j;
  long nnz = 0;
  double max_data_norm2 = -mxGetInf();
  mwIndex *irs, *jcs;
  double *sr;
  int verb=0;

  if( nrhs < 1 )
    mexErrMsgTxt("At least one input argument required.\n"
                 "Synopsis:\n"
                 " [feat,labels] = load_svmlight_format(file_name)\n"
                 " [feat,labels] = load_svmlight_format(file_name,verb)\n"
                 " \n"
                 );
  if( nrhs >= 2)
    verb = (int)mxGetScalar(prhs[1]);
  else
    verb = 1;

  /* get input arguments */
  mxGetString(prhs[0], fname, 1000);

  if(verb)
    mexPrintf("Input file: %s\n", fname);

  fid = fopen(fname, "r");
  if(fid == NULL) {
    perror("fopen error: ");
    mexErrMsgTxt("Cannot open input file.");
  }

  /**********************************/
  line = mxCalloc(LIBSLF_MAXLINELEN, sizeof(char));
  if( line == NULL )
    mexErrMsgTxt("Not enough memmory to allocate line buffer.");

  feat_idx = mxCalloc(LIBSLF_MAXLINELEN, sizeof(uint32_t));
  if( feat_idx == NULL )
    mexErrMsgTxt("Not enough memmory to allocate feat_idx.");

  feat_val = mxCalloc(LIBSLF_MAXLINELEN, sizeof(double));
  if( feat_val == NULL )
    mexErrMsgTxt("Not enough memmory to allocate feat_val.");



  /*********************************************/
  /* Main code                                 */
  /*********************************************/

  if(verb)
    mexPrintf("Analysing input data...");

  double label;
  int go = 1;
  long line_cnt = 0;

  while(go) {
    
    if(fgets(line,LIBSLF_MAXLINELEN, fid) == NULL ) 
    {
      go = 0;
      if(verb)
      {
        if( (line_cnt % 1000) != 0) 
          mexPrintf(" %d", line_cnt);
        mexPrintf(" EOF.\n");
      }

    }
    else
    {
      line_cnt ++;
      nnzf = svmlight_format_parse_line_doubley(line, &label, feat_idx, feat_val);
      
      if(nnzf == -1) 
      {
         mexPrintf("Parsing error on line %d .\n", line_cnt);
         mexErrMsgTxt("Defective input file.");
      }

      double norm2 = 0;
      for(j = 0; j < nnzf; j++)
        norm2 += feat_val[j]*feat_val[j];

      max_data_norm2 = MAX(max_data_norm2,norm2);

      nDims = MAX(nDims,feat_idx[nnzf-1]);

      nnz += nnzf;
      
      if( (line_cnt % 1000) == 0) {
        if(verb)
        {
          mexPrintf(" %d", line_cnt);
          fflush(NULL);
        }
      }
    }
  }

  nData = line_cnt;

  fclose(fid);  
  if(verb)
  {
    mexPrintf("Number of examples: %d\n", nData);
    mexPrintf("Dimensions: %d\n", nDims);
    mexPrintf("nnz: %d, density: %f%%\n", nnz, 100*(double)nnz/((double)nDims*(double)nData) );
    mexPrintf("max_i ||x_i||^2: %f\n", max_data_norm2);
  }

  /*---------------------------------------------*/


  mxArray* sp_mat_X = mxCreateSparse(nDims, nData, nnz, mxREAL);
  if( sp_mat_X == NULL)
    mexErrMsgTxt("Not enough memory to allocate sp_mat_X");
  plhs[0] = sp_mat_X;

  plhs[1] = mxCreateDoubleMatrix(nData,1,mxREAL);
  if( plhs[1] == NULL)
    mexErrMsgTxt("Not enough memory to allocate vec_y.");
  double *vec_y = mxGetPr(plhs[1]);

  sr  = mxGetPr(sp_mat_X);
  irs = mxGetIr(sp_mat_X);
  jcs = mxGetJc(sp_mat_X);

  fid = fopen(fname, "r");
  if(fid == NULL) {
    perror("fopen error: ");
    mexErrMsgTxt("Cannot open input file.");
  }

  if(verb)
    mexPrintf("Reading examples...");
  
  go = 1;
  line_cnt = 0;
  long k=0;
  while(go) {
    if(fgets(line,LIBSLF_MAXLINELEN, fid) == NULL ) 
    {
      go = 0;
      if(verb)
      {
        if( (line_cnt % 1000) != 0) 
          mexPrintf(" %d", line_cnt);
        mexPrintf(" EOF.\n");
      }
    }
    else
    {
      line_cnt ++;
      nnzf = svmlight_format_parse_line_doubley(line, &label, feat_idx, feat_val);
      
      if(nnzf == -1) 
      {
         mexPrintf("Parsing error on line %d .\n", line_cnt);
         mexErrMsgTxt("Defective input file.");
      }

      vec_y[line_cnt-1] = (double)label;

      jcs[line_cnt-1] = k;

      for(j = 0; j < nnzf; j++) {
        sr[k] = feat_val[j];
        irs[k] = feat_idx[j]-1;
        k++;
      }
      
      if(verb)
      {
        if( (line_cnt % 1000) == 0) {
          mexPrintf(" %d", line_cnt);
          fflush(NULL);
        }
      }
    }
  }
  jcs[line_cnt] = k;

  plhs[0] = sp_mat_X;

  fclose(fid);

  return;
}