simulator.c - dazzdb (27faf40d-6825-4cdb-97a6-7e916b8d8318/upstream)

simulator.c @27faf40d-6825-4cdb-97a6-7e916b8d8318/upstream — raw · history · blame

/*******************************************************************************************
 *
 *  Synthetic DNA shotgun dataset simulator
 *     From a supplied reference genome in the form of a Dazzler .dam, sample reads of
 *     mean length -m from a log-normal length distribution with standard deviation -s,
 *     but ignore reads of length less than -x.  Collect enough reads to cover the genome
 *     -c times.   Introduce -e fraction errors into each read where the ratio of insertions,
 *     deletions, and substitutions are set by defined constants INS_RATE and DEL_RATE
 *     within generate.c.  The fraction -f controls the rate at which reads are picked from
 *     the forward and reverse strands which defaults to 50%.  If -C is set then assume the
 *     scaffolds are circular.
 *
 *     The -r parameter seeds the random number generator for the generation of the genome
 *     so that one can reproducbile produce the same underlying genome to sample from.  If
 *     missing, then the job id of the invocation seeds the generator.  The output is sent
 *     to the standard output (i.e. it is a pipe).  The output is in fasta format (i.e. it is
 *     a UNIX pipe).  The output is in Pacbio .fasta format suitable as input to fasta2DB.
 *
 *     The genome is considered a sequence of *scaffolds* (these are reconstituted from the
 *     Dazzler's internal encoding of a .dam), where the gaps are filled with a random
 *     sequence that follows the base distribution of the contigs of the genome.  The program
 *     then samples these filled in scaffolds for reads.  If the -C optioin is set then the
 *     program assumes each scaffold is a circular sequence.
 *
 *     The -M option requests that the scaffold and coordinates from which each read has
 *     been sampled are written to the indicated file, one line per read, ASCII encoded.
 *     This "map" file essentially tells one where every read belongs in an assembly and
 *     is very useful for debugging and testing purposes.  If a read pair is say b,e then
 *     if b < e the read was sampled from [b,e] in the forward direction, and from [e,b]
 *     in the reverse direction otherwise.
 *
 *  Author:  Gene Myers
 *  Date  :  July 2013
 *  Mod   :  April 2016 (generates reads w.r.t. a reference genome)
 *
 ********************************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <math.h>

#define PACBIO

#include "DB.h"

static char *Usage[] = { "<genome:dam> [-CU] [-m<int(10000)>]  [-s<int(2000)>] [-e<double(.15)>]",
                         "                   [-c<double(50.)>] [-f<double(.5)>] [-x<int(4000)>]",
                         "                   [-w<int(80)>] [-r<int>] [-M<file>]",
                       };

static int    CIRCULAR;   // -C option
static int    UPPER;      // -U option
static int    RMEAN;      // -m option
static int    RSDEV;      // -s option
static double ERROR;      // -e option
static double COVERAGE;   // -c option
static double FLIP_RATE;  // -f option
static int    RSHORT;     // -x option
static int    WIDTH;      // -w option
static int    HASR;       // -r option is set?
static int    SEED;       // -r option
static FILE  *MAP;        // -M option

#ifdef PACBIO

#define INS_RATE  .73333  // insert rate (for PB data)
#define DEL_RATE  .20000  // deletion rate
#define IDL_RATE  .93333  // insert + delete rate

#elif ILLUMINA

#define INS_RATE  .1  // insert rate (for Illumina data)
#define DEL_RATE  .1  // deletion rate
#define IDL_RATE  .2  // insert + delete rate

#else

#define INS_RATE  .33333  // insert rate (equal weighting)
#define DEL_RATE  .33333  // deletion rate
#define IDL_RATE  .66666  // insert + delete rate

#endif

//  Complement (in the DNA sense) string *s*.

static void complement(int elen, char *s)
{ char *t;
  int   c;

  t = s + (elen-1);
  while (s <= t)
    { c = *s;
      *s = (char) (3-*t);
      *t = (char) (3-c);
      s += 1;
      t -= 1;
    }
}

//  A unit normal distribution random number generator

#define UNORM_LEN 60000
#define UNORM_MAX   6.0

static double unorm_table[UNORM_LEN+1];  // Upper half of cdf of N(0,1)
static double unorm_scale;

static void init_unorm()
{ double del, sum, x;
  int    i;

  unorm_scale = del = UNORM_MAX / UNORM_LEN;

  sum = 0;                            // Integrate pdf, x >= 0 half only.
  for (i = 0; i < UNORM_LEN; i++)
    { x = i * del;
      unorm_table[i] = sum;
      sum += exp(-.5*x*x) * del;
    }
  unorm_table[UNORM_LEN] = sum;

                /* Normalize cdf */
  sum *= 2.;
  for (i = 0; i < UNORM_LEN; i++)
    unorm_table[i] /= sum;
  unorm_table[UNORM_LEN] = 1.;

#ifdef DEBUG
  printf("Truncated tail is < %g\n",
          exp(-.5*UNORM_MAX*UNORM_MAX)/(sum*(1.-exp(-UNORM_MAX))) );
  printf("Diff between last two entries is %g\n",.5-unorm_table[UNORM_LEN-1]);

  printf("\n  CDF:\n");
  for (i = 0; i <= UNORM_LEN; i += 100)
    printf("%6.2f: %10.9f\n",i*del,unorm_table[i]);
#endif
}

static int bin_search(int len, double *tab, double y)
{ int l, m, r;

  // Searches tab[0..len] for min { r : y < tab[r] }.
  //   Assumes y < 1, tab[0] = 0 and tab[len] = 1.
  //   So returned index is in [1,len].

  l = 0;
  r = len;
  while (l < r)
    { m = (l+r) >> 1;
      if (y < tab[m])
        r = m;
      else
        l = m+1;
    }
  return (r);
}

static double sample_unorm(double x)
{ double y;
  int    f;

  if (x >= .5)  // Map [0,1) random var to upper-half of cdf */
    y = x-.5;
  else
    y = .5-x;

  f = bin_search(UNORM_LEN,unorm_table,y);    // Bin. search upper-half cdf
#ifdef DEBUG
  printf("Normal search %g -> %g -> %d",x,y,f);
#endif

  // Linear interpolate between table points

  y = (f - (unorm_table[f]-y) / (unorm_table[f] - unorm_table[f-1]) ) * unorm_scale;

  if (x < .5) y = -y;       // Map upper-half var back to full range
#ifdef DEBUG
  printf(" -> %g\n",y);
#endif

  return (y);
}

//  Open and trim the reference genome *name*.  Determine the number of scaffolds and sizes
//    of each scaffold (in nscaffs and the .coff field of the read records) in the dam.  Then
//    create a sequence for each scaffold (index in the .boff field of the read records), that
//    consists of its contigs with a random sequence filling the gaps (generated according to
//    the bp frequency in db.freq[4]).

DAZZ_DB *load_and_fill(char *name, int *pscaffs)
{ static DAZZ_DB db;
  DAZZ_READ *reads;
  FILE      *bases;
  char      *bases_name;
  char      *seq;
  int        nreads, nscaffs;
  int        i, c;
  int64      ctot;
  int64      o, u;
  double     PRA, PRC, PRG;

  if (Open_DB(name,&db) != 1)
    { fprintf(stderr,"%s: %s is not a Dazzler .dam\n",Prog_Name,name);
      exit (1);
    }
  Trim_DB(&db);

  PRA = db.freq[0];
  PRC = PRA + db.freq[1];
  PRG = PRC + db.freq[2];

  nreads  = db.nreads;
  reads   = db.reads;

  nscaffs = 0;
  for (i = 0; i < nreads; i++)
    if (reads[i].origin == 0)
      nscaffs += 1;

  for (i = 0; i < nscaffs; i++)
    reads[i].coff = 0;

  c = -1;
  for (i = 0; i < nreads; i++)
    { if (reads[i].origin == 0)
        c += 1;
      reads[c].coff = reads[i].fpulse+reads[i].rlen;
    }

  ctot = 0;
  for (i = 0; i < nscaffs; i++)
    ctot += reads[i].coff+1;

  bases_name = Strdup(Catenate(db.path,"","",".bps"),"Allocating base-pair file name");
  bases = Fopen(bases_name,"r");
  if (bases_name == NULL || bases == NULL)
    exit (1);

  seq = (char *) Malloc(ctot+4,"Allocating space for genome");
  if (seq == NULL)
    exit (1);
  *seq++ = 4;

  c = -1;
  o = u = 0;
  for (i = 0; i < nreads; i++)
    { int   len, clen;
      int64 off;

      if (reads[i].origin == 0)
        { if (c >= 0)
            o += reads[c].coff + 1;
          c += 1;
          u = o;
        }
      else
        { int64  p;
          double x;

          p = u + reads[i-1].rlen;
          u = o + reads[i].fpulse;
          while (p < u)
            { x = drand48();
              if (x < PRC)
                if (x < PRA)
                  seq[p++] = 0;
                else
                  seq[p++] = 1;
              else
                if (x < PRG)
                  seq[p++] = 2;
                else
                  seq[p++] = 3;
            }
        }

      len = reads[i].rlen;
      off = reads[i].boff;
      if (ftello(bases) != off)
        FSEEKO(bases,off,SEEK_SET)
      clen = COMPRESSED_LEN(len);
      if (clen > 0)
        FFREAD(seq+u,clen,1,bases)
      Uncompress_Read(len,seq+u);
      if (reads[i].origin == 0)
        reads[c].boff = o;
    }
  reads[nscaffs].boff = ctot;

  db.bases  = (void *) seq;
  db.loaded = 1;

  *pscaffs = nscaffs;
  return (&db);
}

//  Generate reads (a) whose lengths are exponentially distributed with mean *mean* and
//    standard deviation *stdev*, and (b) that are never shorter than *shortest*.  Each
//    read is a randomly sampled interval of one of the filled scaffolds of *source*
//    (each interval is equally likely) that has insertion, deletion, and/or substitution
//    errors introduced into it and which is oriented in either the forward or reverse
//    strand direction with probability FLIP_RATE.  The number of errors introduced is the
//    length of the string times *erate*, and the probability of an insertion, delection,
//    or substitution is controlled by the defined constants INS_RATE and DEL_RATE.
//    If the -C option is set then each scaffold is assumed to be circular and reads can
//    be sampled that span the origin.   Reads are generated until the sum of the lengths of
//    the reads is greater thant coverage times the sum of the lengths of the scaffolds in
//    the reference (i.e. including filled scaffold gaps in the genome size).  The reads are
//    output as fasta entries with the PacBio-specific header format that contains the
//    sampling interval, read length, and a read id.

static void shotgun(DAZZ_DB *source, int nscaffs)
{ DAZZ_READ *reads;
  int64      gleng;
  int        maxlen, nreads, qv;
  int64      totlen, totbp;
  char      *rbuffer, *bases;
  double     nmean, nsdev;
  double    *weights;
  int        scf;

  nsdev = (1.*RSDEV)/RMEAN;
  nsdev = log(1.+nsdev*nsdev);
  nmean = log(1.*RMEAN) - .5*nsdev;
  nsdev = sqrt(nsdev);

  bases = source->bases;
  reads = source->reads;
  gleng = reads[nscaffs].boff - nscaffs;
  if (gleng <= RSHORT)
    { fprintf(stderr,"Genome length is less than shortest read length !\n");
      exit (1);
    }

  init_unorm();

  weights = (double *) Malloc(sizeof(double)*(nscaffs+1),"Allocating contig weights");
  if (weights == NULL)
    exit (1);

  { double r;

    r = 0.;
    for (scf = 0; scf < nscaffs; scf++)
      { weights[scf] = r/gleng;
        r += reads[scf].coff;
      }
    weights[nscaffs] = 1.;
  }

  qv = (int) (1000 * (1.-ERROR));

  rbuffer = NULL;
  maxlen  = 0;
  totlen  = 0;
  totbp   = COVERAGE*gleng;
  nreads  = 0;
  while (totlen < totbp)
    { int    len, sdl, ins, del, elen, slen, rbeg, rend;
      int    j;
      double uni;
      char  *s, *t;

      scf = bin_search(nscaffs,weights,drand48()) - 1;   //  Pick a scaffold with probability
                                                         //    proportional to its length

      uni = drand48();
      len = (int) exp(nmean + nsdev*sample_unorm(uni));    //  Pick a read length
      if (len <= RSHORT)
        continue;

      // New sampler:

      slen = reads[scf].coff;
      rbeg = (int) (drand48()*slen);          //  Pick a spot for read start
      if (CIRCULAR)
        rend = (rbeg + len) % slen;           //  Wrap if circular
      else
        { if (drand48() < .5)                 //  Pick direction and trim if necessary
            { rend = rbeg + len;              //    if not circular
              if (rend > slen)
                { rend = slen;
                  len  = rend - rbeg;
                }
            }
          else
            { rend = rbeg;
              rbeg = rbeg - len;
              if (rbeg < 0)
                { rbeg = 0;
                  len  = rend;
                }
            }
          if (len <= RSHORT)
            continue;
        }

      // Old sampler:
      //
      // rbeg = (int) (drand48()*((reads[scf].coff-len)+.9999999));
      // rend = rbeg + len;

      sdl = (int) (len*ERROR);      //  Determine number of inserts *ins*, deletions *del,
      ins = del = 0;                //    and substitions+deletions *sdl*.
      for (j = 0; j < sdl; j++)
        { double x = drand48();
          if (x < INS_RATE)
            ins += 1;
          else if (x < IDL_RATE)
            del += 1;
        }
      sdl -= ins;
      elen = len + (ins-del);

      if (elen > maxlen)
        { maxlen  = ((int) (1.2*elen)) + 1000;
          rbuffer = (char *) Realloc(rbuffer,maxlen+3,"Allocating read buffer");
          if (rbuffer == NULL)
            exit (1);
        }

      t = rbuffer;
      s = bases + (reads[scf].boff + rbeg);

      //   Generate the string with errors.  NB that inserts occur randomly between source
      //     characters, while deletions and substitutions occur on source characters.

      while ((len+1) * drand48() < ins)
        { *t++ = (char) (4.*drand48());
          ins -= 1;
        }
      for ( ; len > 0; len--)
        { if (len * drand48() >= sdl)
            *t++ = *s;
          else if (sdl * drand48() >= del)
            { double x = 3.*drand48();
              if (x >= *s)
                x += 1.;
              *t++ = (char) x;
              sdl -= 1;
            }
          else
            { del -= 1;
              sdl -= 1;
            }
          s += 1;
          if (*s == 4)
            s = bases + reads[scf].boff;
          while (len * drand48() < ins)
            { *t++ = (char) (4.*drand48());
              ins -= 1;
            }
        }
      *t = 4;

      if (drand48() >= FLIP_RATE)    //  Complement the string with probability FLIP_RATE.
        { complement(elen,rbuffer);
          j = rend;
          rend = rbeg;
          rbeg = j;
        }

      PRINTF(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv)
      if (UPPER)
        Upper_Read(rbuffer);
      else
        Lower_Read(rbuffer);
      for (j = 0; j+WIDTH < elen; j += WIDTH)
        PRINTF("%.*s\n",WIDTH,rbuffer+j)
      if (j < elen)
        PRINTF("%s\n",rbuffer+j)

       if (MAP != NULL)
         FPRINTF(MAP," %6d %9d %9d\n",scf,rbeg,rend)

       totlen += elen;
       nreads += 1;
    }
}

int main(int argc, char *argv[])
{ DAZZ_DB *source;
  int      nscaffs;

  //  Process command line

  { int    i, j, k;
    int    flags[128];
    char  *eptr;

    ARG_INIT("simulator");

    RMEAN     = 10000;
    RSDEV     = 2000;
    ERROR     = .15;
    COVERAGE  = 50.;
    FLIP_RATE = .5;
    RSHORT    = 4000;
    HASR      = 0;
    MAP       = NULL;
    WIDTH     = 80;

    j = 1;
    for (i = 1; i < argc; i++)
      if (argv[i][0] == '-')
        switch (argv[i][1])
        { default:
            ARG_FLAGS("CU");
            break;
          case 'c':
            ARG_REAL(COVERAGE)
            if (COVERAGE < 0.)
              { fprintf(stderr,"%s: Coverage must be non-negative (%g)\n",Prog_Name,COVERAGE);
                exit (1);
              }
            break;
          case 'e':
            ARG_REAL(ERROR)
            if (ERROR < 0. || ERROR > .5)
              { fprintf(stderr,"%s: Error rate must be in [0,.5] (%g)\n",Prog_Name,ERROR);
                exit (1);
              }
            break;
          case 'f':
            ARG_REAL(FLIP_RATE)
            if (FLIP_RATE < 0. || FLIP_RATE > 1.)
              { fprintf(stderr,"%s: Error rate must be in [0,1] (%g)\n",Prog_Name,FLIP_RATE);
                exit (1);
              }
            break;
          case 'm':
            ARG_POSITIVE(RMEAN,"Mean read length")
            break;
          case 'r':
            SEED = strtol(argv[i]+2,&eptr,10);
            HASR = 1;
            if (*eptr != '\0' || argv[i][2] == '\0')
              { fprintf(stderr,"%s: -r argument is not an integer\n",Prog_Name);
                exit (1);
              }
            break;
          case 's':
            ARG_NON_NEGATIVE(RSDEV,"Read length standard deviation")
            break;
          case 'x':
            ARG_NON_NEGATIVE(RSHORT,"Read length minimum")
            break;
          case 'w':
            ARG_NON_NEGATIVE(WIDTH,"Line width")
            break;
          case 'M':
            MAP = Fopen(argv[i]+2,"w");
            if (MAP == NULL)
              exit (1);
            break;
        }
      else
        argv[j++] = argv[i];
    argc = j;

    CIRCULAR = flags['C'];
    UPPER    = flags['U'];

    if (argc != 2)
      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
        fprintf(stderr,"       %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
        fprintf(stderr,"       %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
        fprintf(stderr,"\n");
        fprintf(stderr,"      -m: average read length (log normal distribution).\n");
        fprintf(stderr,"      -s: standard deviation of read lengths (log normal)\n");
        fprintf(stderr,"      -x: ignore reads below this length\n");
        fprintf(stderr,"      -f: forward/reverse strand sampling fraction\n");
        fprintf(stderr,"      -e: error rate\n");
        fprintf(stderr,"      -c: coverage of genome\n");
        fprintf(stderr,"      -C: assume genome is circular (default is linear)\n");
        fprintf(stderr,"\n");
        fprintf(stderr,"      -r: Random number generator seed (default is process id).\n");
        fprintf(stderr,"      -w: Print -w bp per line (default is 80).\n");
        fprintf(stderr,"      -U: Use upper case for DNA (default is lower case).\n");
        fprintf(stderr,"      -M: create a map file that indicates where every read was sampled\n");
        exit (1);
      }
  }

  if (HASR)
    srand48(SEED);
  else
    srand48(getpid());

  //  Read and generate

  source = load_and_fill(argv[1],&nscaffs);

  shotgun(source,nscaffs);

  if (MAP != NULL)
    fclose(MAP);

  exit (0);
}
Tree @27faf40d-6825-4cdb-97a6-7e916b8d8318/upstream (Download .tar.gz)

simulator.c @27faf40d-6825-4cdb-97a6-7e916b8d8318/upstream — raw · history · blame