Codebase list tigr-glimmer / 3e814cdb-e004-4d37-9919-46690da6c39e/main SimpleMake / start-codon-distrib.cc
3e814cdb-e004-4d37-9919-46690da6c39e/main

Tree @3e814cdb-e004-4d37-9919-46690da6c39e/main (Download .tar.gz)

start-codon-distrib.cc @3e814cdb-e004-4d37-9919-46690da6c39e/mainraw · history · blame

//  A. L. Delcher
//
//  File:  start-codon-distrib.cc
//
//  Last Modified:  Tue May 23 10:30:35 EDT 2006
//
//  This program reads the sequence in the file named as the
//  first command-line argument and then the list of gene coordinates
//  in the second command-line file.  It then counts the number of
//  different start codons for each of the genes in the second file.
//  If the genome is circular, the direction of each gene is assumed
//  to be the shorter direction around the circle; otherwise the
//  order of the coordinates determines the direction.  Alternately,
//  the 4th field of the coordinate file can specify the direction
//  of the gene.


#include  "start-codon-distrib.hh"


// External variables

extern int  Verbose;
extern int  Global_Debug_Flag;


// Global variables

static bool  Comma3_Output = false;
  // If set true by the -3 option, then output only a comma separated
  // list (no spaces) of atg, gtg, ttg start proportions, in that order
static char  * Coord_Info = NULL;
  // Name of file with list of coordinates (or a single coordinate
  // specification).
static bool  Is_Circular = true;
  // Determines whether the input sequence is regarded as a circular
  // genome.
static char  * Sequence_File_Name = NULL;
  // Name of file with input sequence
static bool  Use_Direction = false;
  // If set true (by -d option) then use 4th field of coords
  // to determine direction in a circular genome.



int  main
    (int argc, char * argv [])

  {
   FILE  * fp;
   vector <Count_Entry_t>  entry;
   string  sequence, hdr;
   char  line [MAX_LINE], tag [MAX_LINE];
   char  codon [4];
   long int  seq_len, start, end;
   int  i, n, total_entries = 0;

   Verbose = 0;

   Parse_Command_Line (argc, argv);

   fp = File_Open (Sequence_File_Name, "r");

   if  (! Fasta_Read (fp, sequence, hdr))
       {
        sprintf (Clean_Exit_Msg_Line, "ERROR:  Failed to open file %s",
             Sequence_File_Name);
        Clean_Exit (Clean_Exit_Msg_Line, __FILE__, __LINE__);
       }
   fclose (fp);

   seq_len = sequence . length ();

   if  (strcmp (Coord_Info, "-") == 0)
       fp = stdin;
     else
       fp = File_Open (Coord_Info, "r");

   codon [3] = '\0';
   while  (fgets (line, MAX_LINE, fp) != NULL)
     {
      int  dir;

      if  (Use_Direction)
          {
           if  (sscanf (line, "%s %ld %ld %d", tag, & start, & end, & dir) != 4)
               {
                fprintf (stderr, "ERROR:  Skipped following coord line\n");
                fputs (line, stderr);
                continue;
               }
          }
        else
          {
           if  (sscanf (line, "%s %ld %ld", tag, & start, & end) != 3)
               {
                fprintf (stderr, "ERROR:  Skipped following coord line\n");
                fputs (line, stderr);
                continue;
               }
           if  ((start < end && (! Is_Circular || end - start <= seq_len / 2))
                    || (Is_Circular && start - end > seq_len / 2))
               dir = +1;
             else
               dir = -1;
          }

      if  (dir > 0)
          {
           for  (i = 0;  i < 3;  i ++)
             codon [i] = tolower (sequence [Seq_Sub (start + i, seq_len)]);
           Incr (entry, codon);
          }
        else
          {
           for  (i = 0;  i < 3;  i ++)
             codon [i] = Complement (tolower (sequence [Seq_Sub (start - i, seq_len)]));
           Incr (entry, codon);
          }
      total_entries ++;
     }

   if  (Comma3_Output)
       {
        if  (total_entries == 0)
            total_entries = 1;
        n = entry . size ();
        for  (i = 0;  i < n;  i ++)
          if  (strcmp (entry [i] . codon, "atg") == 0)
              {
               printf ("%.3f", double (entry [i] . count) / total_entries);
               break;
              }
        if  (i == n)
            printf ("%.3f", 0.0);
        for  (i = 0;  i < n;  i ++)
          if  (strcmp (entry [i] . codon, "gtg") == 0)
              {
               printf (",%.3f", double (entry [i] . count) / total_entries);
               break;
              }
        if  (i == n)
            printf (",%.3f", 0.0);
        for  (i = 0;  i < n;  i ++)
          if  (strcmp (entry [i] . codon, "ttg") == 0)
              {
               printf (",%.3f\n", double (entry [i] . count) / total_entries);
               break;
              }
        if  (i == n)
            printf (",%.3f\n", 0.0);
       }
     else
       {
        sort (entry . begin (), entry . end (), Count_Entry_Cmp);
        n = entry . size ();
        for  (i = 0;  i < n;  i ++)
          {
           printf (" %s   %6d  %5.1f%%\n", entry [i] . codon, entry [i] . count,
                Percent (entry [i] . count, total_entries));
          }
        printf ("Total: %6d\n", total_entries);
       }

   return  0;
  }



static bool  Count_Entry_Cmp
    (Count_Entry_t const & a, Count_Entry_t const & b)

//  Compare  a  and  b  first by  count  (for descending order sort),
//  or if equal, by  codon  alphabetically.
    
  {
   if  (a . count > b . count)
       return  true;

   return  (a . count == b . count && strcmp (a . codon, b . codon) < 0);
  }



static void  Incr
    (vector <Count_Entry_t> & entry, const char s [4])

//  Search for string  s  in  entry .  If found, increment the
//  corresponding count; otherwise, create a new entry and set
//  its count to 1.

  {
   Count_Entry_t  e;
   int  i, n;

   n = entry . size ();
   for  (i = 0;  i < n;  i ++)
     if  (strcmp (entry [i] . codon, s) == 0)
         {
          entry [i] . count ++;
          return;
         }

   strcpy (e . codon, s);
   e . count = 1;
   entry . push_back (e);

   return;
  }



static void  Parse_Command_Line
    (int argc, char * argv [])

//  Get options and parameters from command line with  argc
//  arguments in  argv [0 .. (argc - 1)] .

  {
   bool  errflg = false;
   int  ch;

   optarg = NULL;

#if  ALLOW_LONG_OPTIONS
   int  option_index = 0;
   static struct option  long_options [] = {
        {"dir", 0, 0, 'd'},
        {"help", 0, 0, 'h'},
        {"minlen", 1, 0, 'l'},
        {"nostart", 0, 0, 's'},
        {"nostop", 0, 0, 't'},
        {"nowrap", 0, 0, 'w'},
        {"3comma", 0, 0, '3'},
        {0, 0, 0, 0}
      };

   while  (! errflg
        && ((ch = getopt_long (argc, argv, "dhw3",
                     long_options, & option_index)) != EOF))
#else
   while  (! errflg
        && ((ch = getopt (argc, argv, "dhw3")) != EOF))
#endif

     switch  (ch)
       {
        case  'd' :
          Use_Direction = true;
          break;

        case  'h' :
          Usage ();
          exit (EXIT_SUCCESS);

        case  'w' :
          Is_Circular = false;
          break;

        case  '3' :
          Comma3_Output = true;
          break;

        default :
          errflg = true;
       }

   if  (errflg || optind > argc - 2)
       {
        Usage ();
        exit (EXIT_FAILURE);
       }

   Sequence_File_Name = argv [optind ++];
   Coord_Info = argv [optind ++];

   return;
  }



static long int  Seq_Sub
    (long int sub, long int seq_len)

//  Return the subscript (in zero-based coordinates) of position
//   sub  (which is in 1-based coordinates) allowing for circular
//  wraparounds (off either end) of a sequence of length  seq_len .

  {
   sub --;

   while  (sub < 0)
     sub += seq_len;

   while  (seq_len - 1 <= sub)
     sub -= seq_len;

   return  sub;
  }



static void  Usage
    (void)

//  Print to stderr description of options and command line for
//  this program.

  {
   fprintf (stderr,
       "USAGE:  start-codon-distrib [options] <sequence-file> <coords>\n"
       "\n"
       "Read fasta-format <sequence-file> and count the number of\n"
       "different start codons for the genes specified in <coords>.\n"
       "By default, <coords> is the name of a file containing lines of the form\n"
       "  <tag>  <start>  <stop>  [<frame>] ...\n"
       "Coordinates are inclusive counting from 1, e.g., \"1 3\"\n"
       "represents the 1st 3 characters of the sequence.\n"
       "Output goes to stdout.\n"
       "\n"
       "Options:\n"
       " -d\n"
       " --dir\n"
       "    Use the 4th column of each input line to specify the direction\n"
       "    of the sequence.  Positive is forward, negative is reverse\n"
       "    The input sequence is assumed to be circular\n"
       " -h\n"
       "    Print this message\n"
       " -w\n"
       " --nowrap\n"
       "    Use the actual input coordinates without any wraparound\n"
       "    that would be needed by a circular genome.  Without this\n"
       "    option, the output sequence is the shorter of the two ways\n"
       "    around the circle.  Use the -d option to specify direction\n"
       "    explicitly.\n"
       " -3\n"
       " --3comma\n"
       "    output only a comma separated list (no spaces) of atg, gtg, ttg\n"
       "start proportions, in that order\n"
       "\n");

   return;
  }