Codebase list tigr-glimmer / a2ba7ce SimpleMake / entropy-profile.cc
a2ba7ce

Tree @a2ba7ce (Download .tar.gz)

entropy-profile.cc @a2ba7ceraw · history · blame

//  A. L. Delcher
//
//  File:  entropy-profile.cc
//
//  Last Modified:  Tue May 23 10:30:35 EDT 2006
//
//  This program reads a multifasta file of gene sequences.
//  It translates each sequence to it protein product and
//  computes and prints the natural and entropy distributions
//  of the amino acids.  It also translates the reverse-complement
//  of each sequence, and prints the same distributions for
//  those sequences.



#include  "entropy-profile.hh"


// External variables

extern int  Verbose;
extern int  Global_Debug_Flag;


// Global variables

static bool  Brief_Output = false;
  // Determines output format
static long int  Min_Len = 0;
  // Sequences shorter than this are ignored



int  main
    (int argc, char * argv [])

  {
   string  sequence, hdr;
   string  rev_sequence;
   double  entropy [26], prob [26];
   double  ep [20];
   double  rev_entropy [26], rev_prob [26];
   double  rev_ep [20];
   int  count [26] = {0}, rev_count [26] = {0};
   int  total = 0, rev_total = 0;;
   int  i, j;

   Verbose = 0;

   Parse_Command_Line (argc, argv);

   while  (Fasta_Read (stdin, sequence, hdr))
     {
      const char  * seq, * rev_seq;
      char  aa;
      int  i, n;

      n = sequence . length ();
      if  (n < Min_Len || n % 3 != 0)
          continue;

      rev_sequence = seq;
      Reverse_Complement (rev_sequence);

      seq = sequence . c_str ();
      rev_seq = rev_sequence . c_str ();
      for  (i = 0;  i < n;  i += 3)
        {
         aa = Codon_Translation (seq + i);
         if  (aa != '*')
             count [aa - 'A'] ++;
         aa = Codon_Translation (rev_seq + i);
         if  (aa != '*')
             rev_count [aa - 'A'] ++;
        }

     }

   for  (i = 0;  i < 26;  i ++)
     if  (IS_AMINO [i])
         {
          total += count [i];
          rev_total += rev_count [i];
         }

   Counts_To_Entropy_Profile (count, ep);
   Counts_To_Entropy_Profile (rev_count, rev_ep);

   if  (Brief_Output)
       {
        printf ("AA  %8s  %8s\n", "Positive", "Negative");
        for  (i = j = 0;  i < 26;  i ++)
          if  (IS_AMINO [i])
              {
               printf (" %c  %8.5f  %8.5f\n", 'A' + i, ep [j], rev_ep [j]);
               j ++;
              }
       }
     else
       {
        printf ("%2s %29s   %29s\n", "", "--- Forward Translation ----",
             "--- Reverse Translation ----");
        printf ("%2s %6s %6s  %6s  %6s   %6s %6s  %6s  %6s\n",
             "AA", "Count", "Percen", "Entrpy", "EFrac",
             "Count", "Percen", "Entrpy", "EFrac");
        for  (i = 0;  i < 26;  i ++)
          if  (IS_AMINO [i])
              {
               prob [i] = double (count [i]) / total;
               entropy [i] = -1.0 * prob [i] * log (prob [i]);
               rev_prob [i] = double (rev_count [i]) / rev_total;
               rev_entropy [i] = -1.0 * rev_prob [i] * log (rev_prob [i]);
              }

        for  (i = j = 0;  i < 26;  i ++)
          if  (IS_AMINO [i])
              {
               printf ("%c: %6d %5.1f%%  %6.3f  %6.3f   %6d %5.1f%%  %6.3f  %6.3f\n",
                    'A' + i, count [i], Percent (count [i], total),
                    entropy [i], ep [j],
                    rev_count [i], Percent (rev_count [i], rev_total),
                    rev_entropy [i], rev_ep [j]);
               j ++;
              }
       }

   return  0;
  }



static void  Parse_Command_Line
    (int argc, char * argv [])

//  Get options and parameters from command line with  argc
//  arguments in  argv [0 .. (argc - 1)] .

  {
   bool  errflg = false;
   int  ch;

   optarg = NULL;

#if  ALLOW_LONG_OPTIONS
   int  option_index = 0;
   static struct option  long_options [] = {
        {"brief", 0, 0, 'b'},
        {"help", 0, 0, 'h'},
        {"minlen", 1, 0, 'l'},
        {0, 0, 0, 0}
      };

   while  (! errflg
        && ((ch = getopt_long (argc, argv, "bhl:",
                     long_options, & option_index)) != EOF))
#else
   while  (! errflg
        && ((ch = getopt (argc, argv, "bhl:")) != EOF))
#endif

     switch  (ch)
       {
        case  'b' :
          Brief_Output = true;
          break;

        case  'h' :
          Usage ();
          exit (EXIT_SUCCESS);

        case  'l' :
          Min_Len = strtol (optarg, NULL, 10);
          break;

        default :
          errflg = true;
       }

   if  (errflg || optind > argc - 0)
       {
        Usage ();
        exit (EXIT_FAILURE);
       }

   return;
  }



static void  Usage
    (void)

//  Print to stderr description of options and command line for
//  this program.

  {
   fprintf (stderr,
       "USAGE:  entropy-profile [options] < input-file\n"
       "\n"
       "Read multi-fasta-format gene sequences from stdin.\n"
       "Translate each to its protein product and then print\n"
       "the natural and entropy distributions of the amino acids\n"
       "Output goes to stdout\n"
       "\n"
       "Options:\n"
       " -b\n"
       " --brief\n"
       "    Brief output:  3 columns with header line\n"
       " -h\n"
       " --help\n"
       "    Print this message\n"
       " -l <n>\n"
       " --minlen <n>\n"
       "    Don't output any sequence shorter than <n> characters\n"
       "\n");

   return;
  }