Codebase list tigr-glimmer / a2ba7ce SimpleMake / extract.cc
a2ba7ce

Tree @a2ba7ce (Download .tar.gz)

extract.cc @a2ba7ceraw · history · blame

//  A. L. Delcher
//
//  File:  extract.cc
//
//  Last Modified:  5 Aug 2005
//
//  This program reads the sequence in the file named as the
//  first command-line argument and then extracts from it
//  sequences specified by coordinates.  The resulting sequences
//  are output (in multifasta or two-string format) to stdout.



#include  "extract.hh"


// External variables

extern int  Verbose;
extern int  Global_Debug_Flag;


// Global variables

static char  * Coord_Info = NULL;
  // Name of file with list of coordinates (or a single coordinate
  // specification).
static bool  Fasta_Output_Format = true;
  // Determines format of output.
static bool  Is_Circular = true;
  // Determines whether the input sequence is regarded as a circular
  // genome.
static long int  Min_Len = 0;
  // Output sequences shorter than this are not printed.
static char  * Sequence_File_Name = NULL;
  // Name of file with input sequence
static bool  Skip_Start = false;
  // If set true (by -s option) then omit the first 3 letters
  // of each output sequence.
static bool  Skip_Stop = false;
  // If set true (by -t option) then omit the last 3 letters
  // of each output sequence.
static bool  Use_Direction = false;
  // If set true (by -d option) then use 4th field of coords
  // to determine direction in a circular genome.



int  main
    (int argc, char * argv [])

  {
   FILE  * fp;
   string  sequence, hdr;
   char  line [MAX_LINE], tag [MAX_LINE];
   long int  seq_len;

   Verbose = 0;

   Parse_Command_Line (argc, argv);

   fp = File_Open (Sequence_File_Name, "r");

   if  (! Fasta_Read (fp, sequence, hdr))
       {
        sprintf (Clean_Exit_Msg_Line, "ERROR:  Failed to read file %s",
             Sequence_File_Name);
        Clean_Exit (Clean_Exit_Msg_Line, __FILE__, __LINE__);
       }
   fclose (fp);

   seq_len = sequence . length ();

   if  (strcmp (Coord_Info, "-") == 0)
       fp = stdin;
     else
       fp = File_Open (Coord_Info, "r");

   while  (fgets (line, MAX_LINE, fp) != NULL)
     {
      long int  i, start, end, extract_len;
      int  dir;

      if  (Use_Direction)
          {
           if  (sscanf (line, "%s %ld %ld %d", tag, & start, & end, & dir) != 4)
               {
                fprintf (stderr, "ERROR:  Skipped following coord line\n");
                fputs (line, stderr);
                continue;
               }
          }
        else
          {
           if  (sscanf (line, "%s %ld %ld", tag, & start, & end) != 3)
               {
                fprintf (stderr, "ERROR:  Skipped following coord line\n");
                fputs (line, stderr);
                continue;
               }
           if  ((start < end && (! Is_Circular || end - start <= seq_len / 2))
                    || (Is_Circular && start - end > seq_len / 2))
               dir = +1;
             else
               dir = -1;
          }

      if  (dir > 0)
          {
           extract_len = 1 + end - start;
           if  (extract_len < 0)
               extract_len += seq_len;
           if  (extract_len < Min_Len)
               continue;

           i = start - 1;
           if  (Skip_Start)
               {
                i += 3;
                extract_len -= 3;
                start += 3;
               }
           if  (Skip_Stop)
               extract_len -= 3;

           if  (extract_len >= Min_Len)
               Output_Subsequence (sequence, i, extract_len, 1, tag, start, end);
          }
        else
          {
           extract_len = 1 + start - end;
           if  (extract_len < 0)
               extract_len += seq_len;
           if  (extract_len < Min_Len)
               continue;

           i = start - 1;
           if  (Skip_Start)
               {
                i -= 3;
                extract_len -= 3;
                start -= 3;
               }
           if  (Skip_Stop)
               extract_len -= 3;

           if  (extract_len >= Min_Len)
               Output_Subsequence (sequence, i, extract_len, -1, tag, start, end);
          }
        
     }

   return  0;
  }



static void  Output_Subsequence
     (const string & seq, long int i, long int len, int incr,
      const char * tag, long int start, long int end)

//  Print to stdout the subsequence of  seq  beginning at subscript
//   i  with length  len .  If  incr  is positive, output the forward
//  strand; otherwise, output the reverse complement strand.  Use  tag ,
//   start  and  end  to label the output.

  {
   const int  fasta_width = 60;
   long int  seq_len;
   long int  ct = 0, line_ct = 0;

   if  (Fasta_Output_Format)
       printf (">%s  %ld %ld  len=%ld\n", tag, start, end, len);
     else
       printf ("%-10s ", tag);

   if  (incr > 0)
       incr = 1;
     else
       incr = -1;
   seq_len = seq . length ();
        
   while  (ct < len)
     {
      if  (i < 0)
          i += seq_len;
      else if  (i >= seq_len)
          i -= seq_len;

      if  (Fasta_Output_Format && line_ct == fasta_width)
          {
           putchar ('\n');
           line_ct = 0;
          }
      if  (incr > 0)
          putchar (seq [i]);
        else
          putchar (Complement (seq [i]));

      i += incr;
      ct ++;
      line_ct ++;
     }
   putchar ('\n');

   return;
  }



static void  Parse_Command_Line
    (int argc, char * argv [])

//  Get options and parameters from command line with  argc
//  arguments in  argv [0 .. (argc - 1)] .

  {
   bool  errflg = false;
   int  ch;

   optarg = NULL;

#if  ALLOW_LONG_OPTIONS
   int  option_index = 0;
   static struct option  long_options [] = {
        {"2_fields", 0, 0, 'd'},
        {"dir", 0, 0, 'd'},
        {"help", 0, 0, 'h'},
        {"minlen", 1, 0, 'l'},
        {"nostart", 0, 0, 's'},
        {"nostop", 0, 0, 't'},
        {"nowrap", 0, 0, 'w'},
        {0, 0, 0, 0}
      };

   while  (! errflg
        && ((ch = getopt_long (argc, argv, "2dhl:stw",
                     long_options, & option_index)) != EOF))
#else
   while  (! errflg
        && ((ch = getopt (argc, argv, "2dhl:stw")) != EOF))
#endif

     switch  (ch)
       {
        case  '2' :
          Fasta_Output_Format = false;
          break;

        case  'd' :
          Use_Direction = true;
          break;

        case  'h' :
          Usage ();
          exit (EXIT_SUCCESS);

        case  'l' :
          Min_Len = strtol (optarg, NULL, 10);
          break;

        case  's' :
          Skip_Start = true;
          break;

        case  't' :
          Skip_Stop = true;
          break;

        case  'w' :
          Is_Circular = false;
          break;

        default :
          errflg = true;
       }

   if  (errflg || optind > argc - 2)
       {
        Usage ();
        exit (EXIT_FAILURE);
       }

   Sequence_File_Name = argv [optind ++];
   Coord_Info = argv [optind ++];

   return;
  }



static void  Usage
    (void)

//  Print to stderr description of options and command line for
//  this program.

  {
   fprintf (stderr,
       "USAGE:  extract [options] <sequence-file> <coords>\n"
       "\n"
       "Read fasta-format <sequence-file> and extract from it the\n"
       "subsequences specified by <coords>.  By default, <coords>\n"
       "is the name of a file containing lines of the form\n"
       "  <tag>  <start>  <stop>  [<frame>] ...\n"
       "Coordinates are inclusive counting from 1, e.g., \"1 3\"\n"
       "represents the 1st 3 characters of the sequence.\n"
       "For each line the corresponding region of <sequence-file>\n"
       "is extracted and output (after reverse-complementing if necessary)\n"
       "Output goes to stdout in multi-fasta format, unless the -2 option\n"
       "is specified\n"
       "\n"
       "Options:\n"
       " -2\n"
       " --2_fields\n"
       "    Output each sequence as 2 fields (tag and sequence) on a single line\n"
       " -d\n"
       " --dir\n"
       "    Use the 4th column of each input line to specify the direction\n"
       "    of the sequence.  Positive is forward, negative is reverse\n"
       "    The input sequence is assumed to be circular\n"
       " -h\n"
       " --help\n"
       "    Print this message\n"
       " -l <n>\n"
       " --minlen <n>\n"
       "    Don't output any sequence shorter than <n> characters\n"
       " -s\n"
       " --nostart\n"
       "    Omit the first 3 characters of each output string\n"
       " -t\n"
       " --nostop\n"
       "    Omit the last 3 characters of each output string\n"
       " -w\n"
       " --nowrap\n"
       "    Use the actual input coordinates without any wraparound\n"
       "    that would be needed by a circular genome.  Without this\n"
       "    option, the output sequence is the shorter of the two ways\n"
       "    around the circle.  Use the -d option to specify direction\n"
       "    explicitly.\n"
       "\n");

   return;
  }