Codebase list cd-hit / a68a11ac-1c8a-46c5-891d-ba2259aab301/main clstr_merge.pl
a68a11ac-1c8a-46c5-891d-ba2259aab301/main

Tree @a68a11ac-1c8a-46c5-891d-ba2259aab301/main (Download .tar.gz)

clstr_merge.pl @a68a11ac-1c8a-46c5-891d-ba2259aab301/main raw · history · blame

#!/usr/bin/perl

# Merge the members of several divided cluster files into a master cluster
# file and write the merged clusters to standard output.
#
# Arguments: the master cluster file, followed by zero or more divided
# cluster files.
#
# In every input a line starting with ">" begins a new cluster and every
# other line is a member.  The member line ending in "*" names the cluster's
# representative (the text between ", >" and "...").  For each master cluster
# the master lines are printed first; then, for each divided file in turn,
# process_this() appends the non-representative members of the cluster that
# has the same representative.
#
# The order of clusters needs to be identical in all files.
my ($master_clstr, @clstr) = @ARGV;
die "usage: $0 master.clstr [divided.clstr ...]\n" unless defined $master_clstr;
my $clstr_file_no = scalar @clstr;

# Per-file parser state, indexed by divided-file number.  process_this()
# reads and updates these between calls, so they must stay file-scoped.
my @div_reps   = ();   # representative of the cluster currently buffered
my @div_seqs   = ();   # raw member lines of the cluster currently buffered
my @div_rep_no = ();   # number of member lines currently buffered
for my $i (0 .. $clstr_file_no - 1) {
  # process_this() finds each handle by its symbolic name "FH<i>", so the
  # handles are opened under exactly that name.
  my $fh = "FH" . $i;
  # Three-argument open: the file name can never be taken as an open mode.
  open($fh, '<', $clstr[$i]) || die "can not open $clstr[$i]: $!";
  $div_reps[$i]   = "";
  $div_seqs[$i]   = "";
  $div_rep_no[$i] = 0;
}

my $master_rep = "";   # representative of the master cluster being read
my $master_seq = "";   # header + member lines of that cluster
my $rep_no     = 0;    # number of member lines in that cluster
open(my $master_fh, '<', $master_clstr) || die "can not open $master_clstr: $!";
while (my $ll = <$master_fh>) {
  if ($ll =~ /^>/) {
    # A header line ends the previous cluster: emit it before starting anew.
    flush_master_cluster($master_seq, $master_rep, $rep_no);

    $master_rep = "";
    $master_seq = $ll;
    $rep_no     = 0;
  }
  else {
    $master_seq .= $ll;
    $rep_no++;
    # chomp, not chop: a final line without a newline must keep its "*".
    chomp($ll);
    if ($ll =~ /\*$/) {
      if ($ll =~ /(aa|nt), >(.+)\.\.\./) {
        $master_rep = $2;
      }
      else {
        die "format error $ll";
      }
    }
  }
}
# The last cluster has no following header line, so emit it here.
flush_master_cluster($master_seq, $master_rep, $rep_no);
close($master_fh);

for my $i (0 .. $clstr_file_no - 1) {
  my $fh = "FH" . $i;
  close($fh);
}

# flush_master_cluster($seq, $rep, $member_count)
#
# Print one master cluster ($seq: its header and member lines) and let every
# divided file append its members for representative $rep.  $member_count is
# the number of members printed so far; it is the index given to the next
# appended member and grows by whatever each process_this() call reports.
# Does nothing when the cluster has no representative.
sub flush_master_cluster {
  my ($seq, $rep, $member_count) = @_;

  # Compare against the empty string so a representative named "0" counts.
  return if $rep eq "";

  # Appended members must start on their own line even when the master
  # file's last line had no trailing newline.
  $seq .= "\n" unless $seq =~ /\n\z/;
  print $seq;

  for my $i (0 .. $clstr_file_no - 1) {
    # "|| 0" keeps the count numeric if process_this() returns a false,
    # non-numeric value.
    $member_count += process_this($i, $rep, $member_count) || 0;
  }
  return;
}

# process_this($i, $master_rep, $rep_no)
#
# Read forward in divided cluster file number $i (the handle opened under the
# symbolic name "FH$i") until a complete cluster has been read whose
# representative equals $master_rep.  That cluster's non-representative
# member lines are printed with their leading index replaced by consecutive
# numbers starting at $rep_no.  Clusters met on the way that do not match are
# discarded.
#
# State is kept between calls in the file-scoped arrays @div_reps, @div_seqs
# and @div_rep_no, because a cluster is only known to be complete once the
# next header line (or end of file) has been read.
#
# Returns the number of member lines in the matching cluster minus one (the
# representative), or 0 when the file is exhausted without a match.
# Dies with "format error ..." if a line ending in "*" does not contain a
# representative name.
sub process_this {
  my ($i, $master_rep, $rep_no) = @_;
  # Lexical copy of the handle name; the handle itself is looked up by name.
  my $fh = "FH" . $i;

  while (my $ll = <$fh>) {
    if ($ll =~ /^>/) {
      # A header line closes the cluster buffered so far.
      my $added = _emit_matching_cluster($i, $master_rep, $rep_no);
      return $added if defined $added;

      # Not the wanted cluster: drop it and start buffering the next one.
      $div_reps[$i]   = "";
      $div_seqs[$i]   = "";
      $div_rep_no[$i] = 0;
    }
    else {
      $div_seqs[$i] .= $ll;
      $div_rep_no[$i]++;
      # chomp, not chop: a final line without a newline must keep its "*".
      chomp($ll);
      if ($ll =~ /\*$/) {
        if ($ll =~ /(aa|nt), >(.+)\.\.\./) {
          $div_reps[$i] = $2;
        }
        else {
          die "format error $ll";
        }
      }
    }
  }

  # End of file: the last cluster has no following header line.  If it does
  # not match, its buffered state is left in place for later calls.
  my $added = _emit_matching_cluster($i, $master_rep, $rep_no);
  return defined $added ? $added : 0;
}

# _emit_matching_cluster($i, $master_rep, $rep_no)
#
# If the cluster buffered for divided file $i has representative $master_rep,
# print its non-representative members renumbered from $rep_no, clear the
# buffer and return the number of members minus one.  Otherwise change
# nothing and return undef.
sub _emit_matching_cluster {
  my ($i, $master_rep, $rep_no) = @_;

  return unless $div_reps[$i] eq $master_rep;

  if ($div_rep_no[$i] > 1) {
    my $j = $rep_no;
    my @lls = split(/\n/, $div_seqs[$i]);
    foreach my $k (@lls) {
      # The representative is already part of the master cluster.
      next if ($k =~ /\*$/);
      $k =~ s/^\d+/$j/;
      print $k, "\n";
      $j++;
    }
  }

  my $added = $div_rep_no[$i] - 1;
  $div_reps[$i]   = "";
  $div_seqs[$i]   = "";
  $div_rep_no[$i] = 0;

  return $added;
}