Codebase list cd-hit / 08d8190
New upstream version 4.6.8 Sascha Steinbiss 6 years ago
28 changed file(s) with 6180 addition(s) and 320 deletion(s). Raw diff Collapse all Expand all
#!/usr/bin/perl
#
# Annotation enrichment for cd-hit clusters.
#
# Reads a cd-hit .clstr file in which sequence names may carry annotation
# terms after a "||" separator (e.g. ">seq1||COG0001,COG0002..."), counts how
# often every term occurs in each cluster and in the whole data set, and
# scores each (cluster, term) pair with the right-sided Fisher exact test
# provided by Text::NSP.
#
# Usage: <this script> clstr_file anno_file store_file
#   clstr_file   input:  cd-hit .clstr file
#   anno_file    output: tab-delimited enrichment table
#   store_file   output: Storable image of the per-cluster result hash

use strict;
use warnings;
use Storable;
use Text::NSP::Measures::2D::Fisher::right;

my $clstr_file = shift;
my $anno_file  = shift;
my $store_file = shift;

die "Usage: $0 clstr_file anno_file store_file\n"
    unless defined $clstr_file && defined $anno_file && defined $store_file;

my @cls_list = ();    # cluster name of every member sequence, in file order
my @fun_list = ();    # annotation terms (array ref) of every member sequence
my %cls2rep  = ();    # cluster name => name of its representative sequence
my $cur_cls  = "";

open(my $clstr_fh, '<', $clstr_file)
    or die "Cannot open cluster file '$clstr_file': $!\n";
while (my $ll = <$clstr_fh>) {    # read .clstr file
    # Strip the line terminator only; chop() would also eat the "*"
    # representative marker on a final line that has no newline.
    $ll =~ s/\r?\n\z//;

    if ($ll =~ /^>(.*?)\s*$/) {    # the begin of a cluster
        $cur_cls = $1;
        next;
    }
    next unless $ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\./;

    # Annotation terms, if any, are the comma-separated list after the
    # last "||" in the sequence name.
    my @name_parts = split(/\|\|/, $4);
    my @cur_anno   = @name_parts > 1 ? split(/,/, $name_parts[-1]) : ();

    push(@cls_list, $cur_cls);
    push(@fun_list, [@cur_anno]);

    # The representative member of a cluster is flagged by a trailing "*".
    if ($ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\.(.*)\*$/) {
        $cls2rep{$cur_cls} = $4;
    }
}
close($clstr_fh);

# Key every member by the representative of its cluster.  A cluster without
# a flagged representative keeps its own name instead of collapsing onto an
# undefined key shared with every other such cluster.
@cls_list = map { defined $cls2rep{$_} ? $cls2rep{$_} : $_ } @cls_list;

my %cls_size  = ();    # cluster => number of member sequences
my %cls_anno  = ();    # cluster => { term => occurrences inside the cluster }
my %anno_size = ();    # term    => occurrences in the whole data set
my $M         = scalar @fun_list;    # total number of member sequences

for my $i (0 .. $#fun_list) {
    my $cls = $cls_list[$i];
    $cls_size{$cls}++;
    for my $anno (@{ $fun_list[$i] }) {
        $anno_size{$anno}++;
        $cls_anno{$cls}{$anno}++;
    }
}

# cluster => [ [anno_term, anno_size, clsper, anno_total, backper,
#               enrichment, pvalue], ... ] ordered by ascending p-value
my %resu = ();
for my $cls (keys %cls_size) {
    my $cls_s  = $cls_size{$cls};
    my $counts = $cls_anno{$cls} || {};
    my @rows   = ();

    for my $anno (keys %{$counts}) {
        my $anno_s = $counts->{$anno};
        my $pvalue = calculateStatistic(
            n11 => $anno_s,
            n1p => $cls_s,
            np1 => $anno_size{$anno},
            npp => $M,
        );
        push @rows,
            [
            $anno,
            $anno_s,
            $anno_s / $cls_s,
            $anno_size{$anno},
            $anno_size{$anno} / $M,
            $anno_s * $M / ($cls_s * $anno_size{$anno}),
            $pvalue,
            ];
    }

    # An undefined p-value compares as 0, exactly as it did numerically
    # before; ties are broken by term name so the order is reproducible.
    @rows = sort {
        (defined $a->[6] ? $a->[6] : 0) <=> (defined $b->[6] ? $b->[6] : 0)
            or $a->[0] cmp $b->[0]
    } @rows;
    $resu{$cls} = [@rows];
}

store(\%resu, $store_file)
    or die "Cannot store results to '$store_file'\n";

open(my $anno_fh, '>', $anno_file)
    or die "Cannot write annotation file '$anno_file': $!\n";
print {$anno_fh} "ClsName\tClsSize\tAnno_term\tAnno_size\tClsPer\tAnno_total\tSeq_total\tBackPer\tEnrichment\tPvalue\n";
for my $cls (sort keys %resu) {
    for my $row (@{ $resu{$cls} }) {
        my ($term, $in_cls, $cls_per, $in_all, $back_per, $enrichment, $pvalue) = @{$row};
        print {$anno_fh} join("\t",
            $cls, $cls_size{$cls}, $term, $in_cls, $cls_per, $in_all,
            $M, $back_per, $enrichment, defined $pvalue ? $pvalue : ''), "\n";
    }
}
# Buffered write errors only surface at close.
close($anno_fh) or die "Cannot close annotation file '$anno_file': $!\n";
107
108
1313 please download legacy BLAST (not BLAST+) and install the executables in your $PATH
1414
1515
16 For more information, please visit http://cd-hit.org or please read the cdhit-users-guide.pdf.
17 Most up-to-date documents are available at http://weizhongli-lab.org/cd-hit/wiki/doku.php?id=cd-hit_user_guide.
16 For more information, please visit http://cd-hit.org
1817
19 cd-hit was originally hosted at Google Code, some of the old releases are still available from https://code.google.com/p/cdhit/.
18 Most up-to-date documents are available at https://github.com/weizhongli/cdhit/wiki
2019
2120 cd-hit is also available as web server, visit http://cd-hit.org for web server address.
211211 {
212212 int intval = atoi( value );
213213 if (strcmp(flag, "-i" ) == 0) input = value;
214 else if (strcmp(flag, "-j" ) == 0) input_pe = value;
214215 else if (strcmp(flag, "-o" ) == 0) output = value;
216 else if (strcmp(flag, "-op") == 0) output_pe = value;
215217 else if (strcmp(flag, "-M" ) == 0) max_memory = atoll(value) * 1000000;
216218 else if (strcmp(flag, "-l" ) == 0) min_length = intval;
217219 else if (strcmp(flag, "-c" ) == 0) cluster_thd = atof(value), useIdentity = true;
222224 else if (strcmp(flag, "-s" ) == 0) diff_cutoff = atof(value);
223225 else if (strcmp(flag, "-S" ) == 0) diff_cutoff_aa = intval;
224226 else if (strcmp(flag, "-B" ) == 0) store_disk = intval;
227 else if (strcmp(flag, "-P" ) == 0) PE_mode = intval;
228 else if (strcmp(flag, "-cx") == 0) trim_len = intval;
229 else if (strcmp(flag, "-cy") == 0) trim_len_R2 = intval;
230 else if (strcmp(flag, "-ap") == 0) align_pos = intval;
231 else if (strcmp(flag, "-sc") == 0) sort_output = intval;
232 else if (strcmp(flag, "-sf") == 0) sort_outputf = intval;
225233 else if (strcmp(flag, "-p" ) == 0) print = intval;
226234 else if (strcmp(flag, "-g" ) == 0) cluster_best = intval;
227235 else if (strcmp(flag, "-G" ) == 0) global_identity = intval;
279287 {
280288 if( SetOptionCommon( flag, value ) ) return true;
281289 if (strcmp(flag, "-i2" ) == 0) input2 = value;
290 else if (strcmp(flag, "-j2" ) == 0) input2_pe = value;
282291 else if (strcmp(flag, "-s2") == 0) diff_cutoff2 = atof(value);
283292 else if (strcmp(flag, "-S2") == 0) diff_cutoff_aa2 = atoi(value);
284293 else return false;
350359 if ((cluster_thd > 1.0) || (cluster_thd < 0.4)) bomb_error("invalid clstr");
351360 }
352361
362 if (input.size() == 0) bomb_error("no input file");
363 if (output.size() == 0) bomb_error("no output file");
364 if (PE_mode) {
365 if (input_pe.size() == 0) bomb_error("no input file for R2 sequences in PE mode");
366 if (output_pe.size() == 0) bomb_error("no output file for R2 sequences in PE mode");
367 }
368 if (isEST && (align_pos==1)) option_r = 0;
369
353370 if (band_width < 1 ) bomb_error("invalid band width");
354371 if (NAA < 2 || NAA > NAA_top_limit) bomb_error("invalid word length");
355372 if (des_len < 0 ) bomb_error("too short description, not enough to identify sequences");
359376 if( has2D ){
360377 if ((diff_cutoff2<0) || (diff_cutoff2>1)) bomb_error("invalid value for -s2");
361378 if (diff_cutoff_aa2<0) bomb_error("invalid value for -S2");
379 if (PE_mode) {
380 if (input2_pe.size() == 0) bomb_error("no input file for R2 sequences for 2nd db in PE mode");
381 }
362382 }
363383 if (global_identity == 0) print = 1;
364384 if (short_coverage < long_coverage) short_coverage = long_coverage;
467487 seq[j] = 0;
468488 } // END void format_seq
469489
// Reverse the NUL-terminated string p in place.
// A NULL pointer or an empty string is left untouched: stepping back from
// the terminator in those cases would form (or dereference) a pointer
// outside the buffer.
void strrev(char *p)
{
	if (!p || !*p) return;

	char *q = p;
	while (*q) ++q;                 // q -> terminating NUL
	for (--q; p < q; ++p, --q) {    // swap from both ends towards the middle
		char c = *p;
		*p = *q;
		*q = c;
	}
}
470499
471500 ////For simple len1 <= len2, len2 is for existing representative
472501 ////walk along all diag path of two sequences,
14571486 distance = 2.0;
14581487 if( other.data ){
14591488 size = bufsize = other.size;
1489 size_R2 = 0;
14601490 data = new char[size+1];
14611491 //printf( "data: %p %p\n", data, other.data );
14621492 data[size] = 0;
14701500 identifier[len] = 0;
14711501 }
14721502 }
1503
1504 // back to back merge for PE
1505 // R1 -> XXXXXXABC ------------------- NMLYYYYYY <--R2
1506 // >R1 >R2
1507 // XXXXXXABC YYYYYYLMN =====> Merge into
1508 // >R12
1509 // NMLYYYYYYXXXXXXABC
1510 Sequence::Sequence( const Sequence & other, const Sequence & other2, int mode )
1511 {
1512 int i;
1513 if (mode != 1) bomb_error("unknown mode");
1514
1515 //printf( "new: %p %p\n", this, & other );
1516 memcpy( this, & other, sizeof( Sequence ) );
1517 distance = 2.0;
1518
1519 if( other.data && other2.data ){
1520 size = bufsize = (other.size + other2.size);
1521 size_R2 = other2.size;
1522 data = new char[size+1];
1523 //printf( "data: %p %p\n", data, other.data );
1524 data[size] = 0;
1525 data[size_R2] = 0;
1526 memcpy( data, other2.data, size_R2); // copy R2 first
1527 strrev( data ); // reverse R2 on data
1528 memcpy( data+size_R2, other.data, size-size_R2 ); // copy R1 to end of R2
1529 //for (i=0; i<size; i++) data[i] = other.data[i];
1530 des_begin2 = other2.des_begin;
1531 tot_length2= other2.tot_length;
1532 }
1533 else if ( other.data || other2.data ) {
1534 bomb_error("Not both PE sequences have data");
1535 }
1536
1537 if( other.identifier ){ // only use R1
1538 int len = strlen( other.identifier );
1539 identifier = new char[len+1];
1540 memcpy( identifier, other.identifier, len );
1541 identifier[len] = 0;
1542 }
1543 }
1544
1545
14731546 Sequence::~Sequence()
14741547 {
14751548 //printf( "delete: %p\n", this );
15281601 }
15291602 }
15301603 if( size ) data[size] = 0;
1604 }
1605 void Sequence::trim(int trim_len) {
1606 if (trim_len >= size) return;
1607 size = trim_len;
1608 if (size) data[size]=0;
15311609 }
15321610 void Sequence::ConvertBases()
15331611 {
15991677 }
16001678 }
16011679
1680 // by liwz
1681 // disable swap option
1682 // change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
1683 // where des_begin is the FILE pointer of sequence record start
1684 // tot_length is the total bytes of sequence record
16021685 void SequenceDB::Read( const char *file, const Options & options )
16031686 {
1604 Sequence one;
1605 Sequence dummy;
1606 Sequence des;
1607 Sequence *last = NULL;
1608 FILE *swap = NULL;
1609 FILE *fin = fopen( file, "rb" );
1610 char *buffer = NULL;
1611 char *res = NULL;
1612 size_t swap_size = 0;
1613 int option_l = options.min_length;
1614 if( fin == NULL ) bomb_error( "Failed to open the database file" );
1615 if( options.store_disk ) swap = OpenTempFile( temp_dir );
1616 Clear();
1617 dummy.swap = swap;
1618 buffer = new char[ MAX_LINE_SIZE+1 ];
1619
1620 while (not feof( fin ) || one.size) { /* do not break when the last sequence is not handled */
1621 buffer[0] = '>';
1622 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL && one.size == 0) break;
1623 if( buffer[0] == '+' ){
1624 int len = strlen( buffer );
1625 int len2 = len;
1626 while( len2 && buffer[len2-1] != '\n' ){
1627 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1628 len2 = strlen( buffer );
1629 len += len2;
1630 }
1631 one.des_length2 = len;
1632 dummy.des_length2 = len;
1633 fseek( fin, one.size, SEEK_CUR );
1634 }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1635 if ( one.size ) { // write previous record
1636 one.dat_length = dummy.dat_length = one.size;
1637 if( one.identifier == NULL || one.Format() ){
1638 printf( "Warning: from file \"%s\",\n", file );
1639 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1640 if( one.identifier ) printf( "%s\n", one.identifier );
1641 printf( "%s\n", one.data );
1642 one.size = 0;
1643 }
1644 one.index = dummy.index = sequences.size();
1645 if( one.size > option_l ) {
1646 if ( swap ) {
1647 swap_size += one.size;
1648 // so that size of file < MAX_BIN_SWAP about 2GB
1649 if ( swap_size >= MAX_BIN_SWAP) {
1650 dummy.swap = swap = OpenTempFile( temp_dir );
1651 swap_size = one.size;
1652 }
1653 dummy.size = one.size;
1654 dummy.offset = ftell( swap );
1655 dummy.des_length = one.des_length;
1656 sequences.Append( new Sequence( dummy ) );
1657 one.ConvertBases();
1658 fwrite( one.data, 1, one.size, swap );
1659 }else{
1660 //printf( "==================\n" );
1661 sequences.Append( new Sequence( one ) );
1662 //printf( "------------------\n" );
1663 //if( sequences.size() > 10 ) break;
1664 }
1665 //if( sequences.size() >= 10000 ) break;
1666 }
1667 }
1668 one.size = 0;
1669 one.des_length2 = 0;
1670
1671 int len = strlen( buffer );
1672 int len2 = len;
1673 des.size = 0;
1674 des += buffer;
1675 while( len2 && buffer[len2-1] != '\n' ){
1676 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1677 des += buffer;
1678 len2 = strlen( buffer );
1679 len += len2;
1680 }
1681 size_t offset = ftell( fin );
1682 one.des_begin = dummy.des_begin = offset - len;
1683 one.des_length = dummy.des_length = len;
1684
1685 int i = 0;
1686 if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1687 if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1688 if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1689 while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
1690 des.data[i] = 0;
1691 one.identifier = dummy.identifier = des.data;
1692 } else {
1693 one += buffer;
1694 }
1695 }
1687 Sequence one;
1688 Sequence des;
1689 FILE *fin = fopen( file, "rb" );
1690 char *buffer = NULL;
1691 char *res = NULL;
1692 int option_l = options.min_length;
1693 if( fin == NULL ) bomb_error( "Failed to open the database file" );
1694 Clear();
1695 buffer = new char[ MAX_LINE_SIZE+1 ];
1696
1697 while (not feof( fin ) || one.size) { /* do not break when the last sequence is not handled */
1698 buffer[0] = '>';
1699 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL && one.size == 0) break;
1700 if( buffer[0] == '+' ){
1701 int len = strlen( buffer );
1702 int len2 = len;
1703 while( len2 && buffer[len2-1] != '\n' ){
1704 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1705 len2 = strlen( buffer );
1706 len += len2;
1707 }
1708 one.tot_length += len;
1709
1710 // read next line quality score
1711 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
1712 len = strlen( buffer );
1713 len2 = len;
1714 while( len2 && buffer[len2-1] != '\n' ){
1715 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1716 len2 = strlen( buffer );
1717 len += len2;
1718 }
1719 one.tot_length += len;
1720 }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1721 if ( one.size ) { // write previous record
1722 if( one.identifier == NULL || one.Format() ){
1723 printf( "Warning: from file \"%s\",\n", file );
1724 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1725 if( one.identifier ) printf( "%s\n", one.identifier );
1726 printf( "%s\n", one.data );
1727 one.size = 0;
1728 }
1729 one.index = sequences.size();
1730 if( one.size > option_l ) {
1731 if (options.trim_len > 0) one.trim(options.trim_len);
1732 sequences.Append( new Sequence( one ) );
1733 }
1734 }
1735 one.size = 0;
1736 one.tot_length = 0;
1737
1738 int len = strlen( buffer );
1739 int len2 = len;
1740 des.size = 0;
1741 des += buffer;
1742 while( len2 && buffer[len2-1] != '\n' ){
1743 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1744 des += buffer;
1745 len2 = strlen( buffer );
1746 len += len2;
1747 }
1748 size_t offset = ftell( fin );
1749 one.des_begin = offset - len;
1750 one.tot_length += len; // count first line
1751
1752 int i = 0;
1753 if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1754 if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1755 if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1756 while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
1757 des.data[i] = 0;
1758 one.identifier = des.data;
1759 } else {
1760 one.tot_length += strlen(buffer); one += buffer;
1761 }
1762 }
16961763 #if 0
1697 int i, n = 0;
1698 for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
1699 cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
1700 int i;
1701 scanf( "%i", & i );
1764 int i, n = 0;
1765 for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
1766 cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
1767 int i;
1768 scanf( "%i", & i );
17021769 #endif
1703 one.identifier = dummy.identifier = NULL;
1704 delete[] buffer;
1705 fclose( fin );
1770 one.identifier = NULL;
1771 delete[] buffer;
1772 fclose( fin );
1773 }
1774
1775 // PE reads liwz, disable swap option
1776 void SequenceDB::Read( const char *file, const char *file2, const Options & options )
1777 {
1778 Sequence one, two;
1779 Sequence des;
1780 FILE *fin = fopen( file, "rb" );
1781 FILE *fin2= fopen( file2,"rb" );
1782 char *buffer = NULL;
1783 char *buffer2= NULL;
1784 char *res = NULL;
1785 char *res2= NULL;
1786 int option_l = options.min_length;
1787 if( fin == NULL ) bomb_error( "Failed to open the database file" );
1788 if( fin2== NULL ) bomb_error( "Failed to open the database file" );
1789 Clear();
1790 buffer = new char[ MAX_LINE_SIZE+1 ];
1791 buffer2= new char[ MAX_LINE_SIZE+1 ];
1792
1793 while (((not feof( fin )) && (not feof( fin2)) ) || (one.size && two.size)) { /* do not break when the last sequence is not handled */
1794 buffer[0] = '>'; res =fgets( buffer, MAX_LINE_SIZE, fin );
1795 buffer2[0]= '>'; res2=fgets( buffer2, MAX_LINE_SIZE, fin2 );
1796
1797 if ( (res == NULL) && (res2 != NULL)) bomb_error( "Paired input files have different number sequences" );
1798 if ( (res != NULL) && (res2 == NULL)) bomb_error( "Paired input files have different number sequences" );
1799 if ( (one.size == 0 ) && (two.size > 0)) bomb_error( "Paired input files have different number sequences" );
1800 if ( (one.size > 0 ) && (two.size == 0)) bomb_error( "Paired input files have different number sequences" );
1801 if ( (res == NULL) && (one.size == 0)) break;
1802
1803 if( buffer[0] == '+' ){ // fastq 3rd line
1804 // file 1
1805 int len = strlen( buffer );
1806 int len2 = len;
1807 while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
1808 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1809 len2 = strlen( buffer );
1810 len += len2;
1811 }
1812 one.tot_length += len;
1813
1814 // read next line quality score
1815 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
1816 len = strlen( buffer );
1817 len2 = len;
1818 while( len2 && buffer[len2-1] != '\n' ){
1819 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1820 len2 = strlen( buffer );
1821 len += len2;
1822 }
1823 one.tot_length += len;
1824
1825 // file 2
1826 len = strlen( buffer2 );
1827 len2 = len;
1828 while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
1829 if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
1830 len2 = strlen( buffer2 );
1831 len += len2;
1832 }
1833 two.tot_length += len;
1834
1835 // read next line quality score
1836 if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) bomb_error("can not read quality score after");
1837 len = strlen( buffer2 );
1838 len2 = len;
1839 while( len2 && buffer2[len2-1] != '\n' ){
1840 if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
1841 len2 = strlen( buffer2 );
1842 len += len2;
1843 }
1844 two.tot_length += len;
1845
1846 }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1847 if ( one.size && two.size ) { // write previous record
1848 if( one.identifier == NULL || one.Format() ){
1849 printf( "Warning: from file \"%s\",\n", file );
1850 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1851 if( one.identifier ) printf( "%s\n", one.identifier );
1852 printf( "%s\n", one.data );
1853 one.size=0; two.size=0;
1854 }
1855 if( two.identifier == NULL || two.Format() ){
1856 printf( "Warning: from file \"%s\",\n", file2 );
1857 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1858 if( two.identifier ) printf( "%s\n", two.identifier );
1859 printf( "%s\n", two.data );
1860 one.size=0; two.size = 0;
1861 }
1862 one.index = sequences.size();
1863 if( (one.size + two.size)> option_l ) {
1864 if (options.trim_len > 0) one.trim(options.trim_len);
1865 if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
1866 sequences.Append( new Sequence( one, two, 1 ) );
1867 }
1868 }
1869 // R1
1870 one.size = 0;
1871 one.tot_length = 0;
1872
1873 int len = strlen( buffer );
1874 int len2 = len;
1875 des.size = 0;
1876 des += buffer;
1877 while( len2 && buffer[len2-1] != '\n' ){
1878 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1879 des += buffer;
1880 len2 = strlen( buffer );
1881 len += len2;
1882 }
1883 size_t offset = ftell( fin );
1884 one.des_begin = offset - len; // offset of ">" or "@"
1885 one.tot_length += len; // count first line
1886
1887 int i = 0;
1888 if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1889 if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1890 if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1891 while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
1892 des.data[i] = 0; // find first non-space letter
1893 one.identifier = des.data;
1894
1895 // R2
1896 two.size = 0;
1897 two.tot_length = 0;
1898
1899 len = strlen( buffer2 );
1900 len2 = len;
1901 while( len2 && buffer2[len2-1] != '\n' ){
1902 if ( (res=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
1903 len2 = strlen( buffer2 );
1904 len += len2;
1905 }
1906 offset = ftell( fin2 );
1907 two.des_begin = offset - len;
1908 two.tot_length += len; // count first line
1909 two.identifier = des.data;
1910 } else {
1911 one.tot_length += strlen(buffer); one += buffer;
1912 two.tot_length+= strlen(buffer2); two+= buffer2;
1913 }
1914 }
1915 #if 0
1916 int i, n = 0;
1917 for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
1918 cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
1919 int i;
1920 scanf( "%i", & i );
1921 #endif
1922 one.identifier = NULL;
1923 two.identifier = NULL;
1924 delete[] buffer;
1925 fclose( fin );
1926 delete[] buffer2;
1927 fclose( fin2 );
17061928 }
17071929
17081930 #if 0
18262048 n = sequences.size();
18272049 for (i=0; i<n; i++){
18282050 Sequence *seq = sequences[i];
1829 int qs = seq->des_length2 ? seq->des_length2 + seq->dat_length : 0;
18302051 fseek( fin, seq->des_begin, SEEK_SET );
18312052
18322053 seg_size += seq->size;
18382059 seg_size = seq->size;
18392060 }
18402061
1841 count = (seq->des_length + seq->dat_length + qs) / MAX_LINE_SIZE;
1842 rest = (seq->des_length + seq->dat_length + qs) % MAX_LINE_SIZE;
2062 count = seq->tot_length / MAX_LINE_SIZE;
2063 rest = seq->tot_length % MAX_LINE_SIZE;
18432064 //printf( "count = %6i, rest = %6i\n", count, rest );
18442065 for (j=0; j<count; j++){
18452066 if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
18672088 std::sort( sorting.begin(), sorting.end() );
18682089 for (i=0; i<n; i++){
18692090 Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
1870 int qs = seq->des_length2 ? seq->des_length2 + seq->dat_length : 0;
18712091 fseek( fin, seq->des_begin, SEEK_SET );
18722092
1873 count = (seq->des_length + seq->dat_length + qs) / MAX_LINE_SIZE;
1874 rest = (seq->des_length + seq->dat_length + qs) % MAX_LINE_SIZE;
2093 count = seq->tot_length / MAX_LINE_SIZE;
2094 rest = seq->tot_length % MAX_LINE_SIZE;
18752095 //printf( "count = %6i, rest = %6i\n", count, rest );
18762096 for (j=0; j<count; j++){
18772097 if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
18862106 fclose( fout );
18872107 delete []buf;
18882108 }
2109 // liwz PE output
2110 void SequenceDB::WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
2111 {
2112 FILE *fin = fopen( db, "rb" );
2113 FILE *fout = fopen( newdb, "w+" );
2114 FILE *fin_pe = fopen( db_pe, "rb" );
2115 FILE *fout_pe = fopen( newdb_pe, "w+" );
2116 int i, j, n = rep_seqs.size();
2117 int count, rest;
2118 char *buf = new char[MAX_LINE_SIZE+1];
2119 vector<uint64_t> sorting( n );
2120 if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2121 if( fin_pe == NULL || fout_pe == NULL ) bomb_error( "file opening failed" );
2122 for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2123 std::sort( sorting.begin(), sorting.end() );
2124
2125 //sort fasta / fastq
2126 int *clstr_size;
2127 int *clstr_idx1;
2128 if (options.sort_outputf) {
2129 clstr_size = new int[n];
2130 clstr_idx1 = new int[n];
2131 for (i=0; i<n; i++) {
2132 clstr_size[i] = 0;
2133 clstr_idx1[i] = i;
2134 }
2135
2136 int N = sequences.size();
2137 for (i=0; i<N; i++) {
2138 int id = sequences[i]->cluster_id;
2139 if (id < 0) continue;
2140 if (id >=n) continue;
2141 clstr_size[id]++;
2142 }
2143 quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
2144 }
2145
2146 for (i=0; i<n; i++){
2147 Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2148 if (options.sort_outputf) seq = sequences[ rep_seqs[ clstr_idx1[i] ] ];
2149 //R1
2150 fseek( fin, seq->des_begin, SEEK_SET );
2151
2152 count = seq->tot_length / MAX_LINE_SIZE;
2153 rest = seq->tot_length % MAX_LINE_SIZE;
2154 //printf( "count = %6i, rest = %6i\n", count, rest );
2155 for (j=0; j<count; j++){
2156 if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2157 fwrite( buf, 1, MAX_LINE_SIZE, fout );
2158 }
2159 if( rest ){
2160 if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2161 fwrite( buf, 1, rest, fout );
2162 }
2163
2164 //R2
2165 fseek( fin_pe, seq->des_begin2, SEEK_SET );
2166
2167 count = seq->tot_length2 / MAX_LINE_SIZE;
2168 rest = seq->tot_length2 % MAX_LINE_SIZE;
2169 //printf( "count = %6i, rest = %6i\n", count, rest );
2170 for (j=0; j<count; j++){
2171 if( fread( buf, 1, MAX_LINE_SIZE, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
2172 fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
2173 }
2174 if( rest ){
2175 if( fread( buf, 1, rest, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
2176 fwrite( buf, 1, rest, fout_pe );
2177 }
2178
2179 }
2180 fclose( fin );
2181 fclose( fout );
2182 fclose( fin_pe );
2183 fclose( fout_pe );
2184 delete []buf;
2185 }
2186
18892187 void SequenceDB::WriteExtra1D( const Options & options )
18902188 {
18912189 string db_clstr = options.output + ".clstr";
18922190 string db_clstr_bak = options.output + ".bak.clstr";
1893 int i, k, N = sequences.size();
2191 int i, i0, k, N = sequences.size();
18942192 vector<long long> sorting( N );
18952193 for (i=0; i<N; i++) sorting[i] = ((long long)sequences[i]->index << 32) | i;
18962194 std::sort( sorting.begin(), sorting.end() );
19172215 }
19182216
19192217 fout = fopen( db_clstr.c_str(), "w+" );
1920 for (i=0; i<M; i++) {
2218
2219 if (options.sort_output) {
2220 int *clstr_size = new int[M];
2221 int *clstr_idx1 = new int[M];
2222
2223 for (i=0; i<M; i++) {
2224 clstr_size[i] = (int)clusters[i].size();
2225 clstr_idx1[i] = i;
2226 }
2227 quick_sort_idxr(clstr_size, clstr_idx1, 0, M-1);
2228
2229 for (i=0; i<M; i++) {
2230 i0 = clstr_idx1[i];
2231 fprintf( fout, ">Cluster %i\n", i );
2232 for (k=0; k<(int)clusters[i0].size(); k++)
2233 sequences[ clusters[i0][k] ]->PrintInfo( k, fout, options, buf );
2234 }
2235 }
2236 else {
2237 for (i=0; i<M; i++) {
19212238 fprintf( fout, ">Cluster %i\n", i );
19222239 for (k=0; k<(int)clusters[i].size(); k++)
19232240 sequences[ clusters[i][k] ]->PrintInfo( k, fout, options, buf );
1924 }
2241 }
2242
2243 }
2244
19252245 delete []buf;
19262246 }
19272247 void SequenceDB::WriteExtra2D( SequenceDB & other, const Options & options )
32833603 }
32843604 } // make_comp_short_word_index
32853605
3286
// Sort a[lo0..hi0] into increasing order and apply every swap to idx as
// well, so that idx ends up rearranged exactly like a.
// Call as quick_sort_idx(a, idx, 0, no-1). Always returns 0.
int quick_sort_idx (int *a, int *idx, int lo0, int hi0 ) {
	// Zero or one element: nothing to sort.
	if ( hi0 <= lo0 ) return 0;

	const int pivot = a[ ( lo0 + hi0 ) / 2 ];
	int left  = lo0;
	int right = hi0;

	do {
		// Skip elements that are already on the correct side of the pivot.
		for ( ; ( left  < hi0 ) && ( a[left]  < pivot ); ++left  ) {}
		for ( ; ( right > lo0 ) && ( a[right] > pivot ); --right ) {}
		if ( left > right ) break;

		const int held_value = a[left];
		a[left]  = a[right];
		a[right] = held_value;

		const int held_index = idx[left];
		idx[left]  = idx[right];
		idx[right] = held_index;

		++left;
		--right;
	} while ( left <= right );

	// Recurse into both partitions.
	if ( lo0 < right ) quick_sort_idx(a, idx, lo0, right );
	if ( left < hi0 )  quick_sort_idx(a, idx, left, hi0 );
	return 0;
} // quick_sort_idx
3633
3634
// Decreasing-order counterpart of quick_sort_idx; reversing the result of
// the increasing sort is not equivalent because of how ties are ordered.
// Sorts a[lo0..hi0] into decreasing order and applies every swap to idx as
// well, so that idx ends up rearranged exactly like a.
// Call as quick_sort_idxr(a, idx, 0, no-1). Always returns 0.
int quick_sort_idxr (int *a, int *idx, int lo0, int hi0 ) {
	// Zero or one element: nothing to sort.
	if ( hi0 <= lo0 ) return 0;

	const int pivot = a[ ( lo0 + hi0 ) / 2 ];
	int left  = lo0;
	int right = hi0;

	do {
		// Skip elements that are already on the correct side of the pivot.
		for ( ; ( left  < hi0 ) && ( a[left]  > pivot ); ++left  ) {}
		for ( ; ( right > lo0 ) && ( a[right] < pivot ); --right ) {}
		if ( left > right ) break;

		const int held_value = a[left];
		a[left]  = a[right];
		a[right] = held_value;

		const int held_index = idx[left];
		idx[left]  = idx[right];
		idx[right] = held_index;

		++left;
		--right;
	} while ( left <= right );

	// Recurse into both partitions.
	if ( lo0 < right ) quick_sort_idxr(a, idx, lo0, right );
	if ( left < hi0 )  quick_sort_idxr(a, idx, left, hi0 );
	return 0;
} // quick_sort_idxr
32873663
32883664 /////////////////////////// END ALL ////////////////////////
32893665
3838 #include<vector>
3939 #include<map>
4040
41 #define CDHIT_VERSION "4.6"
41 #define CDHIT_VERSION "4.7"
4242
4343 #ifndef MAX_SEQ
4444 #define MAX_SEQ 655360
279279 int frag_size;
280280 int option_r;
281281 int threads;
282 int PE_mode; // -P
283 int trim_len; // -cx
284 int trim_len_R2; // -cy
285 int align_pos; // -ap for alignment position
282286
283287 size_t max_entries;
284288 size_t max_sequences;
292296 bool backupFile;
293297
294298 string input;
299 string input_pe;
295300 string input2;
301 string input2_pe;
296302 string output;
303 string output_pe;
304
305 int sort_output; // -sc
306 int sort_outputf; // -sf
297307
298308 Options(){
299309 backupFile = false;
331341 frag_size = 0;
332342 des_len = 20;
333343 threads = 1;
344 PE_mode = 0;
345 trim_len = 0;
346 trim_len_R2 = 0;
347 align_pos = 0;
348 sort_output = 0;
349 sort_outputf = 0;
334350 max_entries = 0;
335351 max_sequences = 1<<20;
336352 mem_limit = 100000000;
357373 // length of the sequence:
358374 int size;
359375 int bufsize;
376 int size_R2; // size = size.R1 + size.R2 for back-to-back merged seq
360377
361378 //uint32_t stats;
362379
368385 int offset;
369386
370387 // stream offset of the description string in the database:
371 size_t des_begin;
372 // length of the description:
373 int des_length;
374 // length of the description in quality score part:
375 int des_length2;
376 // length of data in fasta file, including line wrapping:
377 int dat_length;
388 size_t des_begin, des_begin2;
389 // total record length
390 int tot_length, tot_length2;
378391
379392 char *identifier;
380393
388401
389402 Sequence();
390403 Sequence( const Sequence & other );
404 Sequence( const Sequence & other, const Sequence & other2, int mode );
391405 ~Sequence();
392406
393407 void Clear();
402416 int Format();
403417
404418 void ConvertBases();
419 void trim(int trim_len);
405420
406421 void SwapIn();
407422 void SwapOut();
543558 ~SequenceDB(){ Clear(); }
544559
545560 void Read( const char *file, const Options & options );
561 void Read( const char *file, const char *file2, const Options & options );
546562 void WriteClusters( const char *db, const char *newdb, const Options & options );
563 void WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
547564 void WriteExtra1D( const Options & options );
548565 void WriteExtra2D( SequenceDB & other, const Options & options );
549566 void DivideSave( const char *db, const char *newdb, int n, const Options & options );
589606 int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
590607 int band_left, int band_center, int band_right, WorkingBuffer & buffer);
591608
609 void strrev(char *p);
592610 int print_usage_2d (char *arg);
593611 int print_usage_est (char *arg);
594612 int print_usage_div (char *arg);
605623 int calc_ann_list(int len, char *seqi, int NAA, int& aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, bool est=false);
606624
607625 float current_time();
626
627 //some functions from very old cd-hit
628 int quick_sort_idx(int *a, int *idx, int lo0, int hi0 );
629 int quick_sort_idxr(int *a, int *idx, int lo0, int hi0 );
4848 string db_in;
4949 string db_in2;
5050 string db_out;
51 string db_in_pe;
52 string db_in2_pe;
53 string db_out_pe;
54
5155
5256 options.cluster_thd = 0.95;
5357 options.NAA = 10;
6670 options.Validate();
6771
6872 db_in = options.input;
73 db_in_pe = options.input_pe;
6974 db_in2 = options.input2;
75 db_in2_pe = options.input2_pe;
7076 db_out = options.output;
77 db_out_pe = options.output_pe;
78
7179
7280 InitNAA( MAX_UAA );
7381 options.NAAN = NAAN_array[options.NAA];
7987 make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
8088 }
8189
82 seq_db.Read( db_in.c_str(), options );
90 if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
91 else {seq_db.Read( db_in.c_str(), options );}
8392 cout << "total seq in db1: " << seq_db.sequences.size() << endl;
8493
85 seq_db2.Read( db_in2.c_str(), options );
94 if ( options.PE_mode ) { seq_db2.Read( db_in2.c_str(), db_in2_pe.c_str(), options );}
95 else { seq_db2.Read( db_in2.c_str(), options );}
8696 cout << "total seq in db2: " << seq_db2.sequences.size() << endl;
8797
8898 seq_db.SortDivide( options );
92102 cout << "writing non-redundant sequences from db2" << endl;
93103 seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options );
94104
105 if ( options.PE_mode ) { seq_db2.WriteClusters( db_in2.c_str(), db_in2_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
106 else { seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options ); }
107
95108 seq_db2.WriteExtra2D( seq_db, options );
96109 cout << "program completed !" << endl << endl;
97110 end_time = current_time();
4242 {
4343 string db_in;
4444 string db_out;
45 string db_in_pe;
46 string db_out_pe;
4547
4648 options.cluster_thd = 0.95;
4749 options.NAA = 10;
5961 if (options.SetOptions( argc, argv, false, true ) == 0) print_usage_est(argv[0]);
6062 options.Validate();
6163
62 db_in = options.input;
63 db_out = options.output;
64 db_in = options.input;
65 db_in_pe = options.input_pe;
66 db_out = options.output;
67 db_out_pe = options.output_pe;
6468
6569 InitNAA( MAX_UAA );
6670 seq_db.NAAN = NAAN_array[options.NAA];
7074 make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
7175 }
7276
73 seq_db.Read( db_in.c_str(), options );
77 if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
78 else {seq_db.Read( db_in.c_str(), options );}
79
7480 cout << "total seq: " << seq_db.sequences.size() << endl;
7581 seq_db.SortDivide( options );
7682 seq_db.DoClustering( options );
7783
7884 printf( "writing new database\n" );
79 seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options );
85 if ( options.PE_mode ) { seq_db.WriteClusters( db_in.c_str(), db_in_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
86 else { seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options ); }
8087
8188 // write a backup clstr file in case next step crashes
8289 seq_db.WriteExtra1D( options );
66
77 // information
88 char cd_hit_ver[] = "\t\t====== CD-HIT version " CDHIT_VERSION " (built on " __DATE__ ") ======";
9 char cd_hit_ref1[] = "\"Clustering of highly homologous sequences to reduce thesize of large protein database\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik. Bioinformatics, (2001) 17:282-283";
10 char cd_hit_ref2[] = "\"Tolerating some redundancy significantly speeds up clustering of large protein databases\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik. Bioinformatics, (2002) 18:77-82";
11 char cd_hit_ref3[] = "\"Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences\", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659";
12 char cd_hit_ref4[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artificial and natural duplicates in pyrosequencing reads of metagenomic data. BMC Bioinformatics (2010) 11:187";
9 char cd_hit_ref1[] = "\"CD-HIT: a fast program for clustering and comparing large sets of protein or nucleotide sequences\", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659";
10 char cd_hit_ref2[] = "\"CD-HIT: accelerated for clustering the next generation sequencing data\", Limin Fu, Beifang Niu, Zhengwei Zhu, Sitao Wu & Weizhong Li. Bioinformatics, (2012) 28:3150-3152";
11 char cd_hit_ref3[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artificial and natural duplicates in pyrosequencing reads of metagenomic data. BMC Bioinformatics (2010) 11:187";
1312 //
1413
1514 char contacts[] =
1918 " If you find cd-hit useful, please kindly cite:\n\n";
2019
2120 char txt_option_i[] = "\tinput filename in fasta format, required\n";
21 char txt_option_j[] =
22 "\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
23 \t -i R1.fq -j R2.fq -o output_R1 -op output_R2 or\n \
24 \t -i R1.fa -j R2.fa -o output_R1 -op output_R2 \n";
2225 char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required\n";
2326 char txt_option_i2[] = "\tinput filename for db2 in fasta format, required\n";
27 char txt_option_j2[] =
28 "\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
29 \t -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or\n \
30 \t -i db1-R1.fa -j db1-R2.fa -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 \n";
2431 char txt_option_o[] = "\toutput filename, required\n";
32 char txt_option_op[] = "\toutput filename for R2 reads if input are paired end (PE) files\n";
2533 char txt_option_c[] =
2634 "\tsequence identity threshold, default 0.9\n \
2735 \tthis is the default cd-hit's \"global sequence identity\" calculated as:\n \
8795 char txt_option_B[] =
8896 "\t1 or 0, default 0, by default, sequences are stored in RAM\n \
8997 \tif set to 1, sequence are stored on hard drive\n \
90 \tit is recommended to use -B 1 for huge databases\n";
98 \t!! No longer supported !!\n";
99 char txt_option_P[] =
100 "\tinput paired end (PE) reads, default 0, single file\n \
101 \tif set to 1, please use -i R1 -j R2 to input both PE files\n";
102 char txt_option_cx[] =
103 "\tlength to keep after trimming the tail of sequence, default 0, not trimming\n \
104 \tif set to 50, the program only uses the first 50 letters of input sequence\n";
105 char txt_option_cy[] =
106 "\tlength to keep after trimming the tail of R2 sequence, default 0, not trimming\n \
107 \tif set to 50, the program only uses the first 50 letters of input R2 sequence\n \
108 \te.g. -cx 100 -cy 80 for paired end reads\n";
109 char txt_option_ap[] =
110 "\talignment position constrains, default 0, no constrain\n \
111 \tif set to 1, the program will force sequences to align at beginings\n \
112 \twhen set to 1, the program only does +/+ alignment\n";
91113 char txt_option_uL[] =
92114 "\tmaximum unmatched percentage for the longer sequence, default 1.0\n \
93115 \tif set to 0.1, the unmatched region (excluding leading and tailing gaps)\n \
107129 \tif set to 0, only +/+ strand alignment\n";
108130 char txt_option_bak[] =
109131 "\twrite backup cluster file (1 or 0, default 0)\n";
132 char txt_option_sc[] =
133 "\tsort clusters by size (number of sequences), default 0, output clusters by decreasing length\n \
134 \tif set to 1, output clusters by decreasing size\n";
135 char txt_option_sf[] =
136 "\tsort fasta/fastq by cluster size (number of sequences), default 0, no sorting\n \
137 \tif set to 1, output sequences by decreasing cluster size\n";
110138
111139 char txt_option_mask[] = "\tmasking letters (e.g. -mask NX, to mask out both 'N' and 'X')\n";
112140 char txt_option_match[] = "\tmatching score, default 2 (1 for T-U and N-N)\n";
144172 cout << " -B" << txt_option_B;
145173 cout << " -p" << txt_option_p;
146174 cout << " -g" << txt_option_g;
175 cout << " -sc"<< txt_option_sc;
176 cout << " -sf"<< txt_option_sf;
147177 cout << " -bak" << txt_option_bak;
148178 cout << " -h\tprint this help\n\n";
149179 cout << contacts;
189219 cout << " Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
190220 cout << " If you find cd-hit useful, please kindly cite:\n\n";
191221 cout << " " << cd_hit_ref1 << "\n";
192 cout << " " << cd_hit_ref3 << "\n\n\n";
222 cout << " " << cd_hit_ref2 << "\n\n\n";
193223 exit(1);
194224 } // END print_usage_2d
195225
198228 cout << cd_hit_ver << "\n\n" ;
199229 cout << "Usage: "<< arg << " [Options] \n\nOptions\n\n";
200230 cout << " -i" << txt_option_i;
201 cout << " -o" << txt_option_o;
231 cout << " -j" << txt_option_j;
232 cout << " -o" << txt_option_o;
233 cout << " -op" << txt_option_op;
202234 cout << " -c" << txt_option_c;
203235 cout << " -G" << txt_option_G;
204236 cout << " -b" << txt_option_b;
218250 cout << " -uS" << txt_option_uS;
219251 cout << " -U" << txt_option_U;
220252 cout << " -B" << txt_option_B;
253 cout << " -P" << txt_option_P;
254 cout << " -cx"<< txt_option_cx;
255 cout << " -cy"<< txt_option_cy;
256 cout << " -ap"<< txt_option_ap;
221257 cout << " -p" << txt_option_p;
222258 cout << " -g" << txt_option_g;
223259 cout << " -r" << txt_option_r;
227263 cout << " -gap" << txt_option_gap;
228264 cout << " -gap-ext" << txt_option_gap_ext;
229265 cout << " -bak" << txt_option_bak;
266 cout << " -sc"<< txt_option_sc;
267 cout << " -sf"<< txt_option_sf;
230268 cout << " -h\tprint this help\n\n";
231269 cout << contacts;
232270 cout << " " << cd_hit_ref1 << "\n";
233 cout << " " << cd_hit_ref3 << "\n\n\n";
271 cout << " " << cd_hit_ref2 << "\n\n\n";
234272 exit(1);
235273 } // END print_usage_est
236274
240278 cout << "Usage: "<< arg << " [Options] \n\nOptions\n\n";
241279 cout << " -i" << txt_option_i_2d;
242280 cout << " -i2"<< txt_option_i2;
243 cout << " -o" << txt_option_o;
281 cout << " -j, -j2"<< txt_option_j2;
282 cout << " -o" << txt_option_o;
283 cout << " -op" << txt_option_op;
244284 cout << " -c" << txt_option_c;
245285 cout << " -G" << txt_option_G;
246286 cout << " -b" << txt_option_b;
262302 cout << " -uS" << txt_option_uS;
263303 cout << " -U" << txt_option_U;
264304 cout << " -B" << txt_option_B;
305 cout << " -P" << txt_option_P;
306 cout << " -cx"<< txt_option_cx;
307 cout << " -cy"<< txt_option_cy;
265308 cout << " -p" << txt_option_p;
266309 cout << " -g" << txt_option_g;
267310 cout << " -r" << txt_option_r;
274317 cout << " -h\tprint this help\n\n";
275318 cout << contacts;
276319 cout << " " << cd_hit_ref1 << "\n";
277 cout << " " << cd_hit_ref3 << "\n\n\n";
320 cout << " " << cd_hit_ref2 << "\n\n\n";
278321 exit(1);
279322 } // END print_usage_est_2d
280323
325368 cout << " Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
326369 cout << " If you find cd-hit useful, please kindly cite:\n\n";
327370 cout << " " << cd_hit_ref1 << "\n";
328 cout << " " << cd_hit_ref3 << "\n";
329 cout << " " << cd_hit_ref4 << "\n\n\n";
371 cout << " " << cd_hit_ref2 << "\n";
372 cout << " " << cd_hit_ref3 << "\n\n\n";
330373 exit(1);
331374 }
332375
#!/usr/bin/perl

use Storable;
use strict;
use warnings;

# Convert a cd-hit .clstr file into a Storable hash of clusters, keyed by the
# accession of each cluster's representative sequence.
#
# Usage: <this script> <clstr_file> <store_file>
#
# Every record (a cluster or a single sequence) is an array ref with 5 slots:
#   [0] accession
#   [1] sequence length
#   [2] 1 when slot [3] holds member records, 0 for a plain sequence
#   [3] array ref of member records (empty for a plain sequence)
#   [4] identity text taken from the .clstr line ("100%" for the
#       representative, "" for the cluster record itself)
#
# If <store_file> already exists it is loaded first, and every sequence of
# the new clustering that was a representative in the stored clustering gets
# the stored members attached below it. The file is then overwritten.
#
# NOTE(review): Storable files can run code while being thawed; <store_file>
# must come from a trusted source.

my $clstr_file = shift;
my $store_file = shift;

my %clstr         = ();    # representative accession => cluster record
my $rep_len       = 0;     # length of the current cluster's representative
my $rep_acc       = "";    # accession of the current cluster's representative
my @cur_sequences = ();    # sequence records of the cluster being read

# Store the cluster collected so far (if any) under its representative,
# with members ordered by decreasing sequence length.
sub _flush_cluster {
    return unless scalar(@cur_sequences);
    my @members = sort { $$b[1] <=> $$a[1] } @cur_sequences;
    $clstr{$rep_acc} = [ $rep_acc, $rep_len, 1, [@members], "" ];
    return;
}

open(my $clstr_fh, '<', $clstr_file)
    or die "cannot open cluster file '$clstr_file': $!\n";
while (my $ll = <$clstr_fh>) {    # read .clstr files
    if ($ll =~ /^>/) {            # the begin of a cluster
        _flush_cluster();         # collect the information of last clstr
        @cur_sequences = ();
    }
    else {                        # the sequence line
        # Strip the line ending only. The original chop() also ate the
        # trailing '*' or '%' when the last line had no newline.
        $ll =~ s/[\r\n]+\z//;
        if ($ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\./) {
            my @record = ($4, $2, 0, [], "");
            if ($ll =~ /\*$/) {    # representative sequence or not
                $rep_acc   = $record[0];
                $rep_len   = $record[1];
                $record[4] = "100%";
            }
            elsif ($ll =~ / at (.+\d.+)$/) {    # cd-hit-est adds strand info
                $record[4] = $1;
            }
            # Only push lines that parsed; pushing outside this branch
            # duplicated the previous record for blank/malformed lines.
            push(@cur_sequences, [@record]);
        }
    }
}
_flush_cluster();                 # the last cluster has no following '>' line
close($clstr_fh);

if (-e $store_file) {             # already have a cluster file
    my $old_ref = retrieve($store_file)
        or die "cannot read stored clusters from '$store_file': $!\n";
    my %old_clstr = %{$old_ref};
    foreach my $acc (keys %clstr) {
        # Each sequence of a top level cluster should be a representative
        # sequence of a lower level cluster in the stored file.
        my $seqs = $clstr{$acc}[3];
        for my $i (0 .. $#{$seqs}) {
            my $seq = $$seqs[$i];
            if ($old_clstr{ $$seq[0] }) {
                $clstr{$acc}[3][$i][3] = [ @{ $old_clstr{ $$seq[0] }[3] } ];
                $clstr{$acc}[3][$i][2] = 1;    # mark as expandable
            }
        }
    }
}

store(\%clstr, $store_file)
    or die "cannot write clusters to '$store_file': $!\n";

#~ my $size = scalar(keys %clstr);
#~ print "$size\n";

#~ my $acc = 'D8F4YGO02FSTQP|range|2:370|frame|2|len|123';

#~ my $temp = $clstr{$acc}[1];
#~ print "$temp\n";

#~ my $temp = scalar(@{$clstr{$acc}[3]});
#~ print "$temp\n";

#~ my $x;
#~ for $x (@{$clstr{$acc}[3]} ){
#~   my $tmp_1 = scalar(@{$x->[3]});
#~   print "$x->[2], $x->[4], $x->[0], $x->[1], $tmp_1\n";
#~ }
88
#!/usr/bin/perl

use Storable;
use strict;

# Load the stored cluster hash from <input_file>, order its cluster records,
# and store the resulting array in <output_file>.
#
# Usage: <this script> <input_file> <output_file> [no|len|des]
#   no   (default) decreasing number of sequences in the cluster
#   len  decreasing value of record slot [1]
#   des  increasing string order of record slot [0]
# Any other value leaves the records in the order values() returned them.

my ($input_file, $output_file, $sort_by_what) = @ARGV;
$sort_by_what ||= "no";

my @clstr = values %{retrieve($input_file)};

# Count sequences (not nodes) below one cluster record. Members flagged in
# slot [2] are expanded one level; flagged members of those contribute the
# size of their own member list.
my $count_sequences = sub {
    my ($cluster) = @_;
    my $total = 0;
    foreach my $member (@{ $cluster->[3] }) {
        unless ($member->[2]) {    # leaf sequence
            $total++;
            next;
        }
        foreach my $sub_member (@{ $member->[3] }) {
            $total += $sub_member->[2] ? scalar(@{ $sub_member->[3] }) : 1;
        }
    }
    return $total;
};

my %sorter_for = (
    'no' => sub {
        my %rep2size = ();
        foreach my $cluster (@_) {
            $rep2size{ $cluster->[0] } = $count_sequences->($cluster);
        }
        return sort { $rep2size{ $b->[0] } <=> $rep2size{ $a->[0] } } @_;
    },
    'len' => sub { return sort { $b->[1] <=> $a->[1] } @_; },
    'des' => sub { return sort { $a->[0] cmp $b->[0] } @_; },
);

if (my $sorter = $sorter_for{$sort_by_what}) {
    @clstr = $sorter->(@clstr);
}

store \@clstr, $output_file;
49
50
Binary diff not shown
33
44 [[http://cd-hit.org]]
55
6 Program developed by Weizhong Li's lab at UCSD [[http://weizhong-lab.ucsd.edu]] and JCVI [[http://jcvi.org]] [[liwz@sdsc.edu]]
6 Program developed by Weizhong Li's lab at UCSD [[http://weizhongli-lab.org]] and JCVI [[http://jcvi.org]] [[liwz@sdsc.edu]]
77
88 ===== Introduction =====
99
6161
6262 **Reduced alphabet (to be implemented)**: This is for protein clustering. In reduced alphabet, a group of exchangeable residues are reduced to a single residue (I/V/L==>I, S/T==>S, D/E==>D, K/R==>K, F/Y==>F), and then conservative mutations would appear as identities in sequence alignments. It improves the short word filter for clustering at low sequence identity below 50%.
6363
64 **Gapped word (to be implemented)**: Short word filter using gapped word allows mismatch within a word such as “ACE” vs “AME”, “ACFE” vs “AMYE”, and “AACTT” vs “AAGTT”, which can be written as “101”, “1001” and “11011”. At low identity cutoff, a gapped word is more efficient than an ungapped word for filtering.
64 **Gapped word (to be implemented)**: Short word filter using gapped word allows mismatch within a word such as “ACE” vs “AME”, “ACFE” vs “AMYE”, and “AACTT” vs “AAGTT”, which can be written as “101”, “1001” and “11011”. At low identity cutoff, a gapped word is more efficient than an ungapped word for filtering.
6565
6666
6767
9494 It can be copied under the GNU General Public License version 2 (GPLv2).
9595
9696 Most CD-HIT programs were written in C++. Installing the CD-HIT package is very simple:
97 * download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.2-2015-0511.tar.gz
98 * unpack the file with " tar xvf cd-hit-v4.6.2-2015-0511.tar.gz --gunzip"
99 * change dir by "cd cd-hit-v4.6.2-2015-0511"
97 * download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.6-2016-0711.tar.gz
98 * unpack the file with " tar xvf cd-hit-v4.6.6-2016-0711.tar.gz --gunzip"
99 * change dir by "cd cd-hit-v4.6.6-2016-0711"
100100 * compile the programs by "make" with multi-threading (default), or by "make openmp=no" without multi-threading (on old systems without OpenMP)
101101 * cd cd-hit-auxtools
102102 * compile cd-hit-auxtools by "make"
106106 CD-HIT clusters proteins into clusters that meet a user-defined similarity threshold, usually a sequence identity. Each cluster has one representative sequence. The input is a protein dataset in fasta format and the output is two files: a fasta file of representative sequences and a text file listing the clusters.
107107
108108 Basic command:
109 cd-hit -i nr -o nr100 -c 1.00 -n 5 -M 16000 –d 0 -T 8
110 cd-hit -i db -o db90 -c 0.9 -n 5 -M 16000 –d 0 -T 8,
109 cd-hit -i nr -o nr100 -c 1.00 -n 5 -M 16000 -d 0 -T 8
110 cd-hit -i db -o db90 -c 0.9 -n 5 -M 16000 -d 0 -T 8,
111111
112112 where\\
113113 ''db'' is the filename of input, \\
181181 must not be more than 10 bases
182182 -B 1 or 0, default 0, by default, sequences are stored in RAM
183183 if set to 1, sequence are stored on hard drive
184 it is recommended to use -B 1 for huge databases
184 !! No longer supported !!
185185 -p 1 or 0, default 0
186186 if set to 1, print alignment overlap in .clstr file
187187 -g 1 or 0, default 0
190190 will cluster it into the most similar cluster that meet the threshold
191191 (accurate but slow mode)
192192 but either 1 or 0 won't change the representatives of final clusters
193 -sc sort clusters by size (number of sequences), default 0, output clusters by decreasing length
194 if set to 1, output clusters by decreasing size
195 -sf sort fasta/fastq by cluster size (number of sequences), default 0, no sorting
196 if set to 1, output sequences by decreasing cluster size
193197 -bak write backup cluster file (1 or 0, default 0)
194198 -h print this help
195199
199203
200204 See the figure below, the -aL, -AL, -aS and -AS options can be used to specify the alignment coverage on both the representative sequence and other sequences. -s and -S can control the length difference between the representative sequence and other sequences.
201205
202 {{ :Figure2.png }}
206 {{ :cd-hit-figure2.png }}
207
203208
204209 ''
205210 aL = R<sub>a</sub> / R\\
263268 -n 2 for thresholds 0.4 ~ 0.5
264269 </code>
265270
266 More options:
267
268 Options, -b, -M, -l, -d, -t, -s, -S, -B, -p, -aL, -AL, -aS, -AS, -g, -G, -T
269 are same to CD-HIT, here are few more cd-hit-2d specific options:
270 <code>
271 -i2 input filename for db2 in fasta format, required
272 -s2 length difference cutoff for db1, default 1.0
273 by default, seqs in db1 >= seqs in db2 in a same cluster
274 if set to 0.9, seqs in db1 may just >= 90% seqs in db2
275 -S2 length difference cutoff, default 0
276 by default, seqs in db1 >= seqs in db2 in a same cluster
277 if set to 60, seqs in db2 may 60aa longer than seqs in db1
271 Options:
272 <code>
273 -i input filename for db1 in fasta format, required
274 -i2 input filename for db2 in fasta format, required
275 -o output filename, required
276 -c sequence identity threshold, default 0.9
277 this is the default cd-hit's "global sequence identity" calculated as:
278 number of identical amino acids in alignment
279 divided by the full length of the shorter sequence
280 -G use global sequence identity, default 1
281 if set to 0, then use local sequence identity, calculated as :
282 number of identical amino acids in alignment
283 divided by the length of the alignment
284 NOTE!!! don't use -G 0 unless you use alignment coverage controls
285 see options -aL, -AL, -aS, -AS
286 -b band_width of alignment, default 20
287 -M memory limit (in MB) for the program, default 800; 0 for unlimited;
288 -T number of threads, default 1; with 0, all CPUs will be used
289 -n word_length, default 5, see user's guide for choosing it
290 -l length of throw_away_sequences, default 10
291 -t tolerance for redundance, default 2
292 -d length of description in .clstr file, default 20
293 if set to 0, it takes the fasta defline and stops at first space
294 -s length difference cutoff, default 0.0
295 if set to 0.9, the shorter sequences need to be
296 at least 90% length of the representative of the cluster
297 -S length difference cutoff in amino acid, default 999999
298 if set to 60, the length difference between the shorter sequences
299 and the representative of the cluster can not be bigger than 60
300 -s2 length difference cutoff for db1, default 1.0
301 by default, seqs in db1 >= seqs in db2 in a same cluster
302 if set to 0.9, seqs in db1 may just >= 90% seqs in db2
303 -S2 length difference cutoff, default 0
304 by default, seqs in db1 >= seqs in db2 in a same cluster
305 if set to 60, seqs in db2 may 60aa longer than seqs in db1
306 -aL alignment coverage for the longer sequence, default 0.0
307 if set to 0.9, the alignment must covers 90% of the sequence
308 -AL alignment coverage control for the longer sequence, default 99999999
309 if set to 60, and the length of the sequence is 400,
310 then the alignment must be >= 340 (400-60) residues
311 -aS alignment coverage for the shorter sequence, default 0.0
312 if set to 0.9, the alignment must covers 90% of the sequence
313 -AS alignment coverage control for the shorter sequence, default 99999999
314 if set to 60, and the length of the sequence is 400,
315 then the alignment must be >= 340 (400-60) residues
316 -A minimal alignment coverage control for the both sequences, default 0
317 alignment must cover >= this value for both sequences
318 -uL maximum unmatched percentage for the longer sequence, default 1.0
319 if set to 0.1, the unmatched region (excluding leading and tailing gaps)
320 must not be more than 10% of the sequence
321 -uS maximum unmatched percentage for the shorter sequence, default 1.0
322 if set to 0.1, the unmatched region (excluding leading and tailing gaps)
323 must not be more than 10% of the sequence
324 -U maximum unmatched length, default 99999999
325 if set to 10, the unmatched region (excluding leading and tailing gaps)
326 must not be more than 10 bases
327 -B 1 or 0, default 0, by default, sequences are stored in RAM
328 if set to 1, sequence are stored on hard drive
329 !! No longer supported !!
330 -p 1 or 0, default 0
331 if set to 1, print alignment overlap in .clstr file
332 -g 1 or 0, default 0
333 by cd-hit's default algorithm, a sequence is clustered to the first
334 cluster that meet the threshold (fast cluster). If set to 1, the program
335 will cluster it into the most similar cluster that meet the threshold
336 (accurate but slow mode)
337 but either 1 or 0 won't change the representatives of final clusters
338 -bak write backup cluster file (1 or 0, default 0)
339 -h print this help
340
278341 </code>
279342
280343 ==== CD-HIT-EST ====
288351 good for non-intron containing sequences like EST.
289352
290353 Basic command:
291 cd-hit-est -i est_human -o est_human95 -c 0.95 -n 10 -d 0 -M 16000 - T 8
354 cd-hit-est -i est_human -o est_human95 -c 0.95 -n 10 -d 0 -M 16000 -T 8
355 cd-hit-est -i R1.fa -j R2.fa -o R1.95.fa -op R2.95.fa -P 1 -c 0.95 -n 10 -d 0 -M 16000 -T 8
292356
293357 Choice of word size:
294358 <code>
300364 -n 4 for thresholds 0.75 ~ 0.8
301365 </code>
302366
303 More options:
304
305 Options, -b, -M, -l, -d, -t, -s, -S, -B, -p, -aL, -AL, -aS, -AS, -g, -G, -T
306 are same to CD-HIT, here are few more cd-hit-est specific options:
307 <code>
367 Options:
368 <code>
369 -i input filename in fasta format, required
370 -j input filename in fasta/fastq format for R2 reads if input are paired end (PE) files
371 -i R1.fq -j R2.fq -o output_R1 -op output_R2 or
372 -i R1.fa -j R2.fa -o output_R1 -op output_R2
373 -o output filename, required
374 -op output filename for R2 reads if input are paired end (PE) files
375 -c sequence identity threshold, default 0.9
376 this is the default cd-hit's "global sequence identity" calculated as:
377 number of identical amino acids in alignment
378 divided by the full length of the shorter sequence
379 -G use global sequence identity, default 1
380 if set to 0, then use local sequence identity, calculated as :
381 number of identical amino acids in alignment
382 divided by the length of the alignment
383 NOTE!!! don't use -G 0 unless you use alignment coverage controls
384 see options -aL, -AL, -aS, -AS
385 -b band_width of alignment, default 20
386 -M memory limit (in MB) for the program, default 800; 0 for unlimited;
387 -T number of threads, default 1; with 0, all CPUs will be used
388 -n word_length, default 10, see user's guide for choosing it
389 -l length of throw_away_sequences, default 10
390 -d length of description in .clstr file, default 20
391 if set to 0, it takes the fasta defline and stops at first space
392 -s length difference cutoff, default 0.0
393 if set to 0.9, the shorter sequences need to be
394 at least 90% length of the representative of the cluster
395 -S length difference cutoff in amino acid, default 999999
396 if set to 60, the length difference between the shorter sequences
397 and the representative of the cluster can not be bigger than 60
398 -aL alignment coverage for the longer sequence, default 0.0
399 if set to 0.9, the alignment must covers 90% of the sequence
400 -AL alignment coverage control for the longer sequence, default 99999999
401 if set to 60, and the length of the sequence is 400,
402 then the alignment must be >= 340 (400-60) residues
403 -aS alignment coverage for the shorter sequence, default 0.0
404 if set to 0.9, the alignment must covers 90% of the sequence
405 -AS alignment coverage control for the shorter sequence, default 99999999
406 if set to 60, and the length of the sequence is 400,
407 then the alignment must be >= 340 (400-60) residues
408 -A minimal alignment coverage control for the both sequences, default 0
409 alignment must cover >= this value for both sequences
410 -uL maximum unmatched percentage for the longer sequence, default 1.0
411 if set to 0.1, the unmatched region (excluding leading and tailing gaps)
412 must not be more than 10% of the sequence
413 -uS maximum unmatched percentage for the shorter sequence, default 1.0
414 if set to 0.1, the unmatched region (excluding leading and tailing gaps)
415 must not be more than 10% of the sequence
416 -U maximum unmatched length, default 99999999
417 if set to 10, the unmatched region (excluding leading and tailing gaps)
418 must not be more than 10 bases
419 -B 1 or 0, default 0, by default, sequences are stored in RAM
420 if set to 1, sequence are stored on hard drive
421 !! No longer supported !!
422 -P input paired end (PE) reads, default 0, single file
423 if set to 1, please use -i R1 -j R2 to input both PE files
424 -cx length to keep after trimming the tail of sequence, default 0, not trimming
425 if set to 50, the program only uses the first 50 letters of input sequence
426 -cy length to keep after trimming the tail of R2 sequence, default 0, not trimming
427 if set to 50, the program only uses the first 50 letters of input R2 sequence
428 e.g. -cx 100 -cy 80 for paired end reads
429 -ap alignment position constraint, default 0, no constraint
430 if set to 1, the program will force sequences to align at their beginnings
431 when set to 1, the program only does +/+ alignment
432 -p 1 or 0, default 0
433 if set to 1, print alignment overlap in .clstr file
434 -g 1 or 0, default 0
435 by cd-hit's default algorithm, a sequence is clustered to the first
436 cluster that meet the threshold (fast cluster). If set to 1, the program
437 will cluster it into the most similar cluster that meet the threshold
438 (accurate but slow mode)
439 but either 1 or 0 won't change the representatives of final clusters
308440 -r 1 or 0, default 1, by default do both +/+ & +/- alignments
309441 if set to 0, only +/+ strand alignment
310442 -mask masking letters (e.g. -mask NX, to mask out both 'N' and 'X')
312444 -mismatch mismatching score, default -2
313445 -gap gap opening score, default -6
314446 -gap-ext gap extension score, default -1
447 -bak write backup cluster file (1 or 0, default 0)
448 -sc sort clusters by size (number of sequences), default 0, output clusters by decreasing length
449 if set to 1, output clusters by decreasing size
450 -sf sort fasta/fastq by cluster size (number of sequences), default 0, no sorting
451 if set to 1, output sequences by decreasing cluster size
452 -h print this help
453
454
315455 </code>
316456
317457 ==== CD-HIT-EST-2D ====
325465 sequences like EST.
326466
327467 Basic command:
328 cd-hit-est-2d -i mrna_human -i2 est_human -o est_human_novel -c 0.95 -n 10 -d 0 -M 16000 - T 8
329
468 cd-hit-est-2d -i mrna_human -i2 est_human -o est_human_novel -c 0.95 -n 10 -d 0 -M 16000 -T 8
469 cd-hit-est-2d -i db1.R1.fa -j db1.R2.fa -i2 db2.R1.fa -j2 db2.R2.fa -o db2_novel.R1.fa -op db2_novel.R2.fa -P 1 -c 0.95 -n 10 -d 0 -M 16000 -T 8
470
330471 The choice of word size and the options are the same as for CD-HIT-EST:
331472
332 cd-hit-est-2d specificnoptions:
333 <code>
473 Options:
474 <code>
475 -i input filename for db1 in fasta format, required
476 -i2 input filename for db2 in fasta format, required
477 -j, -j2 input filename in fasta/fastq format for R2 reads if input are paired end (PE) files
478 -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or
479 -i db1-R1.fa -j db1-R2.fa -i2 db2-R1.fa -j2 db2-R2.fa -o output_R1 -op output_R2
480 -o output filename, required
481 -op output filename for R2 reads if input are paired end (PE) files
482 -c sequence identity threshold, default 0.9
483 this is the default cd-hit's "global sequence identity" calculated as:
484 number of identical amino acids in alignment
485 divided by the full length of the shorter sequence
486 -G use global sequence identity, default 1
487 if set to 0, then use local sequence identity, calculated as :
488 number of identical amino acids in alignment
489 divided by the length of the alignment
490 NOTE!!! don't use -G 0 unless you use alignment coverage controls
491 see options -aL, -AL, -aS, -AS
492 -b band_width of alignment, default 20
493 -M memory limit (in MB) for the program, default 800; 0 for unlimited;
494 -T number of threads, default 1; with 0, all CPUs will be used
495 -n word_length, default 10, see user's guide for choosing it
496 -l length of throw_away_sequences, default 10
497 -d length of description in .clstr file, default 20
498 if set to 0, it takes the fasta defline and stops at first space
499 -s length difference cutoff, default 0.0
500 if set to 0.9, the shorter sequences need to be
501 at least 90% length of the representative of the cluster
502 -S length difference cutoff in amino acid, default 999999
503 if set to 60, the length difference between the shorter sequences
504 and the representative of the cluster can not be bigger than 60
334505 -s2 length difference cutoff for db1, default 1.0
335506 by default, seqs in db1 >= seqs in db2 in a same cluster
336507 if set to 0.9, seqs in db1 may just >= 90% seqs in db2
337508 -S2 length difference cutoff, default 0
338509 by default, seqs in db1 >= seqs in db2 in a same cluster
339510 if set to 60, seqs in db2 may 60aa longer than seqs in db1
511 -aL alignment coverage for the longer sequence, default 0.0
512 if set to 0.9, the alignment must cover 90% of the sequence
513 -AL alignment coverage control for the longer sequence, default 99999999
514 if set to 60, and the length of the sequence is 400,
515 then the alignment must be >= 340 (400-60) residues
516 -aS alignment coverage for the shorter sequence, default 0.0
517 if set to 0.9, the alignment must cover 90% of the sequence
518 -AS alignment coverage control for the shorter sequence, default 99999999
519 if set to 60, and the length of the sequence is 400,
520 then the alignment must be >= 340 (400-60) residues
521 -A minimal alignment coverage control for both sequences, default 0
522 alignment must cover >= this value for both sequences
523 -uL maximum unmatched percentage for the longer sequence, default 1.0
524 if set to 0.1, the unmatched region (excluding leading and trailing gaps)
525 must not be more than 10% of the sequence
526 -uS maximum unmatched percentage for the shorter sequence, default 1.0
527 if set to 0.1, the unmatched region (excluding leading and trailing gaps)
528 must not be more than 10% of the sequence
529 -U maximum unmatched length, default 99999999
530 if set to 10, the unmatched region (excluding leading and trailing gaps)
531 must not be more than 10 bases
532 -B 1 or 0, default 0, by default, sequences are stored in RAM
533 if set to 1, sequences are stored on hard drive
534 !! No longer supported !!
535 -P input paired end (PE) reads, default 0, single file
536 if set to 1, please use -i R1 -j R2 to input both PE files
537 -cx length to keep after trimming the tail of sequence, default 0, not trimming
538 if set to 50, the program only uses the first 50 letters of input sequence
539 -cy length to keep after trimming the tail of R2 sequence, default 0, not trimming
540 if set to 50, the program only uses the first 50 letters of input R2 sequence
541 e.g. -cx 100 -cy 80 for paired end reads
542 -p 1 or 0, default 0
543 if set to 1, print alignment overlap in .clstr file
544 -g 1 or 0, default 0
545 by cd-hit's default algorithm, a sequence is clustered to the first
546 cluster that meets the threshold (fast cluster). If set to 1, the program
547 will cluster it into the most similar cluster that meets the threshold
548 (accurate but slow mode)
549 but either 1 or 0 won't change the representatives of final clusters
550 -r 1 or 0, default 1, by default do both +/+ & +/- alignments
551 if set to 0, only +/+ strand alignment
552 -mask masking letters (e.g. -mask NX, to mask out both 'N' and 'X')
553 -match matching score, default 2 (1 for T-U and N-N)
554 -mismatch mismatching score, default -2
555 -gap gap opening score, default -6
556 -gap-ext gap extension score, default -1
557 -bak write backup cluster file (1 or 0, default 0)
558 -h print this help
559
340560 </code>
341561
342562
347567 Basic command:
348568 cd-hit-454 -i 454_reads -o 454_reads_95 -c 0.95 -n 10 -d 0 -M 16000 -T 8
349569
350 Full list of options:
570 Options:
351571 <code>
352572 -i input filename in fasta format, required
353573 -o output filename, required
422642 - repeat cd-hit and cd-hit-2d runs till done
423643 - Combine the results
424644
425 {{ :Figure3.png }}
645 {{ :cd-hit-figure3.png }}
426646
427647 Basic command:
428648 cd-hit-para.pl -i nr90 -o nr60 -c 0.6 -n 4 --B hosts --S 64
505725
506726 With multiple-step, iterated runs of CD-HIT, you perform a clustering in a
507727 neighbor-joining method, which generates a hierarchical structure. The third step uses psi-cd-hit, please see psi-cd-hit section for details.
728
729 This way is faster than one-step clustering. It can also be more accurate.
730
731 There is a problem with one-step clustering: two very similar sequences A and B may end up in different clusters. For example, let the clustering threshold be 60%, IAB (identity of A and B) = 95%, IAC ≥ 60%, but IBC < 60%. If C is selected first as a cluster representative, then A will be in cluster “C” but B will not, so the nearly identical A and B end up in different clusters. Hierarchical clustering reduces this problem.
508732
509 {{ :Figure4.png }}
733 {{ :cd-hit-figure4.png }}
510734
511735 Commands:
512736 cd-hit -i nr -o nr80 -c 0.8 -n 5 -d 0 -M 16000 -T 16
524748 clstr_rev.pl nr80-60.clstr nr30.clstr > nr80-60-30.clstr
525749 nr30.clstr only lists sequences from nr60, script clstr_rev.pl adds the original sequences into file nr80-60-30.clstr
526750
527 This way is faster than one-step run from nr directly to nr30. It can also
528 more accurate.
529
530
531751
532752
533753 ===== CD-HIT AuxTools =====
540760
541761
542762 cd-hit-dup is a simple tool for removing duplicates from sequencing reads,
543 with optional step to detect and remove chimeric reads.
763 with optional step to detect and remove chimeric reads. When two files of paired end reads are used as inputs, each pair of reads will be concatenated into a single one.
544764 A number of options are provided to tune how the duplicates are removed.
545765 Running the program without arguments should print out the list of available options,
546766 as the following:
570790 </code>
571791
572792 === Option details ===
573
574 == Common options ==
575 Here are the more detailed description of the options.
576 <code>
577 -i Input file;
578 </code>
579 Input file that must be in fasta or fastq format.
580
581 <code>
582 -i2 Second input file;
583 </code>
584 cd-hit-dup can take 2 files of paired end reads.
585 "-i" can be used to specify the file for the R1;
586 and "-i2" can be used to specify the file for R2.
587
588 When two files of paired end reads are used as inputs, each pair of reads will
589 be concatenated into a single one. And the following steps of duplicate and chimeric
590 detection and removing.
591
592 <code>
593 -o Output file;
594 </code>
595 Output file which contains a list of reads without duplicates.
596
597 <code>
598 -o2 Output file for R2, with paired end reads;
599 </code>
600
601 <code>
602 -d Description length (default 0, truncate at the first whitespace character)
603 </code>
604 The length of description line that should be written to the output.
605
606793 <code>
607794 -u Length of prefix to be used in the analysis (default 0, for full/maximum length);
608795 </code>
796
609797 For pair-end inputs, the program will take part (whole or prefix) of the first end
610798 and part (whole or prefix) of the second read,
611799 and join them together to form a single read to do the analysis.
620808 to do the analysis. In case that a read is shorter than this length, no 'N' is appended to
621809 the read since it is not necessary.
622810
623
624 == Options for duplicate detection ==
625811 <code>
626812 -m Match length (true/false, default true);
627813 </code>
636822 no greater than the specified value are considered to be duplicates. For chimeric detection,
637823 this option control how similar a read should be to either of its parents.
638824
639
640 == Options for chimeric filtering ==
641825 <code>
642826 -f Filter out chimeric clusters (true/false, default false);
643827 </code>
8821066 - Repeat until done
8831067
8841068 ==== Installation ====
885 please download legacy BLAST (not BLAST+) and install the executables in your $PATH. The programs
886 required by psi-cd-hit.pl are blastall, megablast, blastpgp and formatdb.
1069 please download either legacy BLAST or BLAST+ and install the executables in your $PATH. The programs
1070 required by psi-cd-hit.pl are blastall, megablast, blastpgp and formatdb for legacy blast, and blastp, blastn, psiblast and makeblastdb for blast+.
8871071
8881072 ==== Usage ====
8891073
9401124 -------------circle-----------
9411125 | |
9421126 seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1
943 \\\\ /////////////
944 \\\\ /////////////
1127 \\\\\\\\ /////////////
1128 \\\\\\\\ /////////////
9451129 HSP 2 -> ////HSP 1 /// <-HSP 2
946 ///////////// \\\\
947 ///////////// \\\\
1130 ///////////// \\\\\\\\
1131 ///////////// \\\\\\\\
9481132 seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
9491133 | |
9501134 -----------circle--------------
11631347
11641348 The CD-HIT-454 web server is also available from [[http://cd-hit.org]].
11651349
1166 
1350 ===== Use cases =====
1351 Here, a use case is defined as a sequence clustering related problem or application that cannot be easily solved with existing clustering approaches, such as CD-HIT. However, it is feasible to solve such a use case by customizing current clustering algorithms or utilizing current approach in a very intelligent way or non-standard manner. In the last years, we have developed many use cases in addressing various problems. We will release these use cases after additional testing. These use cases will be described in the following chapters.
1352
1353 ===== CD-HIT-OTU-MiSeq =====
1354 This use case is developed for clustering 16S rRNA genes into OTUs for microbiome studies. In recent years, Illumina MiSeq sequencers became dominant in 16S rRNA sequencing. The Paired End (PE) reads need to be assembled first. However, many reads cannot be accurately assembled because of the poor quality at the 3’ ends of both PE reads in the overlapping region. This causes many sequences to be discarded in the analysis. CD-HIT-OTU-MiSeq has unique features to cluster MiSeq 16S sequences.
1355 - The package can cluster PE reads without joining them into contigs.
1356 - Users can choose a high quality portion of the PE reads for analysis (e.g. first 200 / 150 bases from forward / reverse reads), according to base quality profile.
1357 - We implemented a tool that can splice out the target region (e.g. V3-V4) from a full-length 16S reference database into the PE sequences. CD-HIT-OTU-MiSeq can cluster the spliced PE reference database together with samples, so we can derive Operational Taxonomic Units (OTUs) and annotate these OTUs concurrently.
1358 - Chimeric sequences are effectively identified through both de novo and reference-based approaches.
1359
1360 The most important unique feature of CD-HIT-OTU-MiSeq is to only use high quality region at the 5’ ends of R1 and R2 reads. For example, the effective read length can be 200 bases for R1 and 150 bases for R2. The effective portions of PE reads are clustered together with spliced PE sequences from the reference database to derive OTUs (Figure).
1361
1362 {{:cd-hit-otu-miseq-figure-1.png|}}
1363
1364 ==== Installation ====
1365 First download and install full cd-hit package
1366 * download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.2-2015-0511.tar.gz
1367 * unpack the file with " tar xvf cd-hit-v4.6.2-2015-0511.tar.gz --gunzip"
1368 * change dir by "cd cd-hit-v4.6.2-2015-0511"
1369 * compile the programs by "make" with multi-threading (default), or by "make openmp=no" without multi-threading (on old systems without OpenMP)
1370 * cd cd-hit-auxtools
1371 * compile cd-hit-auxtools by "make"
1372 * CD-HIT-OTU-MiSeq scripts are inside a folder like cd-hit-v4.6.2-2015-0511/usecases/Miseq-16S
1373
1374 CD-HIT-OTU-MiSeq uses Trimmomatic for sequence quality control. It can be downloaded from [[http://www.usadellab.org/cms/?page=trimmomatic]] or [[https://github.com/timflutre/trimmomatic]]. We also have a copy at [[http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/]].
1375
1376 * modify NG-Omics-Miseq-16S.pl
1377 Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
1378 $CD_HIT_dir = "PATH_to_cd-hit";
1379 $NGS_prog_trimmomatic = "PATH/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
1380
1381 ==== Download reference and sample datasets ====
1382 Reference database and sample datasets can be downloaded from [[http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/]].
1383
1384 The reference database Greengene-13-5-99.fasta.gz was processed from original Greengene database, so that sequences with more specific annotations are at the beginning of the file. You need to download and gunzip it.
1385
1386 You can also download Greengene and generate it. You should download Greengene from [[http://greengenes.secondgenome.com/downloads]], or [[ftp://greengenes.microbio.me/]]. Please download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file. You may find gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta. There is a script: usecases/Miseq-16S/greengene-ann1.pl.
1387
1388 Commands:
1389 /greengene-ann1.pl -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o Greengene-13-5-99.fasta
1390
1391 The Miseq-otu-example.tar.gz contains two Miseq 16S samples. You can download and unpack to test.
1392
1393 ==== Usage ====
1394
1395 **Step 1. prepare fastq files and sample file:** Most projects have multiple samples sequenced at the same region. You should already have paired ended fastq files for these samples, put them in a working directory in similar way as the testing datasets, where the R1.fq and R2.fq are placed in separate folder for each sample. So in the working directory, you should have files:
1396 sample_name_1/R1.fq
1397 sample_name_1/R2.fq
1398 sample_name_2/R1.fq
1399 sample_name_2/R2.fq
1400 ...
1401 sample_name_N/R1.fq
1402 sample_name_N/R2.fq
1403
1404 Then, please prepare a sample file in the working directory. The file should look like:
1405 sample_name_1 R1.fq R2.fq
1406 sample_name_2 R1.fq R2.fq
1407 sample_name_N R1.fq R2.fq
1408
1409 **Step 2. Reference database preparation:** We implemented a tool that can splice out the target amplicon region (e.g. V3-V4) from a full-length 16S rRNA reference sequence database, such as Greengene, RDP and Silva, into PE sequences. If there are multiple samples in a project sequenced with the same amplicon of same variable region, only one spliced reference database is needed. To run:
1410
1411 path_to_cd-hit_dir/usecases/Miseq-16S/16S-ref-db-PE-splice.pl -i sample_name_1/R1.fq -j sample_name_2/R2.fq -d Greengene-13-5-99.fasta -o gg_13_5-PE99.150-100 -p 150 -q 100 -c 0.99
1412 Where Greengene-13-5-99.fasta is our re-formatted Greengene sequence file. This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE99.150-100-R2.
1413
1414 **Step 3. Run sequence QC and OTU clustering for each sample:** In the working directory, run
1415 PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s sample_file -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
1416 where: 150 and 100 are the effective lengths, 0.97 is the OTU clustering cutoff, 0.0001 is the abundance cutoff, 75 is the length for chimeric checking at each R1 and R2 read
1417
1418 This command will generate shell scripts for QC and for OTU for each sample. The scripts will be in WF-sh folder. You can first run the qc.sample_name.sh and then run otu.sample_name.sh
1419
1420 NG-Omics-WF.pl [[https://github.com/weizhongli/ngomicswf]] is a very powerful workflow and pipeline tool developed in our group. It is not fully released yet, since we need more time to document this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples. First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36, then
1421 nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s sample_file -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
1422
1423 After the job finished, the OTU results will be in sample_name/otu folder, important files include
1424 * OTU.clstr: file lists all clusters and sequences
1425 * removed_chimeric*: chimeric sequences removed
1426 * small_clusters.list: low abundance small clusters removed
1427
1428 **Step 4. pool all the samples together:** Please run
1429 PATH_to_cd-hit-dir/usecases/pool_samples.pl -s sample_file -o pooled_sample.
1430 This will pool sequences from all samples and re-run OTU clustering. We can pool hundreds of samples without problem. After the job has finished, additional files will be available in the pooled_sample directory
1431 * OTU.clstr: file list all clusters and sequences from all samples
1432 * removed_chimeric*: chimeric sequences removed
1433 * small_clusters.list: low abundance small clusters removed
1434 * OTU.txt: spreadsheet listing the number of sequences in each OTU for each sample; it also shows the annotation for each OTU.
1435 * OTU.biome: OTU.txt in biome format
1436
1437
1438
11671439 ===== References =====
11681440
11691441 If you find cd-hit helpful to your research and study, please kindly cite the
0 #!/usr/bin/perl -w
1 ################################################################################
2 ######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
3 ################################################################################
################################################################################
# Global settings shared by the whole script. The values assigned here are the
# defaults; parse_para_etc() replaces them from the command line. The option
# that sets each variable is noted next to it.
################################################################################
our $pid       = $$;        # process id of this run, used in temporary file names

#### input / output
our $db_in     = "";        # -i   input sequence file
our $db_out    = "";        # -o   output name (prefix of .clstr/.log/.out/.restart)
our $len_t     = 10;        # -l   read_db keeps only sequences longer than this

#### thresholds
our $NR_clstr  = 0.3;       # -c
our $NR_clstre = -1;        # -ce
our $g_iden    = 1;         # -G
our $opt_aS    = 0.0;       # -aS
our $opt_aL    = 0.0;       # -aL
our $circle    = 0;         # -circle
our $opt_g     = 1;         # -g

#### program (legacy BLAST defaults; parse_para_etc swaps in BLAST+ commands)
our $blast_exe = "blastall -p blastp -m 8";
our $prof_exe  = "blastpgp -m 8";
our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500";        # -p
our $prof_db   = "";                                        # -dprof (rejected by the parser)
our $bl_para   = "-F T -e 0.000001 -b 100000 -v 100000";    # -s
our $bl_STDIN  = 1;         # -bs
our $keep_bl   = 0;         # -k   when true, raw blast output files are not removed
our $blast_prog= "blastp";  # -prog
our $formatdb  = "formatdb";

#### compute
our $exec_mode = "local";   # -exec
our $num_qsub  = 1;         # -host
our $para_no   = 1;         # -para
our $sh_file   = "";        # -shf
our $batch_no_per_node = 50;# -bat

#### job control
our $reformat_seg = 50000;  # -rf
our $restart_seg  = 20000;  # -rs
our $job       = "";        # -J   (first value)
our $job_file  = "";        # -J   (second value)
our $date      = `date`;    # keeps its trailing newline; open_LOG prints it as-is
our $restart_in = "";       # -restart

# chomp (not chop) so only a trailing newline is ever removed from the path
our $pwd       = `pwd`; chomp($pwd);

#### names derived from $db_in / $db_out / $pid in parse_para_etc
our $db_clstr;
our $db_log;
our $db_out1;
our $seq_dir;
our $bl_dir;
our $restart_file;
our $tmp_db;
our $remote_perl_script;
our $remote_sh_script;

our $bl_path;               # -P   directory holding the blast executables
our $bl_plus = 1;           #### use blast+
our $bl_threads = 1;        # -blp
our $skip_long = 0;         # -sl
our %qsub_ids = ();         #### a list of qsub ids
our %qstat_xml_data = ();
51
52
# Parse the option list passed in @_, derive the BLAST / database-format
# command lines and all working file names, create the working directories and
# write the helper scripts used for remote execution.
#
# - An unknown option prints the usage text and exits.
# - "-J parse_blout FILE" runs job_parse_blout() and exits before any setup.
# - Dies when the input file does not exist, when no output name was given,
#   or when a working directory cannot be created.
sub parse_para_etc {
  my $arg;
  while($arg = shift) {
    ## input/output:
    if    ($arg eq "-i")       { $db_in     = shift; }
    elsif ($arg eq "-o")       { $db_out    = shift; }
    elsif ($arg eq "-l")       { $len_t     = shift; }
    ## thresholds
    elsif ($arg eq "-c")       { $NR_clstr  = shift; }
    elsif ($arg eq "-ce")      { $NR_clstre = shift; }
    elsif ($arg eq "-G")       { $g_iden    = shift; }
    elsif ($arg eq "-aL")      { $opt_aL    = shift; }
    elsif ($arg eq "-aS")      { $opt_aS    = shift; }
    elsif ($arg eq "-g")       { $opt_g     = shift; }
    elsif ($arg eq "-circle")  { $circle    = shift; }
    elsif ($arg eq "-sl")      { $skip_long = shift; }
    ## program
    elsif ($arg eq "-prog")    { $blast_prog= shift; }
    elsif ($arg eq "-p")       { $prof_para = shift; }
    elsif ($arg eq "-dprof")   { $prof_db   = shift; die "option -dprof no longer supported!";}
    elsif ($arg eq "-s")       { $bl_para   = shift; }
    elsif ($arg eq "-k")       { $keep_bl   = shift; }
    elsif ($arg eq "-bs")      { $bl_STDIN  = shift; }
    ## compute
    elsif ($arg eq "-exec")    { $exec_mode = shift; }
    elsif ($arg eq "-host")    { $num_qsub  = shift; }
    elsif ($arg eq "-para")    { $para_no   = shift; }
    elsif ($arg eq "-shf")     { $sh_file   = shift; }
    elsif ($arg eq "-blp")     { $bl_threads = shift; }
    elsif ($arg eq "-bat")     { $batch_no_per_node = shift; }
    ## job:
    elsif ($arg eq "-rs")      { $restart_seg = shift; }
    elsif ($arg eq "-rf")      { $reformat_seg= shift; }
    elsif ($arg eq "-restart") { $restart_in= shift; }
    elsif ($arg eq "-J")       { $job       = shift; $job_file = shift; }
    ## blast path
    elsif ($arg eq "-P")       { $bl_path   = shift; }
    else                       { print_usage(); exit(); }
  }

  # special jobs
  if ($job eq "parse_blout") { job_parse_blout(); exit();}

  # Remember what the user asked for. The legacy branch below rewrites
  # "megablast" to "blastn", which would otherwise make the megablast case of
  # the blast+ setup unreachable.
  my $requested_prog = $blast_prog;

  if ($requested_prog eq "blastn") {
    $formatdb  = "formatdb -p F";
    $blast_exe = "blastall -p blastn -m 8";
  }
  elsif ($requested_prog eq "megablast") {
    $blast_prog = "blastn"; #### back to blastn for blast parser type
    $formatdb  = "formatdb -p F";
    $blast_exe = "megablast -H 100 -D 2 -m 8";
  }
  elsif ($requested_prog eq "blastpgp") {
    $blast_exe = "blastpgp -m 8 -j 3";
  }

  #### for blast+
  # NOTE(review): $bl_para is replaced here even when the user supplied -s, so
  # a custom -s value has no effect while $bl_plus is on. Left unchanged
  # because the intended merge rule cannot be determined from this file.
  if ($bl_plus) {
    $formatdb  = "makeblastdb -dbtype prot -max_file_sz 8GB";
    $blast_exe = "blastp -outfmt 6";
    $bl_para   = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";

    if ($requested_prog eq "blastn") {
      $formatdb  = "makeblastdb -dbtype nucl -max_file_sz 8GB";
      # nucleotide searches need the blastn binary; blastp has no such -task
      $blast_exe = "blastn -task blastn -outfmt 6";
      $bl_para   = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";
    }
    elsif ($requested_prog eq "megablast") {
      $blast_prog = "blastn"; #### back to blastn for blast parser type
      $formatdb  = "makeblastdb -dbtype nucl -max_file_sz 8GB";
      $blast_exe = "blastn -task megablast -outfmt 6";
      $bl_para   = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";
    }
    elsif ($requested_prog eq "blastpgp") {
      $blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
    }
  }

  if ($bl_path) {
    $blast_exe = "$bl_path/$blast_exe";
    $formatdb  = "$bl_path/$formatdb";
  }

  (-e $db_in) || die "No input";
  ($db_out)   || die "No output";

  $db_clstr     = "$db_out.clstr";
  $db_log       = "$db_out.log";
  $db_out1      = "$db_out.out";
  $seq_dir      = "$db_in-seq";
  $bl_dir       = "$db_in-bl";
  $restart_file = "$db_out.restart";   # no leading blank in the file name

  $tmp_db             = "$db_in.$pid";
  $remote_perl_script = "$tmp_db-bl.pl";
  $remote_sh_script   = "$tmp_db-bl.sh";

  # Create the working directories without going through the shell, so paths
  # with blanks or shell metacharacters are handled; existing directories
  # (e.g. on restart) are reused.
  foreach my $dir ($bl_dir, $seq_dir) {
    next if (-d $dir);
    mkdir($dir) || die "Can not create directory $dir: $!";
  }

  write_remote_perl_script();
  write_remote_sh_script();
  return;
}
156 ########## END parse_para_etc
157
158
# Read the fasta file $db_in and register, through add_seq(), every record
# whose sequence (with all whitespace removed) is longer than $len_t.
# Dies when the file cannot be opened or when no sequence was kept; on success
# writes the total count to the OUTT handle.
sub read_db {
  my $des = "";
  my $seq = "";

  open(my $dbin, '<', $db_in) || die "Can not open $db_in";
  while (my $ll = <$dbin>) {
    # Strip the line ending only. chop() would also eat the last residue when
    # the final line of the file has no newline.
    $ll =~ s/[\r\n]+$//;
    if ($ll =~ /^>/) {
      # a new defline closes the previous record
      $seq =~ s/\s//g;
      if (length($seq) > $len_t) { add_seq($des, $seq); }
      $des = $ll; $seq = "";
    }
    else { $seq .= $ll; }
  }
  # last record of the file
  $seq =~ s/\s//g;
  if (length($seq) > $len_t) { add_seq($des, $seq); }
  close($dbin);

  ($NR_no && $NR_no >= 1) || die "No sequence readin";

  print OUTT "Total seqs $NR_no in $db_in\n";
  return;
}
184 ########## END read_db
185
186
# Register one sequence: append it and its bookkeeping slots to the parallel
# global arrays and increase $NR_no.
#   $des - defline; only the part before the first whitespace is stored
#   $seq - sequence string; its length is stored in @lens
sub add_seq {
  my ($des, $seq) = @_;
  # ".*" (not ".+") so that a lone trailing blank or carriage return is removed
  # as well; /s lets the tail include a newline should one be present.
  $des =~ s/\s.*$//s;
  push(@seqs,   $seq);
  push(@dess,   $des);
  push(@lens,   length($seq));
  push(@idens,  0);
  push(@passeds,0);
  push(@NR_clstr_nos,0);
  push(@in_bg, 0);
  $NR_no++;
  return;
}
200 ########## END add_seq
201
202
# Open the progress file ($db_out1, handle OUTT) and the log file ($db_log,
# handle LOG) in append mode, switch both to unbuffered output and write the
# start time to OUTT. STDOUT is the selected handle again on return.
sub open_LOG {
  unless (open(OUTT, ">> $db_out1")) {
    die "can not open $db_out1";
  }
  select(OUTT);
  $| = 1; ### file handle flush
  print OUTT "Started $date";

  unless (open(LOG, ">> $db_log")) {
    die "Can not open $db_log";
  }
  select(LOG);
  $| = 1; ### file handle flush

  select(STDOUT);
  return;
}
213 ########## END open_LOG
214
# Append one line of text to the LOG handle; the newline is added here.
sub write_LOG {
  my ($message) = @_;
  print LOG "$message\n";
}
219
{## use static variables
  # counts reported by the previous summary line, kept between calls
  my $prev_clusters = 0;
  my $prev_passed   = 0;

  # Progress output on OUTT. Prints a dot for every 10th sequence and a summary
  # line for every 100th sequence or whenever $flag is true.
  #   $i0        - zero-based position of the sequence just finished
  #   $NR90_no   - number of clusters so far
  #   $NR_passed - number of sequences passed so far
  #   $NR_no     - total number of sequences
  #   $flag      - force a summary line
  sub watch_progress {
    my ($i0, $NR90_no, $NR_passed, $NR_no, $flag) = @_;
    my $done = $i0 + 1;

    print OUTT "." if ($done % 10  == 0);
    $flag = 1      if ($done % 100 == 0);
    return unless ($flag);

    # percentage with two decimals, truncated rather than rounded
    my $percent      = int($NR_passed / $NR_no * 10000) / 100;
    my $new_clusters = $NR90_no   - $prev_clusters;
    my $new_passed   = $NR_passed - $prev_passed;

    # user + system time of this process and of its children
    my ($user, $system, $child_user, $child_system) = times();
    my $cpu = $user + $system + $child_user + $child_system;

    print OUTT
      "$done finished $NR90_no clusters $NR_passed passed $new_clusters/$new_passed clstr/passed $percent% done $cpu cpu\n";

    $prev_clusters = $NR90_no;
    $prev_passed   = $NR_passed;
    return;
  }
}
246
247
# Write the completion time and the CPU total reported by total_remote_cpu()
# to OUTT, then close both the OUTT and LOG handles.
sub close_LOG {
  my $finished_at = `date`;
  print OUTT "Completed $finished_at\n";

  my $remote_cpu = total_remote_cpu();
  print OUTT "Total CPUs on remote hosts: $remote_cpu\n";

  close(OUTT);
  close(LOG);
  return;
}
256 ########## END close_LOG
257
258 ###### need to change to read dir because
# Sum the numbers found, one per line, in "$seq_dir/host.N.cpu" for
# N = 0 .. $num_qsub-1 and return the total. Files that cannot be opened are
# skipped.
sub total_remote_cpu {
  my $tt = 0;
  for (my $j = 0; $j < $num_qsub; $j++) {
    open(my $tcpu, '<', "$seq_dir/host.$j.cpu") || next;
    while (my $ll = <$tcpu>) {
      # chomp, not chop: a last line without newline must keep its final digit
      chomp($ll);
      next unless ($ll =~ /\S/);   # blank lines add nothing
      $tt += $ll;
    }
    close($tcpu);
  }
  return $tt;
}
272 ########## END total_remote_cpu
273
274
# Special job (-J parse_blout FILE): run process_blout_blastp_blastn() on
# $job_file and write its rows, tab separated, to "$job_file.out", followed by
# a "#" line marking the end. Returns silently when that file cannot be opened
# for writing.
sub job_parse_blout {
  my @rows = process_blout_blastp_blastn($job_file);

  unless (open(BLOUT2, "> $job_file.out")) {
    return;
  }
  foreach my $row (@rows) {
    print BLOUT2 join("\t", @{$row}), "\n";
  }
  print BLOUT2 "#\n";
  close(BLOUT2);
  return;
}
287 ########## END job_parse_blout
288
289
# Dump the clustering state to $restart_file, one line per sequence in the
# order given by @NR_idx:
#   index <TAB> cluster number <TAB> identity string <TAB> passed flag
# read_restart() reads this format back.
sub write_restart {
  open(RES, "> $restart_file") || die;

  foreach my $pos (0 .. $NR_no - 1) {
    my $seq_idx = $NR_idx[$pos];
    my @fields  = ($seq_idx, $NR_clstr_nos[$seq_idx], $idens[$seq_idx], $passeds[$seq_idx]);
    print RES join("\t", @fields) . "\n";
  }

  close(RES);
  return;
}
302 ########## END write_restart
303
304
# Load the clustering state written by write_restart() from $restart_in.
# Fills @NR_clstr_nos, @idens, @passeds and @NR_idx and recounts $NR_passed
# and $NR90_no. Returns the sorted position following the last representative
# (identity "*") that is flagged as passed, or 0 when there is none.
# Dies when the restart file cannot be opened.
sub read_restart {
  my ($ii, $i0, $i, $ll);
  my @lls;
  open(my $resin, '<', $restart_in)
    || die "Can not open restart file $restart_in: $!";

  $NR_passed = 0;
  $NR90_no   = 0;
  $ii = -1;
  $i0 = 0;
  while ($ll = <$resin>) {
    # Strip the line ending only. chop() would remove the passed flag itself
    # when the last line has no newline.
    $ll =~ s/[\r\n]+$//;
    # a blank line carries no record and must not overwrite entry 0
    next if ($ll eq "");

    @lls = split(/\t/, $ll);
    $i = $lls[0];
    $NR_clstr_nos[$i] = $lls[1];
    $idens[$i]        = $lls[2];
    $passeds[$i]      = $lls[3];
    $NR_passed++ if ($lls[3]);

    if ($lls[2] eq "*") { #rep
      $NR90_no++;
      $ii = $i0 if ($lls[3]);
    }
    $NR_idx[$i0] = $i;
    $i0++; # idx of sorted , see write_restart
  }
  close($resin);

  $ii++; # $ii to be last rep processed
  return $ii;
}
335 ########## END read_restart
336
337
# Write the cluster file $db_clstr. Passed sequences are grouped by their
# cluster number (only numbers below $NR90_no are used) in the order given by
# @NR_idx. Each cluster is printed as a ">Cluster N" header followed by one
# numbered line per member; representatives end in "*", other members in
# "at <identity string>".
sub write_db_clstr {
  # one member list per cluster
  my @members_of = map { [] } (1 .. $NR90_no);

  foreach my $pos (0 .. $NR_no - 1) {
    my $seq_idx = $NR_idx[$pos];
    next if (not $passeds[$seq_idx]);
    my $cluster = $NR_clstr_nos[$seq_idx];
    next if (not ($cluster < $NR90_no));
    push(@{ $members_of[$cluster] }, $seq_idx);
  }

  open(DBCLS, "> $db_clstr") || die "Can not write $db_clstr";
  foreach my $cluster (0 .. $NR90_no - 1) {
    print DBCLS ">Cluster $cluster\n";
    my $rank = 0;
    foreach my $seq_idx (@{ $members_of[$cluster] }) {
      # only the first whitespace-delimited word of the description is shown
      my ($name) = (split(/\s+/, $dess[$seq_idx]))[0];
      my $tail   = ($idens[$seq_idx] eq "*") ? "*\n" : "at $idens[$seq_idx]\n";
      print DBCLS "$rank\t$lens[$seq_idx]"."aa, $name... ";
      print DBCLS $tail;
      $rank++;
    }
  }
  close(DBCLS);

  @members_of = ();
  return;
}
368 ########## END write_db_clstr
369
370
# Delete the blast files of processed representatives, walking the sorted
# order backwards from position $NR_sofar. For each passed representative the
# file "$bl_dir/<idx>.out" is removed, and "$bl_dir/<idx>" as well unless
# $bl_STDIN is set. The walk stops at the first representative whose ".out"
# file is already gone. Does nothing when $keep_bl is set.
sub remove_raw_blout {
  my $NR_sofar = shift;
  return if ($keep_bl);

  for (my $i0 = $NR_sofar; $i0 >= 0; $i0--) {
    my $i = $NR_idx[$i0];
    next unless $passeds[$i];
    next unless ($idens[$i] eq "*"); #only reps have blout
    my $fout = "$bl_dir/$i";
    last unless (-e "$fout.out");    #removed from last call

    # unlink instead of `rm -f`: no shell is spawned per file and paths with
    # blanks or shell metacharacters are handled. Like rm -f, a file that is
    # missing is not an error.
    unlink($fout) if (not $bl_STDIN);
    unlink("$fout.out");
  }
  return;
}
387 ########## END remove_raw_blout
388
389
# Background variant of remove_raw_blout(): instead of deleting the files
# directly, write the "rm -f" commands for the same set of files to the shell
# script "$tmp_db-rm-<NR_sofar>.sh", whose last command removes the script
# itself, and start it with sh in the background after a 3 second pause.
# Does nothing when $keep_bl is set; dies when the script cannot be written.
sub remove_raw_blout_bg {
  my $NR_sofar = shift;
  return if ($keep_bl);

  my $tmp_sh_script = "$tmp_db-rm-$NR_sofar.sh";
  open(my $outrm, '>', $tmp_sh_script) || die "can not write to $tmp_sh_script";

  for (my $i0 = $NR_sofar; $i0 >= 0; $i0--) {
    my $i = $NR_idx[$i0];
    next unless $passeds[$i];
    next unless ($idens[$i] eq "*"); #only reps have blout
    my $fout = "$bl_dir/$i";
    last unless (-e "$fout.out");    #removed from last call
    if (not $bl_STDIN) { print $outrm "rm -f $fout\n"; }
    # Every command needs its own line. Without the newline this command was
    # fused with the following one and the .out files were never removed.
    print $outrm "rm -f $bl_dir/$i.out\n";
  }
  print $outrm "rm -f $tmp_sh_script\n"; ## remove self
  close($outrm);
  sleep(3);

  my $cmd = `sh $tmp_sh_script >/dev/null 2>&1 &`;
  return;
}
414 ########## END remove_raw_blout_bg
415
416
# Assign the hits of representative $id to the current cluster ($NR90_no).
# Hits are read from "$bl_dir/$id.out" (tab separated, terminated by a line
# starting with "#") after wait_blast_out() returns. A hit is skipped when its
# index is not below $NR_no, when it is itself a representative, or when it is
# longer than the representative. A sequence that already passed is only
# reassigned when the new hit's field 3 is smaller than the first part of its
# stored identity string (presumably an e-value; lower is better).
# Returns silently when the hit file cannot be opened.
sub fish_other_homolog {
  $id = shift; # real idx, not sorted idx

  my $hit_file = "$bl_dir/$id.out";
  wait_blast_out($hit_file);
  open(BLPOUT, $hit_file) || return;

  my @hits = ();
  while (my $line = <BLPOUT>) {
    last if ($line =~ /^#/);
    chop($line);
    push(@hits, [split(/\t/, $line)]);
  }
  close(BLPOUT);

  my $rep_len = $lens[$id];

  foreach my $hit (@hits) {
    my $id1 = $hit->[0];
    next if (not ($id1 < $NR_no));
    next if ($idens[$id1] eq "*");       #existing reps
    next if ($lens[$id1] > $rep_len);    # in opt_g=1 mode, preventing it from being clustered into short rep

    my $was_passed = $passeds[$id1];
    if ($was_passed) {                   #### if this hit is better -g 1 mode
      my ($old_e) = (split(/\//, $idens[$id1]))[0];
      next if (not ($hit->[3] < $old_e));
    }

    $idens[$id1]        = "$hit->[3]/$hit->[2]aa/$hit->[1]%";
    $passeds[$id1]      = 1;
    $NR_clstr_nos[$id1] = $NR90_no;
    $NR_passed++ if (not $was_passed);   # reassignments are not counted twice
  }
  return;
}
455 ########## END fish_other_homolog
456
457
########### A subject may have HSPs on both the + and - strand.
########### Keep, per subject, only the HSPs whose strand ("frame") equals the
########### strand of that subject's first listed (top) HSP.
########### The subject is identified by the part of {id} before the first ".".
########### Updates $self->{no} and $self->{sbj} in place.
sub keep_strand_with_top_hsp {
  my $self = shift;

  my %top_strand_of = ();
  my @kept = ();
  foreach my $idx (0 .. $self->{no} - 1) {
    my $hsp = $self->{sbj}->[$idx];
    my ($subject) = split(/\./, $hsp->{id});

    # the first HSP seen for a subject defines its strand
    unless (defined($top_strand_of{$subject})) {
      $top_strand_of{$subject} = $hsp->{frame};
    }
    next if ($hsp->{frame} ne $top_strand_of{$subject});

    push(@kept, $self->{sbj}->[$idx]);
  }
  $self->{no}  = scalar(@kept);
  $self->{sbj} = [@kept];
}
481 ########## END keep_strand_with_top_hsp
482
########## for blastpgp -j no (no>1)
########## Keep only the hits of the last round. A new round is recognised by
########## a score that is higher than the score of the preceding hit; all hits
########## collected before that point are discarded.
########## Updates $self->{no} and $self->{sbj} in place.
sub keep_hsp_of_last_round {
  my $self = shift;

  my @kept = ();
  my $prev_score = 9999999*9999999*9999999; # a big one
  foreach my $idx (0 .. $self->{no} - 1) {
    my $hsp   = $self->{sbj}->[$idx];
    my $score = $hsp->{score};

    # score went up: this is new round of hits, start collecting again
    @kept = () if ($score > $prev_score);

    $prev_score = $score;
    push(@kept, $self->{sbj}->[$idx]);
  }
  $self->{no}  = scalar(@kept);
  $self->{sbj} = [@kept];
}
507 ########## END keep_hsp_of_last_round
508
########## If a query hits a subject with multiple HSPs,
########## only the first listed (top) HSP of that subject is kept.
########## {id} is split at "."; entries whose second part is not greater
########## than 0 are dropped altogether.
########## Updates $self->{no} and $self->{sbj} in place.
sub keep_top_hsp {
  my $self = shift;

  my %seen = ();
  my @kept = ();
  foreach my $idx (0 .. $self->{no} - 1) {
    my $hsp = $self->{sbj}->[$idx];
    my ($subject, $len_sub) = split(/\./, $hsp->{id});
    next if (not ($len_sub > 0));
    next if (defined($seen{$subject}));

    $seen{$subject} = 1;
    push(@kept, $self->{sbj}->[$idx]);
  }
  $self->{no}  = scalar(@kept);
  $self->{sbj} = [@kept];
}
532 ########## keep_top_hsp
533
534 ########## let the top hsp to start at 0 for both query and subject
535 ########## i.e. the begining of HSP to be new original - coordinate 0
536 ########## then reset all other HSPs' alignment coordinates
# Re-base the alignment coordinates of every subject that has at least two
# HSPs, so that its top (first listed) HSP starts at coordinate 0 on both the
# query and the subject; used for circular sequences.
#
# $self is the hash ref built by readblast_m8. Consecutive hits with the same
# subject id (the part of {id} before the first ".") form one group. The hits
# are modified in place; nothing is returned.
sub reset_alignment_coor_for_circle_seq {
    my ($self) = @_;

    my $last_id     = "";
    my $group_start = 0;    # index of the top HSP of the current subject
    my $hsp_count   = 0;    # number of HSPs for the current subject
    for (my $i = 0; $i < $self->{no}; $i++) {
        my ($id1) = split(/\./, $self->{sbj}->[$i]->{id});

        if ($id1 ne $last_id) {
            # a new subject starts: finish the previous group first
            _rebase_circular_hsp_group($self, $group_start, $hsp_count);
            $group_start = $i;
            $hsp_count   = 0;
        }
        $last_id = $id1;
        $hsp_count++;
    }

    # last subject
    _rebase_circular_hsp_group($self, $group_start, $hsp_count);

    return;
}

# Shift the $count HSPs starting at index $first so that the smaller end of
# the top HSP becomes 0 on query and subject. Coordinates that become negative
# wrap around by the sequence length taken from the ".<length>" id suffix.
# Groups with fewer than 2 HSPs are left untouched.
sub _rebase_circular_hsp_group {
    my ($self, $first, $count) = @_;
    return unless ($count > 1);    # it is only necessary with at least 2 HSPs

    my $top   = $self->{sbj}->[$first];
    my $len_q = (split(/\./, $top->{qid}))[1];
    my $len_s = (split(/\./, $top->{id}))[1];
    # copy the reference points before the loop: the top HSP itself is shifted too
    my $ref_q = ($top->{qfrom} < $top->{qend}) ? $top->{qfrom} : $top->{qend};
    my $ref_s = ($top->{sfrom} < $top->{send}) ? $top->{sfrom} : $top->{send};

    for (my $k = $first; $k < $first + $count; $k++) {
        my $hsp = $self->{sbj}->[$k];
        foreach my $key ('qfrom', 'qend') {
            $hsp->{$key} -= $ref_q;
            $hsp->{$key} += $len_q if ($hsp->{$key} < 0);
        }
        foreach my $key ('sfrom', 'send') {
            $hsp->{$key} -= $ref_s;
            $hsp->{$key} += $len_s if ($hsp->{$key} < 0);
        }
    }
    return;
}
586 ########## reset_alignment_coor_for_circle_seq
587
588
# process_blout_blastp_blastn($blout)
# Reads tabular BLAST output through readblast_m8("", $blout) and returns the
# list of accepted hits. Each hit is an array ref:
#   [subject id, identity, alignment length(s), expect, frame]
# Sequence lengths come from the ".<length>" suffix of the query / subject ids.
#  - $g_iden == 0 (local identity): only the first HSP of each subject is kept
#    (keep_top_hsp); its identity and alignment length are used as they are.
#  - otherwise (global identity): HSPs of one subject are accumulated as long
#    as they neither overlap (overlap1) nor cross (cross1) an already accepted
#    HSP of that subject. Identity is the accumulated identical letters over
#    the subject length, and the third field holds the accepted alignment
#    lengths joined by ":".
# In both modes a hit is accepted when
#   (identity/100 > $NR_clstr or expect < $NR_clstre)
#   and coverage of the subject >= $opt_aS and coverage of the query >= $opt_aL.
# NOTE(review): readblast_m8 returns nothing when the file cannot be opened;
# that case is not checked here.
589 sub process_blout_blastp_blastn {
590 my ($i, $j, $k, $i0, $j0, $k0);
591 my $blout = shift;
592 my @blhits = ();
593
594 #### need $len_rep
595 my $len_rep = 0;
596 my $bl = readblast_m8("", $blout);
597 if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
598 if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
599
600 if ($g_iden == 0 ) { #### Local identity
601 keep_top_hsp($bl); #### local alignment, only the top HSP
602
603 for ($i=0; $i<$bl->{no}; $i++) {
604 my $p = $bl->{sbj}->[$i];
605 my ($id1, $len_sub) = split(/\./, $p->{id});
606 my $frame = $p->{frame};
# the query length is taken once, from the first hit's query id
607 if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
608 my $iden = $p->{iden};
609 next unless (($len_sub >0) and ($len_rep>0));
610 my $cov_aS = $p->{alnln} / $len_sub;
611 my $cov_aL = $p->{alnln} / $len_rep;
612 my $exp1 = $p->{expect};
613
614 if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
615 push(@blhits, [$id1, $iden, $p->{alnln}, $exp1, $frame]);
616 }
617 }
618 return @blhits;
619 } #### END if ($g_iden == 0 )
620 else { #### Global identity
621 if (($blast_prog eq "blastn") and $circle) { reset_alignment_coor_for_circle_seq($bl); }
622 #### get colinear non-overlapping HSPs
# NOTE: each @hsp entry built below has 8 fields; the last one is the frame
623 my @hsp = (); #### [id, len, qfrom, qend, sbegin, send, expect]
624 my $iden_letters = 0;
625 my $aln_letters = 0;
626 my @aln_lens = ();
627 my $hsp_no = 0;
628 for ($i=0; $i<$bl->{no}; $i++) {
629 my $p = $bl->{sbj}->[$i];
630 my ($id1, $len_sub) = split(/\./, $p->{id});
631 my $frame = $p->{frame};
632 if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
633 next unless (($len_sub >0) and ($len_rep>0));
634
635 if ($hsp_no) {
# a different subject id means the previous subject is complete
636 if ($id1 ne $hsp[0]->[0]) {
637 #### 1. parse previous subject's HSPs
# identity in percent, truncated to two decimals
638 my $iden = int($iden_letters / $hsp[0]->[1] * 10000)/100;
639 my $cov_aS = $aln_letters / $hsp[0]->[1];
640 my $cov_aL = $aln_letters / $len_rep;
# expect and frame are taken from the first accepted HSP of the subject
641 my $exp1 = $hsp[0]->[6];
642 my $frame = $hsp[0]->[7];
643
644 if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
645 #push(@blhits, [$hsp[0]->[0], $iden, $aln_letters, $exp1, $frame]);
646 push(@blhits, [$hsp[0]->[0], $iden, join(":", @aln_lens), $exp1, $frame]);
647 }
648 #### 2. init some values
649 @hsp = ();
650 $iden_letters = 0;
651 $aln_letters = 0;
652 @aln_lens = ();
653 $hsp_no = 0;
654 }
655 }
656
657 #check whether overlap with previous high score HSPs
658 my $overlap_flag = 0;
659 for ($j=0; $j<$hsp_no; $j++) {
660 if (overlap1($p->{qfrom}, $p->{qend}, $hsp[$j]->[2], $hsp[$j]->[3])) { $overlap_flag = 1; last; }
661 if (overlap1($p->{sfrom}, $p->{send}, $hsp[$j]->[4], $hsp[$j]->[5])) { $overlap_flag = 1; last; }
662 }
663 next if ($overlap_flag);
664
665 #check whether this HSP cross with previous high score HSPs
666 my $cross_flag = 0;
667 for ($j=0; $j<$hsp_no; $j++) {
668 if (cross1($p->{qfrom}, $p->{qend}, $hsp[$j]->[2], $hsp[$j]->[3],
669 $p->{sfrom}, $p->{send}, $hsp[$j]->[4], $hsp[$j]->[5])) {
670 $cross_flag = 1; last;
671 }
672 }
673 next if ($cross_flag);
674
# this HSP is accepted: remember it and add it to the running totals
675 push(@hsp, [$id1, $len_sub, $p->{qfrom}, $p->{qend}, $p->{sfrom}, $p->{send}, $p->{expect}, $p->{frame}]);
676 $iden_letters += int($p->{iden} * $p->{alnln} / 100);
677 $aln_letters += $p->{alnln};
678 push(@aln_lens, $p->{alnln});
679 $hsp_no++;
680 }
681
# same evaluation as in the loop, for the subject that was still open at the end
682 if ($hsp_no) { #last record
683 #### 1. parse previous subject's HSPs
684 my $iden = int($iden_letters / $hsp[0]->[1] * 10000)/100;
685 my $cov_aS = $aln_letters / $hsp[0]->[1];
686 my $cov_aL = $aln_letters / $len_rep;
687 my $exp1 = $hsp[0]->[6];
688 my $frame = $hsp[0]->[7];
689
690 if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
691 #push(@blhits, [$hsp[0]->[0], $iden, $aln_letters, $exp1, $frame]);
692 push(@blhits, [$hsp[0]->[0], $iden, join(":", @aln_lens), $exp1, $frame]);
693 }
694 }
695
696 return @blhits;
697 }
698 }
699 ########## END process_blout_blastp_blastn
700
701
# overlap1($b1, $e1, $b2, $e2)
# Overlap between the ranges b1..e1 and b2..e2; either range may be given in
# reverse order. Returns 0 when the ranges are disjoint, otherwise the
# difference between the smaller end and the larger begin (which is also 0
# when the ranges merely touch in one position).
sub overlap1 {
    my ($b1, $e1, $b2, $e2) = @_;

    # put both ranges into ascending order
    ($b1, $e1) = ($e1, $b1) if ($e1 < $b1);
    ($b2, $e2) = ($e2, $b2) if ($e2 < $b2);

    return 0 if (($e2 < $b1) or ($b2 > $e1));

    my $shared_end   = ($e1 < $e2) ? $e1 : $e2;
    my $shared_begin = ($b1 > $b2) ? $b1 : $b2;
    return $shared_end - $shared_begin;
}
713 ########## END overlap1
714
715 ## modified on 2013_0818 to handle +- frames
# cross1($q_b1, $q_e1, $q_b2, $q_e2, $s_b1, $s_e1, $s_b2, $s_e2)
# Tells whether two HSPs "cross", i.e. are not co-linear. HSP 1 covers
# q_b1..q_e1 on the query and s_b1..s_e1 on the subject, HSP 2 likewise.
# Returns 1 when they cross, 0 otherwise.
#
# The orientation of an HSP is the product of its query and subject
# directions (begin < end counts as +1, anything else as -1). Two HSPs with
# different orientation always cross. After sorting each range ascending:
#
#              0                 q_b1    q_e1         q_b2    q_e2       qlen
# query    5'  ==================================================================
# match                          ||||||||             ||||||||
# subject  5'  ==================================================================>> frame +
#              0                 s_b1    s_e1         s_b2    s_e2       slen
#
# match                          ||||||||             ||||||||
# subject  3'  ==================================================================>> frame -
#              slen              s_e1    s_b1         s_e2    s_b2       0
sub cross1 {
    my ($q_b1, $q_e1, $q_b2, $q_e2,
        $s_b1, $s_e1, $s_b2, $s_e2) = @_;

    my $orient1 = (($q_b1 < $q_e1) ? 1 : -1) * (($s_b1 < $s_e1) ? 1 : -1);
    my $orient2 = (($q_b2 < $q_e2) ? 1 : -1) * (($s_b2 < $s_e2) ? 1 : -1);
    return 1 if ($orient1 != $orient2);    # one ++ and one +-

    # put all four ranges into ascending order
    ($q_b1, $q_e1) = ($q_e1, $q_b1) if ($q_e1 < $q_b1);
    ($q_b2, $q_e2) = ($q_e2, $q_b2) if ($q_e2 < $q_b2);
    ($s_b1, $s_e1) = ($s_e1, $s_b1) if ($s_e1 < $s_b1);
    ($s_b2, $s_e2) = ($s_e2, $s_b2) if ($s_e2 < $s_b2);

    my $query_step   = $q_b2 - $q_b1;
    my $subject_step = ($orient1 > 0)
                     ? ($s_b2 - $s_b1)     # both ++
                     : ($s_e1 - $s_e2);    # both --

    return (($query_step * $subject_step < 0) ? 1 : 0);
}
754 ########## END cross1
755
756
757 ## cross1 as it was before the 2013_0818 change, i.e. without handling of +- frames
# cross1_before_2013_0818($q_b1, $q_e1, $q_b2, $q_e2, $s_b1, $s_e1, $s_b2, $s_e2)
# Older crossing test that ignores orientation: every range is first put into
# ascending order, then the two HSPs cross (return 1) when their order on the
# query is opposite to their order on the subject; otherwise 0 is returned.
sub cross1_before_2013_0818 {
    my ($q_b1, $q_e1, $q_b2, $q_e2,
        $s_b1, $s_e1, $s_b2, $s_e2) = @_;

    # only the range begins are needed once each range is ascending
    my $q_start1 = ($q_e1 < $q_b1) ? $q_e1 : $q_b1;
    my $q_start2 = ($q_e2 < $q_b2) ? $q_e2 : $q_b2;
    my $s_start1 = ($s_e1 < $s_b1) ? $s_e1 : $s_b1;
    my $s_start2 = ($s_e2 < $s_b2) ? $s_e2 : $s_b2;

    my $order_product = ($q_start2 - $q_start1) * ($s_start2 - $s_start1);
    return (($order_product < 0) ? 1 : 0);
}
770 ########## END cross1_before_2013_0818
771
# readblast_m8($q_seq, $filename)
# Parses tabular BLAST output (12 tab-separated columns, the "-m 8" layout).
#
#   $q_seq     not used by this routine
#   $filename  file to read; ignored when the global $bl_STDIN is true, in
#              which case the data is read from STDIN
#
# Returns a hash ref {no => number of hits, sbj => [hit, ...]}, or nothing
# when the file cannot be opened. Each hit is a hash ref with the keys
# qid id iden alnln ms gap qfrom qend sfrom send expect score frame, where
# frame is two characters ("+" or "-"): query direction, then subject
# direction. Lines without both a query id and a subject id (e.g. "#" comment
# lines) are skipped.
#
# Sample input:
# BLASTP 2.2.24 [Aug-08-2010]
# Query: gi|388328107|pdb|4DDG|A Chain A, Crystal Structure Of Human Otub1UBCH5B~UBUB
# Database: pdbaa.fa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
#gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 91.81 171 9 3 6 171 1 171 6e-89 323
#gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 96.51 86 3 0 235 320 155 240 2e-41 166
sub readblast_m8 {
    my ($q_seq, $filename) = @_;

    my $fh;
    if ($bl_STDIN) {
        $fh = \*STDIN;
    }
    else {
        # three-argument open: the file name is never interpreted as a mode or a pipe
        open($fh, '<', $filename) || return;
    }

    my @this_sbj = ();
    while (my $ll = <$fh>) {
        # chomp, not chop: a final line without newline must keep its last character
        chomp($ll);
        my @lls = split(/\t/, $ll);
        next unless ($lls[0] and $lls[1]);

        my $frame = "";
        $frame .= ($lls[6] < $lls[7]) ? "+" : "-";
        $frame .= ($lls[8] < $lls[9]) ? "+" : "-";

        push(@this_sbj, {
            'qid'    => $lls[0],
            'id'     => $lls[1],
            'iden'   => $lls[2],
            'alnln'  => $lls[3],
            'ms'     => $lls[4],
            'gap'    => $lls[5],
            'qfrom'  => $lls[6],
            'qend'   => $lls[7],
            'sfrom'  => $lls[8],
            'send'   => $lls[9],
            'expect' => $lls[10],
            'score'  => $lls[11],
            'frame'  => $frame,
        });
    }
    close($fh) if (not $bl_STDIN);

    my $self = {
        'no'  => scalar(@this_sbj),
        'sbj' => [@this_sbj],
    };
    return $self;
}
822 ########## END readblast_m8
823
824
# Writes the remaining candidate sequences to the FASTA file $tmp_db and
# formats it as a BLAST database.
#
# @NR_idx is walked from its last entry backwards (shortest to longest) and
# the walk stops at the first representative (identity "*"). Sequences shorter
# than $opt_aL_lower_band are skipped, and so are sequences that are already
# clustered when $opt_g is 0. Every record is named "<index>.<length>".
#
# Before formatting, the routine waits until no entry of $seq_dir contains
# "lock" in its name.
#
# Returns (number of sequences written, their total length), or (0, 0)
# without formatting when nothing was written. Dies when the expected
# database files do not exist after running $formatdb.
sub blast_formatdb {
    my $written   = 0;
    my $total_len = 0;

    open(FDB, "> $tmp_db") || die;
    foreach my $rank (reverse(0 .. $NR_no - 1)) {    ### from shortest to longest
        my $idx = $NR_idx[$rank];
        last if ($idens[$idx] eq "*");    ### last if reach rep
        next if ($lens[$idx] < $opt_aL_lower_band);
        next if ($passeds[$idx] and ($opt_g == 0));

        # wrap the sequence at 70 letters per line
        my $fasta = $seqs[$idx];
        $fasta =~ s/(.{70})/$1\n/g;
        $fasta =~ s/\n$//;
        print FDB ">$idx.$lens[$idx]\n$fasta\n";

        $written++;
        $total_len += $lens[$idx];
    }
    close(FDB);

    # wait for jobs that still hold a lock file in $seq_dir
    LOCK_WAIT: while (1) {
        next LOCK_WAIT unless (opendir(SEQDB, $seq_dir));
        my @locks = grep { /lock/ } readdir(SEQDB);
        closedir(SEQDB);

        last LOCK_WAIT if (not @locks);
        sleep(3);
    }

    return (0, 0) unless ($written > 0);

    my $cmd_line = ($bl_plus) ? "$formatdb -in $tmp_db" : "$formatdb -i $tmp_db";
    my $cmd = `$cmd_line`;

    # accept a protein or nucleotide database, single-volume or first volume (.00)
    my $formatted = 0;
    foreach my $base ("$tmp_db", "$tmp_db.00") {
        foreach my $type ("p", "n") {
            if ((-e "$base.${type}hr") and (-e "$base.${type}in") and (-e "$base.${type}sq")) {
                $formatted = 1;
            }
        }
    }
    $formatted || die "Can not formatdb";

    return ($written, $total_len);
}
869 ########## END blast_formatdb
870
871
# Deletes the temporary FASTA file $tmp_db together with the files matching
# "$tmp_db.p*" and "$tmp_db.n*", using the shell's "rm -f". The output of each
# command is stored in the global $cmd.
sub remove_blast_db {
    foreach my $suffix ("", ".p*", ".n*") {
        $cmd = `rm -f $tmp_db$suffix`;
    }

    return;
}
880 ########## END remove_blast_db
881
882
# Option help text, printed by print_usage. The text is interpolated when this
# statement runs, so "$keep_bl" below shows the value $keep_bl has at that
# point; "\$" and "\\" are escapes for a literal "$" and "\".
883 my $common_usage = <<EOD;
884
885 Options
886 input/output:
887 -i in_dbname, required
888 -o out_dbname, required
889 -l length_of_throw_away_sequences, default 10
890
891 thresholds:
892 -c clustering threshold (sequence identity), default 0.3
893 -ce clustering threshold (blast expect), default -1,
894 it means by default it doesn't use expect threshold,
895 but with positive value, the program cluster seqs if similarities
896 meet either identity threshold or expect threshold
897 -G (1/0) use global identity? default 1
898 two sequences Long (i.e. representative) and Short (redunant) may have multiple
899 alignment fragments (i.e. HSPs), see:
900 seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Long sequence
901 |||||||||||||||||| ///////////// i.e. representative
902 |||||||||||||||||| ///////////// sequence
903 ||||||||HSP 1 |||| ////HSP 2 ///
904 |||||||||||||||||| /////////////
905 |||||||||||||||||| /////////////
906 seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Short sequence
907 << length 1 >> << len 2 >> i.e. redundant
908 <<<<<<<<<<<< length of short sequence >>>>>>>>>>>>>> sequence
909
910 total identical letters from all co-linear and non-overlapping HSPs
911 Glogal identity = -------------------------------------------------------------------
912 length of short sequence
913 Local identity = identity of the top high score HSP
914 if you prefer to use -G 0, it is suggested that you also
915 use -aS, -aL, such as -aS 0.8, to prevent very short matches.
916 -aL alignment coverage for the longer sequence, default 0.0
917 if set to 0.9, the alignment must covers 90% of the sequence
918 -aS alignment coverage for the shorter sequence, default 0.0
919 if set to 0.9, the alignment must covers 90% of the sequence
920 -g (1/0), default 0
921 by cd-hit's default algorithm, a sequence is clustered to the first
922 cluster that meet the threshold (fast cluster). If set to 1, the program
923 will cluster it into the most similar cluster that meet the threshold
924 (accurate but slow mode)
925 but either 1 or 0 won't change the representatives of final clusters
926 -circle (1/0), default 0
927 when set to 1, treat sequences as circular sequence.
928 bacterial genomes, plasmids are circular, but their genome coordinate maybe arbitary,
929 the 2 HSPs below will be treated as non co-linear with -circle 0
930 the 2 HSPs below will be treated as co-linear with -circle 1
931 -------------circle-----------
932 | |
933 seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1
934 \\\\\\\\ /////////////
935 \\\\\\\\ /////////////
936 HSP 2 -> ////HSP 1 /// <-HSP 2
937 ///////////// \\\\\\\\
938 ///////////// \\\\\\\\
939 seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
940 | |
941 -----------circle--------------
942 -sl, length of very long sequences to be skipped, default 0, no skipping
943 e.g. -sl 5000 means sequences longer than 5000 aa will be treated as singleton clusters
944 without clustering, to save time, especially when there is -aL option in place, very
945 long sequences will not be clustered anyway.
946 program:
947 -prog (blastp, blastn, megablast, blastpgp), default blastp
948 -p profile search para, default
949 "-j 3 -F F -e 0.001 -b 500 -v 500"
950 -dprof database for building PSSM, default using input
951 you can also use another database that is more comprehensive like NR80
952 -s blast search para, default
953 "-F F -e 0.000001 -b 100000 -v 100000"
954 -bs (1/0) default 1
955 pipe blast results from into parser instead of save in hard drive (save time)
956
957 compute:
958 -exec (qsub, local) default local
959 this program writes a shell script to run blast, this script is
960 either performed locally by sh or remotely by qsub
961 with qsub, you can use PBS, SGE etc
962 -host number of hosts, ie number of qsub jobs
963 -para number of parallel blast job per qsub job (each blast can use multi cores), default 1
964 -blp number of threads per blast job, default 1
965 number of threads per blast job X number of parallel blast job per qsub job
966 should <= the number of cores in your computer
967 if your computer grid has 32 cores / node, do either of the followings
968 -para 4 -blp 8
969 -para 8 -blp 4
970 -para 16 -blp 2
971 -para 32 -blp 1
972 -bat number of sequences a blast job to process
973 -shf a filename for add local settings into the job shell script
974 for example, when you run PBS jobs, you can add quene name etc in this
975 file and this script will add them into the job shell script
976 e.g. template file for PBS
977 #!/bin/sh
978 #PBS -v PATH
979 #PBS -l walltime=8:00:00
980 #PBS -q job_queue.q
981
982 e.g. template file for SGE or OGE
983 #!/bin/sh
984 #\$ -v PATH
985 #\$ -q job_queue.q
986 #\$ -V
987 #\$ -pe orte 8
988
989 job:
990 -rs steps of save restart file and clustering output, default 5000
991 everytime after process 5000 sequences, program write a
992 restart file and current clustering information
993 -restart restart file, readin a restart file
994 if program crash, stoped, termitated, you can restart it by
995 add a option "-restart sth.restart"
996 -rf steps of re format blast database, default 200,000
997 if program clustered 200,000 seqs, it remove them from seq
998 pool, and re format blast db to save time
999 -J job, job_file, exe specific jobs like parse blast outonly
1000 DO NOT use it, it is only used by this program itself
1001 -k (1/0) keep blast raw output file, default $keep_bl
1002
1003 -P path to executables
1004 EOD
1005
1006
# Prints the usage banner, the option help held in $common_usage, and the
# author / citation notice to the currently selected output handle.
sub print_usage {
    my $usage_text = <<EOD;
Usage psi-cd-hit [Options]
$common_usage

==============================
by Weizhong Li, liwz\@sdsc.edu
==============================
If you find cd-hit useful, please kindly cite:

"Clustering of highly homologous sequences to reduce thesize of large protein database", Weizhong Li, Lukasz Jaroszewski & Adam GodzikBioinformatics, (2001) 17:282-283
"Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences", Weizhong Li & Adam Godzik Bioinformatics, (2006) 22:1658-1659

EOD

    print $usage_text;
    return;
}
1023 ########## END print_usage
1024
1025
1026 ## like above, but don't assign seqs to specific node
1027 ## while let nodes run them autoly
# run_batch_blast3($i0)
# Writes the next batch of query sequences into $seq_dir and starts the BLAST
# workers that pick them up. $i0 is the position in @NR_idx where the scan for
# unprocessed sequences starts.
#
# Steps:
#  1. qsub mode only: as long as %qsub_ids is not empty, jobs that qstat still
#     lists are force-deleted (qdel -f) and finished ones are forgotten; then
#     the files in $seq_dir whose names start with a digit are removed.
#  2. Up to $batch_no_per_node * $num_qsub * $para_no sequences that are not
#     clustered yet, not already queued (@in_bg) and at least
#     $opt_aL_upper_band long are written to "$seq_dir/<index>", each as one
#     FASTA record named "<index>.<length>".
#  3. qsub mode: $num_qsub jobs running $remote_sh_script are submitted and
#     their ids stored in %qsub_ids (dies if qsub prints no number).
#     local mode: the script is run with sh; backticks wait until it ends.
1028 sub run_batch_blast3 {
1029 my $i0 = shift;
1030 my ($id, $i, $j, $k, $cmd);
1031
1032 #### wait before qsubs
1033 if ($exec_mode eq "qsub") {
1034 while(1) {
# refresh %qstat_xml_data, which lists the jobs qstat currently reports
1035 SGE_qstat_xml_query();
1036 last unless (%qsub_ids);
1037
1038 my $wait_flag = 0;
1039 foreach my $qsub_id (keys %qsub_ids) {
1040 if (defined($qstat_xml_data{$qsub_id})) { #### still running
1041 $wait_flag = 1;
1042 $cmd = `qdel -f $qsub_id`; #### at this point, all running jobs are not necessary,
1043 print LOG "force delete un necessary job $qsub_id\n";
1044 }
1045 else {
1046 delete $qsub_ids{$qsub_id};
1047 }
1048 }
1049
1050 if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
1051 }
1052
1053 #### delete seq files from last batch
# NOTE(review): the result of opendir is not checked
1054 opendir(DIR1, $seq_dir);
1055 my @files = grep { /^\d/ } readdir(DIR1);
1056 closedir(DIR1);
1057 foreach $i (@files) {
1058 $cmd = `rm -f $seq_dir/$i`;
1059 print LOG "remove un necessary seq file $i\n"
1060 }
1061 }
1062
# upper limit for the number of sequences written in this batch
1063 my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
1064
# $k counts the sequences written; $i0 keeps advancing through @NR_idx
1065 for ($k=0; $i0<$NR_no; $i0++) {
1066 $id = $NR_idx[$i0];
1067 next if ($passeds[$id]);
1068 next if ($in_bg[$id]);
1069 next if ($lens[$id] < $opt_aL_upper_band);
1070 $in_bg[$id] = 1;
1071
1072 my $seq = $seqs[$id];
1073 open(SEQ, "> $seq_dir/$id") || die "Can not write";
1074 #print SEQ "$dess[$id]\n$seq\n";
1075 print SEQ ">$id.$lens[$id]\n$seq\n";
1076 close(SEQ);
1077 $k++;
1078 last if ($k >= $total_jobs);
1079 }
1080
1081 if ($exec_mode eq "qsub") {
1082 for ($j=0; $j<$num_qsub; $j++) {
1083 my $t = "psi-cd-hit-$j";
1084 my $cmd = `qsub -N $t $remote_sh_script`;
1085 my $qsub_id = 0;
# the first run of digits in the qsub output is taken as the job id
1086 if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
1087 print LOG "qsub querying $j, PID $qsub_id\n";
1088 $qsub_ids{$qsub_id} = 1;
1089 }
1090 }
1091 elsif ($exec_mode eq "local") {
1092 #my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
1093 my $cmd = `sh $remote_sh_script`;
1094 }
1095
1096 return;
1097 }
1098 ########## END run_batch_blast3
1099
1100
# Writes the job shell script $remote_sh_script.
#
# The script starts with the content of the user template $sh_file when one
# is given, otherwise with a minimal header that passes PATH to PBS / SGE
# jobs. It then changes into $pwd, starts $para_no copies of
# $remote_perl_script in the background (numbered 0 .. $para_no-1) and waits
# for all of them.
sub write_remote_sh_script {
    my $header = "#!/bin/sh\n#PBS -v PATH\n#\$ -v PATH\n";
    $header = `cat $sh_file` if ($sh_file);

    open(RESH, "> $remote_sh_script") || die;
    print RESH "$header\n\ncd $pwd\n";

    for (my $slot = 0; $slot < $para_no; $slot++) {
        print RESH "./$remote_perl_script $slot&\n";
    }
    print RESH "wait\n\n";
    close(RESH);

    return;
}
1128 ########## END write_remote_sh_script
1129
# Generates the worker script $remote_perl_script and makes it executable
# (chmod 755).
#
# The generated script takes a host label and an optional comma-separated id
# list; without a list it works on the all-digit file names found in $seq_dir.
# For each id whose sequence file exists and has no lock file yet it
#   - creates "$seq_dir/<id>.lock",
#   - runs BLAST on "$seq_dir/<id>" and hands the result to
#     "$script_name -J parse_blout" (piped when $bl_STDIN is true, through the
#     file "$bl_dir/<id>" otherwise),
#   - removes the sequence file and its lock.
# At the end it appends its CPU time (sum of the four values of times()) to
# "$seq_dir/host.<host>.cpu".
#
# Plain variables in the heredoc ($seq_dir, $bl_dir, $bl_STDIN, the thresholds,
# ...) are interpolated now, when the script is written; escaped ones (\$id,
# \@ids, ...) are evaluated when the generated script runs.
1130 sub write_remote_perl_script {
1131 my $dir1 = ".";
# BLAST command line; the option names differ between legacy blast and blast+
1132 my $bl2 = "$blast_exe -d $dir1/$tmp_db $bl_para";
1133 $bl2 = "$blast_exe -db $dir1/$tmp_db $bl_para" if ($bl_plus);
1134
1135 my $opti = "-i"; $opti = "-query" if ($bl_plus);
1136 my $opto = "-o"; $opto = "-out" if ($bl_plus);
1137
1138 open(REPERL, "> $remote_perl_script") || die;
1139 print REPERL <<EOD;
1140 #!/usr/bin/perl
1141 \$host = shift;
1142 \$arg = shift;
1143
1144 #### random sleep, rand() can be a fraction of second
1145 select(undef,undef,undef,rand());
1146
1147 if (\$arg) {
1148 \@ids = split(/,/, \$arg);
1149 }
1150 else {
1151 while(1) {
1152 if (opendir(DDIR, "$seq_dir")) {
1153 \@ids = grep {/^\\d+\$/} readdir(DDIR);
1154 last;
1155 }
1156 else {
1157 sleep(1);
1158 }
1159 }
1160 }
1161
1162 foreach \$id (\@ids) {
1163
1164 next unless (-e "$seq_dir/\$id");
1165 next if (-e "$seq_dir/\$id.lock");
1166 \$cmd = `touch $seq_dir/\$id.lock`;
1167
1168 if ($bl_STDIN) {
1169 \$cmd = `$bl2 $opti $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
1170 }
1171 else {
1172 \$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
1173 \$cmd = `$script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0`;
1174 }
1175 \$cmd = `rm -f $seq_dir/\$id`;
1176 \$cmd = `rm -f $seq_dir/\$id.lock`;
1177 }
1178
1179 (\$tu, \$ts, \$cu, \$cs) = times();
1180 \$tt = \$tu + \$ts + \$cu + \$cs;
1181 \$cmd = `echo \$tt >> $seq_dir/host.\$host.cpu`;
1182
1183 EOD
1184 close(REPERL);
1185 my $cmd = `chmod 755 $remote_perl_script`;
1186
1187 return;
1188 }
1189 ########## END write_remote_perl_script
1190
1191
# wait_blast_out($out)
# Blocks until the file $out exists and its last line is a single "#", the
# end marker of a parsed BLAST result. The file is polled once per second and
# a dot is written to LOG for every unsuccessful poll.
sub wait_blast_out {
    my ($out) = @_;

    print LOG "waiting for $out";
    my $finished = 0;
    until ($finished) {
        if (-e $out) {
            my $tail_line = `tail -1 $out`;
            chop($tail_line);
            $finished = 1 if ($tail_line =~ /^#$/);
        }
        unless ($finished) {
            sleep(1);
            print LOG ".";
        }
    }
    print LOG "\n";

    return;
}
1208 ########## END wait_blast_out
1209
1210
# Refreshes the global %qstat_xml_data from the output of "qstat -f -xml".
#
# Afterwards %qstat_xml_data maps each job number to [job name, state] for
# every <job_list> ... </job_list> section that contains all of
# <JB_job_number>, <JB_name> and <state>. When the output contains a
# <queue_info> element, a dummy entry "NULL" => ["NULL", "NULL"] is added.
#
# Abridged example of the expected XML:
#   <?xml version='1.0'?>
#   <job_info xmlns:xsd="...">
#     <queue_info>
#       <Queue-List>
#         <name>all.q@node016</name>
#         ...
#         <job_list state="running">        running jobs in this section
#           <JB_job_number>3535</JB_job_number>
#           <JAT_prio>0.51468</JAT_prio>
#           <JB_name>cd-hit</JB_name>
#           <JB_owner>ubuntu</JB_owner>
#           <state>r</state>
#           <slots>4</slots>
#         </job_list>
#       ...
#     </queue_info>
#     <job_info>
#       <job_list state="pending">          pending jobs in this section
#         <JB_job_number>3784</JB_job_number>
#         ...
#         <state>qw</state>
#       </job_list>
#     </job_info>
#   </job_info>
sub SGE_qstat_xml_query {
    %qstat_xml_data = ();    #### global

    my $xml = `qstat -f -xml`;
    if ($xml =~ /<queue_info/) {    #### dummy
        $qstat_xml_data{"NULL"} = ["NULL", "NULL"];
    }

    my ($job_id, $job_name, $job_state);
    my $in_job = 0;

    # store the job collected so far, but only when it is complete
    my $record_job = sub {
        if (defined($job_id) and defined($job_name) and defined($job_state)) {
            $qstat_xml_data{$job_id} = [$job_name, $job_state];
        }
    };

    my @xml_lines = split(/\n/, $xml);
    foreach my $line_no (2 .. $#xml_lines) {    #### skip first 2 lines
        my $line = $xml_lines[$line_no];

        unless ($in_job) {
            next unless ($line =~ /<job_list/);
            # opening tag found: start a fresh record; this line is scanned as well
            $in_job = 1;
            ($job_id, $job_name, $job_state) = (undef, undef, undef);
        }

        if ($line =~ /<\/job_list/) {
            $in_job = 0;
            $record_job->();
            next;
        }

        if ($line =~ /<JB_job_number>(\d+)/) { $job_id    = $1; }
        if ($line =~ /<JB_name>([^<]+)/)     { $job_name  = $1; }
        if ($line =~ /<state>([^<]+)/)       { $job_state = $1; }
    }

    # output ended inside a <job_list> section
    $record_job->() if ($in_job);

    return;
}
1281
1282
1283 1;
1284
22 ######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
33 ################################################################################
44 our $pid = $$;
5 our $db_in = ""; ###################
6 our $db_out = ""; # input / output
5 our $db_in; ###################
6 our $db_out; # input / output
77 our $len_t = 10; ###################
88 our $NR_clstr = 0.3; #
99 our $NR_clstre = -1; #thresholds
1414 our $opt_g = 1; ####################
1515 our $blast_exe = "blastall -p blastp -m 8"; #########################
1616 our $prof_exe = "blastpgp -m 8"; #
17 our $prof_para = "-j 3 -F F -e 0.001 -b 500 -v 500"; #
17 our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; #
1818 our $prof_db = ""; #
19 our $bl_para = "-F F -e 0.000001 -b 100000 -v 100000"; # program
19 our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program
2020 our $bl_STDIN = 1; #
2121 our $keep_bl = 0; #
2222 our $blast_prog= "blastp"; #
2323 our $formatdb = "formatdb"; #########################
2424 our $exec_mode = "local"; #######################
25 our $host_no = 1; #
26 our $core_no = 1; # compute
25 our $num_qsub = 1; #
26 our $para_no = 1; # compute
2727 our $sh_file = ""; #
28 our $batch_no_per_node = 50; #######################
28 our $num_multi_seq = 50; #
29 our $batch_no_per_node = 100; #######################
2930 our $reformat_seg = 50000;
3031 our $restart_seg = 20000;
3132 our $job = "";
3839 our $db_out1;
3940 our $seq_dir;
4041 our $bl_dir;
42 our $blm_dir;
4143 our $restart_file;
4244 our $tmp_db;
4345 our $remote_perl_script;
4446 our $remote_sh_script;
4547 our $bl_path;
48 our $bl_plus = 1; #### use blast+
49 our $bl_threads = 1;
50 our $skip_long = 0;
51 our %qsub_ids = (); #### a list of qsub ids
52 our %qstat_xml_data = ();
53 our @blm8_buffer = ();
54 our %blm8_data = ();
55
4656
4757 sub parse_para_etc {
4858 my ($arg, $cmd);
5969 elsif ($arg eq "-aS") { $opt_aS = shift; }
6070 elsif ($arg eq "-g") { $opt_g = shift; }
6171 elsif ($arg eq "-circle") { $circle = shift; }
72 elsif ($arg eq "-sl") { $skip_long = shift; }
6273 ## program
6374 elsif ($arg eq "-prog") { $blast_prog= shift; }
6475 elsif ($arg eq "-p") { $prof_para = shift; }
65 elsif ($arg eq "-dprof") { $prof_db = shift; }
76 elsif ($arg eq "-dprof") { $prof_db = shift; die "option -dprof no longer supported!";}
6677 elsif ($arg eq "-s") { $bl_para = shift; }
6778 elsif ($arg eq "-k") { $keep_bl = shift; }
6879 elsif ($arg eq "-bs") { $bl_STDIN = shift; }
6980 ## compute
7081 elsif ($arg eq "-exec") { $exec_mode = shift; }
71 elsif ($arg eq "-host") { $host_no = shift; }
72 elsif ($arg eq "-core") { $core_no = shift; }
82 elsif ($arg eq "-host") { $num_qsub = shift; }
83 elsif ($arg eq "-para") { $para_no = shift; }
7384 elsif ($arg eq "-shf") { $sh_file = shift; }
85 elsif ($arg eq "-blp") { $bl_threads = shift; }
86 elsif ($arg eq "-bat") { $batch_no_per_node = shift; }
7487 ## job:
7588 elsif ($arg eq "-rs") { $restart_seg = shift; }
7689 elsif ($arg eq "-rf") { $reformat_seg= shift; }
8295 }
8396
8497 # speical jobs
85 if ($job eq "parse_blout") { job_parse_blout(); exit();}
98 if ($job eq "parse_blout") { job_parse_blout(); exit();}
99 elsif ($job eq "parse_blout_multi") { job_parse_blout_multi(); exit();}
100
101 if (not (defined($db_in) and defined($db_out))) {
102 print_usage(); exit();
103 }
86104
87105 if ($blast_prog eq "blastn") {
88106 $formatdb = "formatdb -p F";
94112 $blast_exe = "megablast -H 100 -D 2 -m 8";
95113 }
96114 elsif ($blast_prog eq "blastpgp") {
97 $blast_exe = ($prof_db) ? "blastpgp -m 8" : "blastpgp -m 8 -j 3";
115 $blast_exe = "blastpgp -m 8 -j 3";
116 }
117
118 #### for blast+
119 if ($bl_plus) {
120 $formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
121 $blast_exe = "blastp -outfmt 6";
122 $bl_para = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
123
124 if ($blast_prog eq "blastn") {
125 $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
126 $blast_exe = "blastn -task blastn -outfmt 6";
127 $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
128 }
129 elsif ($blast_prog eq "megablast") {
130 $blast_prog = "blastn"; #### back to blastn for blast parser type
131 $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
132 $blast_exe = "blastn -task megablast -outfmt 6";
133 $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
134 }
135 elsif ($blast_prog eq "blastpgp") {
136 $blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
137 }
98138 }
99139
100140 if ($bl_path) {
110150 $db_out1 = "$db_out.out";
111151 $seq_dir = "$db_in-seq";
112152 $bl_dir = "$db_in-bl";
153 $blm_dir = "$db_in-blm";
113154 $restart_file =" $db_out.restart";
114155
115156 $tmp_db = "$db_in.$pid";
116157 $remote_perl_script = "$tmp_db-bl.pl";
117158 $remote_sh_script = "$tmp_db-bl.sh";
118159
119 $cmd = `mkdir $bl_dir $seq_dir`;
160 $cmd = `mkdir $bl_dir $blm_dir $seq_dir`;
120161
121162 write_remote_perl_script();
122163 write_remote_sh_script();
137178 $seq =~ s/\s//g;
138179 if (length($seq) > $len_t) { add_seq($des, $seq); }
139180 $des = $ll; $seq = "";
181
140182 }
141183 else { $seq .= $ll; }
142184 }
154196
155197 sub add_seq {
156198 my ($des, $seq) = @_;
199 $des =~ s/\s.+$//;
157200 push(@seqs, $seq);
158201 push(@dess, $des);
159202 push(@lens, length($seq));
179222 }
180223 ########## END open_LOG
181224
# write_LOG($txt)
# Writes $txt followed by a newline to the LOG file handle.
sub write_LOG {
    my ($txt) = @_;
    print LOG "$txt\n";
}
182229
183230 {## use static variables
184231 my $last_NR90_no=0;
222269 sub total_remote_cpu {
223270 my ($i, $j, $k, $ll);
224271 my $tt = 0;
225 for ($j=0; $j<$host_no; $j++) {
272 for ($j=0; $j<$num_qsub; $j++) {
226273 open(TCPU, "$seq_dir/host.$j.cpu") || next;
227274 while($ll = <TCPU>) {
228275 chop($ll);
233280 return $tt;
234281 }
235282 ########## END total_remote_cpu
283
284 #### process m8 format output from multi-query search
# Processes tabular (m8) BLAST output of a multi-query search stored in the
# file $job_file.
#
# Lines starting with "#" are ignored. Consecutive lines with the same first
# field (the query id) are collected in the global @blm8_buffer; each time the
# query id changes, and once more at the end of the file, the buffered lines
# are evaluated with process_blout_blastp_blastn() and printed to STDOUT as
#     ><query>\t<number of hits>
#     one tab-separated line per hit
#     #
# where <query> is the query id up to its first ".".
#
# Fix: the flush after the loop used to require "$this_id ne $last_id", which
# can never be true at that point (both hold the id of the last line read),
# so the hits of the last query were silently dropped.
sub job_parse_blout_multi {
    open(my $tfh, '<', $job_file) || die "can not open $job_file";

    @blm8_buffer = ();
    my $last_id = "";
    while (my $ll = <$tfh>) {
        next if ($ll =~ /^#/);
        my ($this_id) = split(/\s+/, $ll, 2);

        if (@blm8_buffer and ($this_id ne $last_id)) {    #### blast results of last query
            _print_blm8_buffer_hits($last_id);
        }
        push(@blm8_buffer, $ll);
        $last_id = $this_id;
    }

    if (@blm8_buffer) {    #### blast results of the final query
        _print_blm8_buffer_hits($last_id);
    }
    close($tfh);
    return;
}

# Evaluates the lines currently held in @blm8_buffer as the results of query
# $query_id, prints the hit block and empties the buffer.
sub _print_blm8_buffer_hits {
    my ($query_id) = @_;

    my @hits   = process_blout_blastp_blastn();
    my $tquery = (split(/\./, $query_id))[0];
    my $no1    = $#hits + 1;
    print ">$tquery\t$no1\n";
    foreach my $hit (@hits) {
        print join("\t", @{$hit}), "\n";
    }
    print "#\n";
    @blm8_buffer = ();
    return;
}
328 ########## END job_parse_blout_multi
236329
237330
238331 sub job_parse_blout {
376469 }
377470 ########## END remove_raw_blout_bg
378471
379
380 sub fish_other_homolog {
472 sub fish_other_homolog_multi {
381473 my ($i, $j, $k, $i0, $j0, $k0);
382474 $id = shift; # real idx, not sorted idx
383475 my @hits = ();
384476
385 wait_blast_out("$bl_dir/$id.out");
386 open(BLPOUT, "$bl_dir/$id.out") || return;
387 while($i=<BLPOUT>) {
388 last if ($i =~ /^#/);
389 chop($i);
390 push(@hits, [split(/\t/,$i)]);
391 }
392 close(BLPOUT);
477 if (defined($blm8_data{$id})) {
478 @hits = @{$blm8_data{$id}};
479 }
480
393481 my $rep_len = $lens[$id];
394482
395483 foreach $i (@hits) {
396484 my $id1 = $i->[0];
397 next unless ($id1 < $NR_no);
485 next unless ($id1 < $NR_no);
398486 next if ($idens[$id1] eq "*"); #existing reps
399487 next if ($lens[$id1] > $rep_len); # in opt_g=1 mode, preventing it from being clustered into short rep
400488
413501 $NR_clstr_nos[$id1] = $NR90_no;
414502 $NR_passed++;
415503 }
504 if (defined($blm8_data{$id})) {
505 delete $blm8_data{$id};
506 }
507 return;
508 }
509 ########## END fish_other_homolog_multi
510
511
# Assign the sequences hit by a new representative to its cluster.
#
# Argument: the real (unsorted) index of the representative; it is stored in
#           the global $id.
# Waits for "$bl_dir/$id.out" via wait_blast_out(), then reads its
# tab-separated hit lines up to the first line starting with '#'. Returns
# early when the file cannot be opened.
# For every hit, column 0 is the index of the hit sequence; columns 3, 2 and 1
# are combined into the "col3/col2aa/col1%" string kept in @idens.
# Updates the globals @idens, @passeds, @NR_clstr_nos and $NR_passed.
sub fish_other_homolog {
  $id = shift; # real idx, not sorted idx

  wait_blast_out("$bl_dir/$id.out");
  open(BLPOUT, "$bl_dir/$id.out") || return;

  my @hits = ();
  while (my $line = <BLPOUT>) {
    last if ($line =~ /^#/);
    chop($line);
    push(@hits, [split(/\t/, $line)]);
  }
  close(BLPOUT);

  my $rep_len = $lens[$id];

  foreach my $hit (@hits) {
    my $id1 = $hit->[0];
    next unless ($id1 < $NR_no);
    next if ($idens[$id1] eq "*"); #existing reps
    next if ($lens[$id1] > $rep_len); # in opt_g=1 mode, preventing it from being clustered into short rep

    my $already_clustered = $passeds[$id1];
    if ($already_clustered) { #### -g 1 mode: only take over if this hit is better
      my $old_e = (split(/\//,$idens[$id1]))[0];
      next unless ($hit->[3] < $old_e);
    }

    $idens[$id1] = "$hit->[3]/$hit->[2]aa/$hit->[1]%";
    $passeds[$id1] = 1;
    $NR_clstr_nos[$id1] = $NR90_no;
    $NR_passed++ unless ($already_clustered); #### count each sequence once
  }
  return;
}
418550 ########## END fish_other_homolog
481613 for ($i=0; $i<$self->{no}; $i++) {
482614 my $p = $self->{sbj}->[$i];
483615 my ($id1, $len_sub) = split(/\./, $p->{id});
616 next unless ($len_sub >0) ;
484617
485618 if (not defined($id_exist{$id1})) {
486619 $id_exist{$id1} = 1;
555688
556689 #### need $len_rep
557690 my $len_rep = 0;
558 my $bl = readblast_m8("", $blout);
691 my $bl = defined($blout) ? readblast_m8("", $blout) : readblast_m8_buffer();
559692 if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
560693 if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
561694
568701 my $frame = $p->{frame};
569702 if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
570703 my $iden = $p->{iden};
704 next unless (($len_sub >0) and ($len_rep>0));
571705 my $cov_aS = $p->{alnln} / $len_sub;
572706 my $cov_aL = $p->{alnln} / $len_rep;
573707 my $exp1 = $p->{expect};
591725 my ($id1, $len_sub) = split(/\./, $p->{id});
592726 my $frame = $p->{frame};
593727 if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
728 next unless (($len_sub >0) and ($len_rep>0));
594729
595730 if ($hsp_no) {
596731 if ($id1 ne $hsp[0]->[0]) {
729864 }
730865 ########## END cross1
731866
732 sub readblast_m8 {
867 sub readblast_m8_buffer {
733868 my ($i, $j, $k, $ll, $no);
734 my ($q_seq, $filename) = @_;
735
736
737 my $fh = "BL" ;
738 if ($bl_STDIN) { $fh = "STDIN"; }
739 else { open($fh, $filename) || return; }
740
741869 my @this_sbj = ();
742870 $no = 0;
743 while($ll = <$fh>) {
871 while($ll = shift @blm8_buffer) {
744872 chop($ll);
745873 my @lls = split(/\t/,$ll);
746874 my $frame = "";
747875 $frame .= ($lls[6] < $lls[7]) ? "+" : "-";
748876 $frame .= ($lls[8] < $lls[9]) ? "+" : "-";
877 next unless ($lls[0] and $lls[1]);
749878 $this_sbj[$no] = {
750879 'qid' => $lls[0],
751880 'id' => $lls[1],
770899 #gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 91.81 171 9 3 6 171 1 171 6e-89 323
771900 #gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 96.51 86 3 0 235 320 155 240 2e-41 166
772901 }
902 my $self = {
903 'no' => $no,
904 'sbj' => [@this_sbj],
905 };
906 return $self;
907 }
908 ########## END readblast_m8_buffer
909
910 sub readblast_m8 {
911 my ($i, $j, $k, $ll, $no);
912 my ($q_seq, $filename) = @_;
913
914
915 my $fh = "BL" ;
916 if ($bl_STDIN) { $fh = "STDIN"; }
917 else { open($fh, $filename) || return; }
918
919 my @this_sbj = ();
920 $no = 0;
921 while($ll = <$fh>) {
922 chop($ll);
923 my @lls = split(/\t/,$ll);
924 my $frame = "";
925 $frame .= ($lls[6] < $lls[7]) ? "+" : "-";
926 $frame .= ($lls[8] < $lls[9]) ? "+" : "-";
927 next unless ($lls[0] and $lls[1]);
928 $this_sbj[$no] = {
929 'qid' => $lls[0],
930 'id' => $lls[1],
931 'iden' => $lls[2],
932 'alnln' => $lls[3],
933 'ms' => $lls[4],
934 'gap' => $lls[5],
935 'qfrom' => $lls[6],
936 'qend' => $lls[7],
937 'sfrom' => $lls[8],
938 'send' => $lls[9],
939 'expect' => $lls[10],
940 'score' => $lls[11],
941 'frame' => $frame,
942 };
943
944 $no++;
945 # BLASTP 2.2.24 [Aug-08-2010]
946 # Query: gi|388328107|pdb|4DDG|A Chain A, Crystal Structure Of Human Otub1UBCH5B~UBUB
947 # Database: pdbaa.fa
948 # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
949 #gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 91.81 171 9 3 6 171 1 171 6e-89 323
950 #gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 96.51 86 3 0 235 320 155 240 2e-41 166
951 }
773952 close($fh) if (not $bl_STDIN);
774953
775954 my $self = {
790969 for ($i0=$NR_no-1; $i0>=0; $i0--) { ### from shortest to longest
791970 $i = $NR_idx[$i0];
792971 last if ($idens[$i] eq "*"); ### last if reach rep
972 next if ($lens[$i] < $opt_aL_lower_band);
793973 next if ($passeds[$i] and ($opt_g==0));
794974 my $seq = $seqs[$i];
795975 $seq =~ s/(.{70})/$1\n/g;
812992
813993 return(0, 0) unless ($j > 0);
814994
815 my $cmd = `$formatdb -i $tmp_db`;
995 my $cmd_line = "$formatdb -i $tmp_db";
996 $cmd_line = "$formatdb -in $tmp_db" if ($bl_plus);
997 my $cmd = `$cmd_line`;
998
816999 ((-e "$tmp_db.phr") and (-e "$tmp_db.pin") and (-e "$tmp_db.psq")) ||
8171000 ((-e "$tmp_db.nhr") and (-e "$tmp_db.nin") and (-e "$tmp_db.nsq")) ||
8181001 ((-e "$tmp_db.00.phr") and (-e "$tmp_db.00.pin") and (-e "$tmp_db.00.psq")) ||
8411024 input/output:
8421025 -i in_dbname, required
8431026 -o out_dbname, required
844 -l length_of_throw_away_sequences, default 10
1027 -l length_of_throw_away_sequences, default $len_t
8451028
8461029 thresholds:
847 -c clustering threshold (sequence identity), default 0.3
848 -ce clustering threshold (blast expect), default -1,
1030 -c clustering threshold (sequence identity), default $NR_clstr
1031 -ce clustering threshold (blast expect), default $NR_clstre,
8491032 it means by default it doesn't use expect threshold,
8501033 but with positive value, the program cluster seqs if similarities
8511034 meet either identity threshold or expect threshold
852 -G (1/0) use global identity? default 1
1035 -G (1/0) use global identity? default $g_iden
8531036 two sequences Long (i.e. representative) and Short (redunant) may have multiple
8541037 alignment fragments (i.e. HSPs), see:
8551038 seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Long sequence
8681051 Local identity = identity of the top high score HSP
8691052 if you prefer to use -G 0, it is suggested that you also
8701053 use -aS, -aL, such as -aS 0.8, to prevent very short matches.
871 -aL alignment coverage for the longer sequence, default 0.0
1054 -aL alignment coverage for the longer sequence, default $opt_aL
8721055 if set to 0.9, the alignment must covers 90% of the sequence
873 -aS alignment coverage for the shorter sequence, default 0.0
1056 -aS alignment coverage for the shorter sequence, default $opt_aS
8741057 if set to 0.9, the alignment must covers 90% of the sequence
875 -g (1/0), default 0
1058 -g (1/0), default $opt_g
8761059 by cd-hit's default algorithm, a sequence is clustered to the first
8771060 cluster that meet the threshold (fast cluster). If set to 1, the program
8781061 will cluster it into the most similar cluster that meet the threshold
8791062 (accurate but slow mode)
8801063 but either 1 or 0 won't change the representatives of final clusters
881 -circle (1/0), default 0
1064 -circle (1/0), default $circle
8821065 when set to 1, treat sequences as circular sequence.
8831066 bacterial genomes, plasmids are circular, but their genome coordinate maybe arbitary,
8841067 the 2 HSPs below will be treated as non co-linear with -circle 0
8861069 -------------circle-----------
8871070 | |
8881071 seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1
889 \\\\\\\\ /////////////
890 \\\\\\\\ /////////////
1072 \\\\\\\\\\\\\\\\ /////////////
1073 \\\\\\\\\\\\\\\\ /////////////
8911074 HSP 2 -> ////HSP 1 /// <-HSP 2
892 ///////////// \\\\\\\\
893 ///////////// \\\\\\\\
1075 ///////////// \\\\\\\\\\\\\\\\
1076 ///////////// \\\\\\\\\\\\\\\\
8941077 seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
8951078 | |
8961079 -----------circle--------------
1080 -sl, length of very long sequences to be skipped, default $skip_long,
1081 e.g. -sl 5000 means sequences longer than 5000 aa will be treated as singleton clusters
1082 without clustering, to save time, especially when there is -aL option in place, very
1083 long sequences will not be clustered anyway.
1084 -sl 0 means no skipping
8971085 program:
898 -prog (blastp, blastn, megablast, blastpgp), default blastp
899 -p profile search para, default
900 "-j 3 -F F -e 0.001 -b 500 -v 500"
1086 -prog (blastp, blastn, megablast, blastpgp), default $blast_prog
1087 -p profile search para, default
1088 "$prof_para"
9011089 -dprof database for building PSSM, default using input
9021090 you can also use another database that is more comprehensive like NR80
903 -s blast search para, default
904 "-F F -e 0.000001 -b 100000 -v 100000"
905 -bs (1/0) default 1
1091 -s blast search para, default
1092 "$bl_para"
1093 -bs (1/0) default $bl_STDIN
9061094 pipe blast results from into parser instead of save in hard drive (save time)
9071095
9081096 compute:
909 -exec (qsub, local) default local
1097 -exec (qsub, local) default $exec_mode
9101098 this program writes a shell script to run blast, this script is
9111099 either performed locally by sh or remotely by qsub
9121100 with qsub, you can use PBS, SGE etc
913 -host number of hosts for qsub
914 -core number of cpu cores per computer, default 1
1101 -host number of qsub jobs, default $num_qsub
1102 -para number of parallel blast job per qsub job (each blast can use multi cores), default $para_no
1103 one qsub script can run multiple blast jobs
1104 -blp number of threads per blast job, default $bl_threads
1105 number of threads per blast job (option -blp) X number of parallel blast job per qsub job (option -para)
1106 should <= the number of cores in your computer
1107 if your computer grid has 32 cores / node, do either of the followings
1108 -para 4 -blp 8
1109 -para 8 -blp 4 preferred
1110 -para 16 -blp 2
1111 -para 32 -blp 1
1112 -bat number of sequences a blast job to process, $batch_no_per_node
9151113 -shf a filename for add local settings into the job shell script
9161114 for example, when you run PBS jobs, you can add quene name etc in this
9171115 file and this script will add them into the job shell script
918 e.g. your file may have followings
1116 e.g. template file for PBS
1117 #!/bin/sh
9191118 #PBS -v PATH
9201119 #PBS -l walltime=8:00:00
921 #PBS -q jobqueue
1120 #PBS -q job_queue.q
1121
1122 e.g. template file for SGE or OGE
1123 #!/bin/sh
1124 #\$ -v PATH
1125 #\$ -q job_queue.q
1126 #\$ -V
1127 #\$ -pe orte 8
9221128
9231129 job:
924 -rs steps of save restart file and clustering output, default 5000
1130 -rs steps of save restart file and clustering output, default $restart_seg
9251131 everytime after process 5000 sequences, program write a
9261132 restart file and current clustering information
9271133 -restart restart file, readin a restart file
9281134 if program crash, stoped, termitated, you can restart it by
9291135 add a option "-restart sth.restart"
930 -rf steps of re format blast database, default 200,000
1136 -rf steps of re format blast database, default $reformat_seg
9311137 if program clustered 200,000 seqs, it remove them from seq
9321138 pool, and re format blast db to save time
9331139 -J job, job_file, exe specific jobs like parse blast outonly
934 DON'T use it, it is only used by this program itself
1140 DO NOT use it, it is only used by this program itself
9351141 -k (1/0) keep blast raw output file, default $keep_bl
9361142
937 -P path to executables
1143 -P path to blast executables
9381144 EOD
9391145
9401146
9571163 ########## END print_usage
9581164
9591165
960 ## like above, but don't assign seqs to specific node
961 ## while let nodes run them autoly
962 sub run_batch_blast3 {
## Multi-query variant of the batch blast driver (copied from run_batch_blast3).
##
## Argument: $i0, position in @NR_idx from which pending sequences are taken.
## Steps:
##   1. write pending sequences into query files under $seq_dir, starting a
##      new file every $num_multi_seq sequences, at most $total_jobs in total
##   2. run $remote_sh_script, through qsub or locally depending on $exec_mode
##   3. in qsub mode, wait until none of the submitted jobs is listed by
##      SGE_qstat_xml_query() any more
##   4. read every parsed result file in $blm_dir into the global %blm8_data
##      (query id => list of hits) and remove the file
sub run_batch_blast3_multi {
  my $i0 = shift;
  my ($id, $j, $cmd, $ll);
  my $written = 0; #### number of sequences written in this batch

  my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;

  while ($i0 < $NR_no) {
    $id = $NR_idx[$i0++];
    next if ($passeds[$id] or $in_bg[$id]);
    next if ($lens[$id] < $opt_aL_upper_band);
    $in_bg[$id] = 1;

    if (($written % $num_multi_seq) == 0) { #### reopen: start a new query file
      close(SEQ) if ($written > 0);
      open(SEQ, "> $seq_dir/$id") || die "Can not write";
    }

    my $seq = $seqs[$id];
    #print SEQ "$dess[$id]\n$seq\n";
    print SEQ ">$id.$lens[$id]\n$seq\n";
    $written++;
    last if ($written >= $total_jobs);
  }
  close(SEQ);

  if ($exec_mode eq "qsub") {
    #### submit the jobs
    for ($j=0; $j<$num_qsub; $j++) {
      my $t = "psi-cd-hit-$j";
      $cmd = `qsub -N $t $remote_sh_script $j`; #### pass $j to qsub command
      die "can not submit qsub job and return a id\n" unless ($cmd =~ /(\d+)/);
      my $qsub_id = $1;
      print LOG "qsub querying $j, PID $qsub_id\n";
      $qsub_ids{$qsub_id} = 1;
    }

    #### wait finish all submitted
    while (1) {
      SGE_qstat_xml_query();
      last unless (%qsub_ids);

      my @job_ids = keys %qsub_ids;
      my @running = grep { defined($qstat_xml_data{$_}) } @job_ids;
      my @finished = grep { not defined($qstat_xml_data{$_}) } @job_ids;
      delete @qsub_ids{@finished};

      if (@running) {
        print LOG "wait submitted jobs\n";
        sleep(1);
      }
    }
  }
  elsif ($exec_mode eq "local") {
    #my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
    $cmd = `sh $remote_sh_script`;
  }

  #### read in all parsed blast output
  %blm8_data =();
  opendir(BLMDIR, $blm_dir) || die "can not open $blm_dir";
  my @bl_files = grep { /^\d/ } readdir(BLMDIR);
  closedir(BLMDIR);

  foreach my $blf (@bl_files) {
    open(BLMTMP, "$blm_dir/$blf") || next;
    while ($ll = <BLMTMP>) {
      next if ($ll =~ /^#/);
      chop($ll);
      next unless ($ll =~ /^>/);

      #### header line ">QUERY NO_OF_HITS", followed by that many hit lines
      my ($query_id, $no1) = split(/\s+/, substr($ll,1));
      my @hits = ();
      for ($j=0; $j<$no1; $j++) {
        $ll = <BLMTMP>;
        chop($ll);
        push(@hits, [split(/\t/,$ll)]);
      }
      $blm8_data{$query_id} = [@hits] if ($no1>=1);
    }
    close(BLMTMP);

    $cmd = `rm -f $blm_dir/$blf`;
    print LOG "parse and then rm $blm_dir/$blf\n";
  }
  return;
}
1261
1262 sub run_batch_blast3 {
1263 my $i0 = shift;
1264 my ($id, $i, $j, $k, $cmd);
1265
1266 #### wait before qsubs
1267 if ($exec_mode eq "qsub") {
1268 while(1) {
1269 SGE_qstat_xml_query();
1270 last unless (%qsub_ids);
1271
1272 my $wait_flag = 0;
1273 foreach my $qsub_id (keys %qsub_ids) {
1274 if (defined($qstat_xml_data{$qsub_id})) { #### still running
1275 $wait_flag = 1;
1276 $cmd = `qdel -f $qsub_id`; #### at this point, all running jobs are not necessary,
1277 print LOG "force delete un necessary job $qsub_id\n";
1278 }
1279 else {
1280 delete $qsub_ids{$qsub_id};
1281 }
1282 }
1283
1284 if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
1285 }
1286
1287 #### delete seq files from last batch
1288 opendir(DIR1, $seq_dir);
1289 my @files = grep { /^\d/ } readdir(DIR1);
1290 closedir(DIR1);
1291 foreach $i (@files) {
1292 $cmd = `rm -f $seq_dir/$i`;
1293 print LOG "remove un necessary seq file $i\n"
1294 }
1295 }
1296
1297 my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
1298
1299 for ($k=0; $i0<$NR_no; $i0++) {
1300 $id = $NR_idx[$i0];
1301 next if ($passeds[$id]);
1302 next if ($in_bg[$id]);
1303 next if ($lens[$id] < $opt_aL_upper_band);
9721304 $in_bg[$id] = 1;
9731305
9741306 my $seq = $seqs[$id];
9811313 }
9821314
9831315 if ($exec_mode eq "qsub") {
984 for ($j=0; $j<$host_no; $j++) {
1316 for ($j=0; $j<$num_qsub; $j++) {
9851317 my $t = "psi-cd-hit-$j";
986 print LOG "PBS querying $j\n";
9871318 my $cmd = `qsub -N $t $remote_sh_script`;
1319 my $qsub_id = 0;
1320 if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
1321 print LOG "qsub querying $j, PID $qsub_id\n";
1322 $qsub_ids{$qsub_id} = 1;
9881323 }
9891324 }
9901325 elsif ($exec_mode eq "local") {
991 my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
1326 #my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
1327 my $cmd = `sh $remote_sh_script`;
9921328 }
9931329
9941330 return;
9981334
9991335 sub write_remote_sh_script {
10001336 my ($i, $j, $k);
1001 my $local_sh = "";
1337 my $local_sh = <<EOD;
1338 #!/bin/sh
1339 #PBS -v PATH
1340 #\$ -v PATH
1341 EOD
1342
10021343 if ($sh_file) {
10031344 $local_sh = `cat $sh_file`;
10041345 }
10051346
10061347 open(RESH, "> $remote_sh_script") || die;
10071348 print RESH <<EOD;
1008 #!/bin/bash
1009 #\$ -S /bin/bash
1010 #\$ -v PATH
1011 #PBS -v PATH
10121349 $local_sh
10131350
1351 para=\$1
10141352 cd $pwd
10151353 EOD
10161354
1017 for ($k=0; $k<$core_no; $k++){
1018 print RESH "./$remote_perl_script $k&\n"
1355 for ($k=0; $k<$para_no; $k++){
1356 print RESH "./$remote_perl_script $k \$para &\n"
10191357 }
10201358 print RESH "wait\n\n";
10211359
10261364
10271365 sub write_remote_perl_script {
10281366 my $dir1 = ".";
1029 my $bl2 = ($prof_db) ?
1030 "$blast_exe -d $dir1/$tmp_db $bl_para -R $bl_dir/\$id.prof":
1031 "$blast_exe -d $dir1/$tmp_db $bl_para";
1032 my $cc = ($prof_db) ? 1 : 0;
1033 if ($prof_db) { my $cmd=`formatdb -i $prof_db`; }
1367 my $bl2 = "$blast_exe -d $dir1/$tmp_db $bl_para";
1368 $bl2 = "$blast_exe -db $dir1/$tmp_db $bl_para" if ($bl_plus);
1369
1370 my $opti = "-i"; $opti = "-query" if ($bl_plus);
1371 my $opto = "-o"; $opto = "-out" if ($bl_plus);
10341372
10351373 open(REPERL, "> $remote_perl_script") || die;
10361374 print REPERL <<EOD;
10371375 #!/usr/bin/perl
10381376 \$host = shift;
1377 \$instance = shift;
10391378 \$arg = shift;
10401379
10411380 #### random sleep, rand() can be a fraction of second
10621401 next if (-e "$seq_dir/\$id.lock");
10631402 \$cmd = `touch $seq_dir/\$id.lock`;
10641403
1065 if ($cc) {
1066 \$cmd = `$prof_exe -d $prof_db $prof_para -i $seq_dir/\$id -C $bl_dir/\$id.prof`;
1067 }
1068
1069 if ($bl_STDIN) {
1070 \$cmd = `$bl2 -i $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
1404 if ($num_multi_seq) {
1405 \$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
1406 \$cmd = `$script_name -J parse_blout_multi $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0 >> $blm_dir/\$host.\$instance`;
1407 }
1408 elsif ($bl_STDIN) {
1409 \$cmd = `$bl2 $opti $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
10711410 }
10721411 else {
1073 \$cmd = `$bl2 -i $seq_dir/\$id -o $bl_dir/\$id`;
1412 \$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
10741413 \$cmd = `$script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0`;
10751414 }
10761415 \$cmd = `rm -f $seq_dir/\$id`;
10771416 \$cmd = `rm -f $seq_dir/\$id.lock`;
1078 if ($cc) { \$cmd = `rm -f $bl_dir/\$id.prof`; }
10791417 }
10801418
10811419 (\$tu, \$ts, \$cu, \$cs) = times();
10821420 \$tt = \$tu + \$ts + \$cu + \$cs;
1083 \$cmd = `echo \$tt >> $seq_dir/host.\$host.cpu`;
1421 \$cmd = `echo \$tt >> $seq_dir/host.\$host.\$instance.cpu`;
10841422
10851423 EOD
10861424 close(REPERL);
11101448 ########## END wait_blast_out
11111449
11121450
# Refresh the global %qstat_xml_data from the output of `qstat -f -xml`.
#
# %qstat_xml_data is emptied first. For every <job_list> ... </job_list>
# section that provides a job number, a job name and a state, an entry
#     job number => [job name, state]
# is stored. A dummy "NULL" entry is added whenever the output contains a
# <queue_info> section.
#
# Sample of the output being parsed:
#   <?xml version='1.0'?>
#   <job_info xmlns:xsd="http://gridscheduler.svn.sourceforge.net/viewvc/gridscheduler/trunk/source/dist/util/resources/schemas/qstat/qstat.xsd?revision=11">
#   <queue_info>
#   <Queue-List>
#   <name>all.q@master</name>
#   <qtype>BIP</qtype>
#   <slots_used>0</slots_used>
#   <slots_resv>0</slots_resv>
#   <slots_total>0</slots_total>
#   <load_avg>0.08000</load_avg>
#   <arch>linux-x64</arch>
#   </Queue-List>
#   ...
#   <Queue-List>
#   <name>all.q@node016</name>
#   <qtype>BIP</qtype>
#   <slots_used>32</slots_used>
#   <slots_resv>0</slots_resv>
#   <slots_total>32</slots_total>
#   <load_avg>42.59000</load_avg>
#   <arch>linux-x64</arch>
#   <job_list state="running">      <- running jobs in this section
#   <JB_job_number>3535</JB_job_number>
#   <JAT_prio>0.51468</JAT_prio>
#   <JB_name>cd-hit</JB_name>
#   <JB_owner>ubuntu</JB_owner>
#   <state>r</state>
#   <slots>4</slots>
#   </job_list>
#   ...
#   </queue_info>
#   <job_info>
#   <job_list state="pending">      <- pending jobs in this section
#   <JB_job_number>3784</JB_job_number>
#   <JAT_prio>0.60500</JAT_prio>
#   <JB_name>cd-hit</JB_name>
#   <JB_owner>ubuntu</JB_owner>
#   <state>qw</state>
#   <slots>32</slots>
#   </job_list>
#   ...
#   </job_info>
#   </job_info>
sub SGE_qstat_xml_query {
  %qstat_xml_data = (); #### global
  my $cmd = `qstat -f -xml`;

  #### dummy
  $qstat_xml_data{"NULL"}= ["NULL","NULL"] if ($cmd =~ /<queue_info/);

  my @lls = split(/\n/, $cmd);
  my $total = $#lls+1;
  my $pos = 2; #### skip first 2 lines

  while ($pos < $total) {
    if ($lls[$pos] =~ /<job_list/) {
      my ($id, $name, $state);

      #### scan from the opening <job_list line up to its closing tag
      while ($pos < $total) {
        my $line = $lls[$pos];
        last if ($line =~ /<\/job_list/);
        $id    = $1 if ($line =~ /<JB_job_number>(\d+)/);
        $name  = $1 if ($line =~ /<JB_name>([^<]+)/);
        $state = $1 if ($line =~ /<state>([^<]+)/);
        $pos++;
      }

      #### keep only complete job records
      if (defined($id) and defined($name) and defined($state)) {
        $qstat_xml_data{$id} = [$name, $state];
      }
    }
    $pos++;
  }
}
1521
11131522
11141523 1;
11151524
0 #!/usr/bin/perl -w
1 ################################################################################
2 ######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
3 ################################################################################
4
#### locate this script and load the subroutine library that sits next to it
our $script_name = $0;
our $script_dir = $0;
$script_dir =~ s/[^\/]+$//;
$script_dir = "./" unless ($script_dir);
require "$script_dir/psi-cd-hit-local-old.pl";

parse_para_etc(@ARGV);
open_LOG();

#### per-sequence data, indexed by the real (unsorted) sequence index
our @seqs = ();          #### sequences
our @dess = ();          #### description lines, printed in front of the sequences
our @idens = ();         #### "*" for a representative, otherwise the hit summary
our @lens = ();          #### sequence lengths
our @passeds = ();       #### true once a sequence is assigned to a cluster
our @NR_clstr_nos = ();  #### cluster number of each sequence
our @in_bg = ();         #### true once a sequence is submitted for search
our @NR_idx = ();        #### sequence indices in processing order
our $NR_no = 0;
our $DB_no = 0;
our $DB_len = 0;
our $DB_len0 = 0;
our $DB_len_reduced = 0;
our $DB_len_reduced2 = 0; #### for write_restart etc purpose

our $opt_aL_upper_band = 0; #### sequences < this length will not be submitted unless reformatdb
#### position in @NR_idx that belongs to $opt_aL_upper_band; this used to be
#### declared as $opt_al_upper_bandi (lower-case l), a name nothing else uses
our $opt_aL_upper_bandi= 0;
our $opt_aL_lower_band = 0; #### sequences < this length don't need to be searched
my ($i, $j, $k, $i0, $j0, $k0, $ll);
33
#### read the input db
read_db();

our $NR_passed = 0;
our $formatdb_no = $NR_no;

#### processing order: longest sequence first, unless a restart file exists
@NR_idx = (0..($NR_no-1));
if (not -e $restart_in) {
  @NR_idx = sort { $lens[$b] <=> $lens[$a] } @NR_idx;
}

our $NR90_no = 0;
our @NR90_seq = ();

$i0 = 0;
if ( -e $restart_in) { ## restart after crash
  $i0 = read_restart();
}
elsif ($skip_long > 0) { #### skip very long seqs
  while ($i0 < $NR_no) {
    $i = $NR_idx[$i0];
    last if ($lens[$i] < $skip_long);

    #### the sequence becomes a cluster of its own without being searched
    $idens[$i] = "*";
    $passeds[$i] = 1;
    $NR_clstr_nos[$i] = $NR90_no;
    $NR90_seq[$NR90_no] = [$i];
    $NR90_no++;
    $NR_passed++;
    $DB_len_reduced += $lens[$i];

    $i0++;
  }
}
61
#### set the initial opt_aL upper/lower bands (only when -aL > 0.3)
####
#### sequences sorted by length, longest on top:
####
####   ####################
####   ###################
####   ##################
####   #################
####   ################
####   ###############      <- upper band
####   ##############       <- seq below not submit, unless band change
####   #############
####   ############
####   ###########
####   ##########           <- lower band
####   #########            <- seq below not in format db
####   ########
####   #######
####   #####
####   ####
####   ###
####   ##
####   #
if ($opt_aL > 0.3) {
  die ("option -aL > 1.0") if ($opt_aL > 1.0);

  my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;

  #### look ahead by the larger of one full batch and one restart segment,
  #### but never past the last sequence
  my $d1 = $i0 + (($total_jobs > $restart_seg) ? $total_jobs : $restart_seg);
  if ($d1 >= $NR_no-1) {
    $d1 = ($NR_no-1);
  }

  $opt_aL_upper_bandi= $d1;
  $opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
  $opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
  write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
}
94
95
#### format the blast db and reset the counters of sequence length removed
#### from it
($DB_no, $DB_len) = blast_formatdb();
$DB_len0 = $DB_len;
$DB_len_reduced = 0;
$DB_len_reduced2 = 0;

#### interval (in processed sequences) of the periodic re-format below.
#### It is 0 when there are fewer than 10 sequences; the periodic trigger is
#### then skipped, because "% 0" would abort with "Illegal modulus zero".
my $reformat_step = int($NR_no/10);

#### main loop: walk the sequences in @NR_idx order; every sequence that is
#### not clustered yet becomes a new representative and collects its hits
for (; $i0<$NR_no; $i0++) {
  $i = $NR_idx[$i0];
  run_batch_blast3($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);

  if ( not $passeds[$i] ) { # this is a new representative
    $NR_passed++;
    $NR_clstr_nos[$i] = $NR90_no;
    $idens[$i] = "*";
    $passeds[$i] = 1;
    $NR90_seq[$NR90_no] = [$i];
    fish_other_homolog($i);
    $NR90_no++;
    $DB_len_reduced += $lens[$i];
    $DB_len_reduced2 += $lens[$i];
  }

  watch_progress($i0, $NR90_no, $NR_passed, $NR_no, 0);

  #### checkpoint: every $restart_seg sequences, or after more than a tenth
  #### of the initial db length became representatives
  if ((($i0+1) % $restart_seg == 0) or ($DB_len_reduced2 > $DB_len0/10) ) {
    write_restart(); write_db_clstr(); remove_raw_blout_bg($i0);
    $DB_len_reduced2 = 0;
  }

  my $opt_aL_format_flag = 0;
  if ( ($opt_aL > 0.3) ) { #### formatdb maybe needed if current length of seq.i0 close to opt_aL_upper_band
    my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
    if ( ($opt_aL_upper_bandi - $i0) < $total_jobs ) { #### seqs left for possible submission < total_jobs

      my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
      my $d1 = $i0+$space;
      $d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
      $opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
      $opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
      $opt_aL_upper_bandi= $d1;
      $opt_aL_format_flag = 1;
      write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
    }
  }

  #### re-format the db periodically, after enough length was removed, or
  #### when the opt_aL band moved
  my $periodic_reformat = (($reformat_step > 0) and ((($i0+1) % $reformat_step) == 0));
  if ($periodic_reformat or ($DB_len_reduced > $DB_len/10) or $opt_aL_format_flag ) {
    ($DB_no, $DB_len) = blast_formatdb();
    $DB_len_reduced = 0;
  }
  #if ($formatdb_no - ($NR_no-$NR_passed) >= $reformat_seg) {blast_formatdb(); }
}
## END for ($i=0; $i<$NR_no; $i++)
watch_progress($NR_no-1, $NR90_no, $NR_passed, $NR_no, 1);
146
### print NR db: the representatives only, sequences wrapped at 70 columns
open(DBOUT, "> $db_out") || die "Can not write $db_out";
foreach $i (0 .. $NR_no-1) {
  next if ($idens[$i] ne "*");
  (my $seq = $seqs[$i]) =~ s/(.{70})/$1\n/g;
  $seq =~ s/\n$//; ### no empty line when the length is a multiple of 70
  print DBOUT "$dess[$i]\n$seq\n";
}
close(DBOUT);

### final restart file and cluster output, then clean up
write_restart();
write_db_clstr();
remove_blast_db();
close_LOG();
163
164
2525 our $DB_len0 = 0;
2626 our $DB_len_reduced = 0;
2727 our $DB_len_reduced2 = 0; #### for write_restart etc purpose
28
29 our $opt_aL_upper_band = 0; #### sequences < this length will not be submitted unless reformatdb
30 our $opt_al_upper_bandi= 0;
31 our $opt_aL_lower_band = 0; #### sequences < this length don't need to be searched
2832 my ($i, $j, $k, $i0, $j0, $k0, $ll);
2933
3034 read_db();
4044
4145 $i0 = 0;
4246 if ( -e $restart_in) { $i0 = read_restart(); } ## restart after crash
47 elsif ($skip_long > 0) { #### skip very long seqs
48 for (; $i0<$NR_no; $i0++) {
49 $i = $NR_idx[$i0];
50 last if ($lens[$i] < $skip_long);
51
52 $NR_passed++;
53 $NR_clstr_nos[$i] = $NR90_no;
54 $idens[$i] = "*";
55 $passeds[$i] = 1;
56 $NR90_seq[$NR90_no] = [$i];
57 $NR90_no++;
58 $DB_len_reduced += $lens[$i];
59 }
60 }
61
62 #### set init opt_aL_uppper/lower_bands
63 if ( ($opt_aL > 0.3) ) {
64 die ("option -aL > 1.0") if ($opt_aL > 1.0);
65
66 ####################
67 ###################
68 ##################
69 #################
70 ################
71 ############### <-upper band
72 ############## <- seq below not submit, unless band change
73 #############
74 ############
75 ###########
76 ########## <- lower band
77 ######### <- seq below not in format db
78 ########
79 #######
80 #####
81 ####
82 ###
83 ##
84 #
85 my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
86 my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
87 my $d1 = $i0+$space;
88 $d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
89 $opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
90 $opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
91 $opt_aL_upper_bandi= $d1;
92 write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
93 }
94
4395
4496 ($DB_no, $DB_len) = blast_formatdb();
4597 $DB_len0 = $DB_len;
4799 $DB_len_reduced2 = 0;
48100 for (; $i0<$NR_no; $i0++) {
49101 $i = $NR_idx[$i0];
50 run_batch_blast3($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);
102 run_batch_blast3_multi($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);
51103
52104 if ( not $passeds[$i] ) { # this is a new representative
53105 $NR_passed++;
55107 $idens[$i] = "*";
56108 $passeds[$i] = 1;
57109 $NR90_seq[$NR90_no] = [$i];
58 fish_other_homolog($i);
110 fish_other_homolog_multi($i);
59111 $NR90_no++;
60112 $DB_len_reduced += $lens[$i];
61113 $DB_len_reduced2 += $lens[$i];
63115
64116 watch_progress($i0, $NR90_no, $NR_passed, $NR_no, 0);
65117
66 if ((($i0+1) % $restart_seg == 0) or ($DB_len_reduced2 > $DB_len0/10) ) {
67 write_restart(); write_db_clstr(); remove_raw_blout_bg($i0);
68 $DB_len_reduced2 = 0;
118 if (($i0+1) % $restart_seg == 0 ) {
119 write_restart(); write_db_clstr();
69120 }
70 if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10)) {
121
122 my $opt_aL_format_flag = 0;
123 if ( ($opt_aL > 0.3) ) { #### formatdb maybe needed if current length of seq.i0 close to opt_aL_upper_band
124 my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
125 my $opt_aL_upper_band_old = $opt_aL_upper_band;
126 if ( ($opt_aL_upper_bandi - $i0) < $total_jobs ) { #### seqs left for possible submission < total_jobs
127
128 my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
129 my $d1 = $i0+$space;
130 $d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
131 $opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
132 if ($opt_aL_upper_band < $opt_aL_upper_band_old) {
133 $opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
134 $opt_aL_upper_bandi= $d1;
135 $opt_aL_format_flag = 1;
136 write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
137 }
138 }
139 }
140 if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10) or $opt_aL_format_flag ) {
71141 ($DB_no, $DB_len) = blast_formatdb();
72142 $DB_len_reduced = 0;
73143 }
0 #!/usr/bin/perl
1 ## =========================== NGS tools ==========================================
2 ## NGS tools for metagenomic sequence analysis
3 ## May also be used for other type NGS data analysis
4 ##
5 ## Weizhong Li, UCSD
6 ## liwz@sdsc.edu
7 ## http://weizhongli-lab.org/
8 ## ================================================================================
9
# Work out where this script lives so the cd-hit binaries two directories
# above it can be located.
my $script_name = $0;
my $script_dir  = $script_name;
$script_dir =~ s{[^/]+$}{};          # strip the file name, leaving "dir/"
chop($script_dir);                   # strip the trailing "/"
$script_dir = "./" if (not $script_dir);

use Getopt::Std;
getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
($opts{i} and $opts{j} and $opts{o} and $opts{d}) or die usage();

# scratch variables shared by the top-level code below
my ($i, $j, $k, $cmd);
my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
my ($len, $lena, $lenb);

# inputs and outputs
my $fastq        = $opts{i};                 # R1 reads
my $fastq2       = $opts{j};                 # R2 reads
my $ref          = $opts{d};                 # full-length reference fasta
my $output       = $opts{o};                 # output prefix
my $trim_R1      = $opts{p} || 100;          # length of forward segment
my $trim_R2      = $opts{q} || 100;          # length of reverse segment
my $clstr_cutoff = $opts{c};                 #### post clustering
my $full_frag    = $opts{S};                 # also write the full fragment
my $prime_len    = 45;                       # leading read letters used for the consensus
my $output_R1    = "$output-R1";
my $output_R2    = "$output-R2";
my $session      = "OTU-session-$$";
my $output_S     = "$output-single";
my $consensus_db = "$output-consensus";

my $cd_hit_2d  = "$script_dir/../../cd-hit-est-2d";
(-e $cd_hit_2d)  or die "no $cd_hit_2d";
my $cd_hit_est = "$script_dir/../../cd-hit-est";
(-e $cd_hit_est) or die "no $cd_hit_est";

my $format      = input_test($fastq);        #fasta or fastq
my $cdhit_opt_M = defined($opts{M}) ? $opts{M} : 16000;

if (defined($clstr_cutoff) and not (($clstr_cutoff <=1.0) and ($clstr_cutoff>=0.97))) {
  die "Clustering cutoff $clstr_cutoff is not reasonable, should be <=1.0 and >= 0.97";
}
45
my %FHZ=();      # per-handle state used by open_files_z_safe / read_FHZ

# For each read file (R1, then R2): build the consensus of the first
# $prime_len letters of all reads, align that consensus against the
# reference with cd-hit-est-2d, and remember where it hit each reference.
#   $ref_map{ref_id}{"R1"|"R2"} = [query_start, query_end, rep_start, rep_end, strand]
my %ref_map = ();
foreach my $f (($fastq, $fastq2)) {
  my $R = ( $f eq $fastq ) ? "R1" : "R2";
  open(OUT, "> $consensus_db.$R") || die "can not write to $consensus_db.$R";

  my %con = ();        # $con{position}{letter} = number of reads
  my $num_seq = 0;

  # add the first $prime_len letters of one read to the tally
  my $tally = sub {
    my ($read) = @_;
    for (my $p=0; $p<$prime_len; $p++) {
      $con{$p}{ uc(substr($read, $p, 1)) }++;
    }
    $num_seq++;
  };

  open_files_z_safe("TTTa", $f);

  if ($format eq "fastq") {
    while(1) {
      ($lla, $ida, $seqa, $quaa, $lena) = read_next_fastq("TTTa");
      last unless ($lla);
      $tally->($seqa);
    }
  }
  else { #### fasta
    my $seqa = "";
    while($ll = <TTTa>) {
      if ($ll =~ /^>/) {
        $tally->($seqa) if ($seqa);
        $seqa = "";
      }
      else {
        chop($ll);
        $seqa .= $ll;
      }
    }
    $tally->($seqa) if ($seqa);      # last record
  } #### END fasta

  close(TTTa);

  # without this check the division below dies with a bare
  # "Illegal division by zero" when the input holds no reads
  die "no sequences found in $f" unless ($num_seq > 0);

  my @cons = ();   #which letter
  my @cons_v = (); #abundance
  for ($i=0; $i<$prime_len; $i++) {
    my %t = %{ $con{$i} };
    my @k = sort { $t{$b} <=> $t{$a} } keys %t;
    push(@cons, $k[0]);
    push(@cons_v, $t{ $k[0] } / $num_seq);
  }

  ## cut the consensus where three consecutive positions fall below 75%
  ## agreement; the scan starts at index 33, so at least 31 letters are kept.
  ## If no such run exists $i ends at $prime_len and nothing is cut.
  for ($i=33; $i<$prime_len; $i++) {
    if ( ($cons_v[$i  ] <0.75) and
         ($cons_v[$i-1] <0.75) and
         ($cons_v[$i-2] <0.75) ) {
      $i = $i-2; last;
    }
  }
  my $trim_len_new = $i;

  print OUT ">$R\n";
  for ($i=0; $i<$trim_len_new; $i++) {
    print OUT $cons[$i];
  }
  print OUT "\n";
  close(OUT);

  my $cmd_line = "$cd_hit_2d -i $consensus_db.$R -i2 $ref -d 0 -c 0.8 -n 5 -r 1 -p 1 -b 5 -o $session.$R-vs-ref -G 0 -A 30 -s2 0.01 -M $cdhit_opt_M > $session.$R-vs-ref.log";
  print "running $cmd_line\n";
  $cmd = `$cmd_line`;

  # Example of the .clstr file parsed below (R1 hits are "+", R2 hits "-"):
  #   >Cluster 0
  #   0 45nt, >R1... *
  #   1 1479nt, >1111882... at 1:42:4:45/+/95.24%
  #   2 1500nt, >1111856... at 1:42:4:45/+/88.10%
  #   ...
  #   >Cluster 0
  #   0 45nt, >R2... *
  #   1 1428nt, >1111883... at 483:440:2:45/-/84.09%
  #   2 1479nt, >1111882... at 511:468:2:45/-/88.64%
  open(TMP, "$session.$R-vs-ref.clstr") || die "can not open $session.$R-vs-ref.clstr";
  while($ll=<TMP>){
    next if ($ll =~ /^>/);      # cluster header
    next if ($ll =~ /^0/);      # member 0 is the consensus itself
    chop($ll);
    if ($ll =~ /^\d+\s+\d+(aa|nt), >(.+)\.\.\./) {
      my $id = $2;
      my @fields = split(/\s+/, $ll);
      my @aln    = split(/\//, $fields[-1]); ##516:473:2:45/-/84.09%
      my ($query_start, $query_end, $rep_star, $rep_end) = split(/:/, $aln[0]);
      $ref_map{$id}{$R}=[$query_start, $query_end, $rep_star, $rep_end, $aln[1]];
    }
  }
  close(TMP);
}
170
# $ref_cut{ref_id} = [p1, p2]: 1-based reference positions matching the first
# letter of the R1 and of the R2 consensus. Only references hit by R1 on the
# "+" strand and by R2 on the "-" strand are kept.
my %ref_cut;
foreach my $ref_id (keys %ref_map) {
  my $hit_R1 = $ref_map{$ref_id}{"R1"};
  next unless (defined $hit_R1);
  my $hit_R2 = $ref_map{$ref_id}{"R2"};
  next unless (defined $hit_R2);

  my ($q1_start, undef, $c1_start, undef, $strand1) = @{$hit_R1};
  my ($q2_start, undef, $c2_start, undef, $strand2) = @{$hit_R2};
  next if (($strand1 ne "+") or ($strand2 ne "-"));

  $ref_cut{$ref_id} = [
    $q1_start - ($c1_start-1),   #### 1-based, can be -1 value for V1
    $q2_start + ($c2_start-1),   #### 1-based, can be longer than len($seq)
  ];
}
186
# Walk through the reference fasta and, for every sequence with an entry in
# %ref_cut, write the forward segment (OUT1), the reverse-complemented reverse
# segment (OUT2) and, with -S, the whole fragment between them (OUT3).
open(TMP, $ref) || die "can not open $ref";
open(OUT1, "> $output_R1") || die "can not write to $output_R1";
open(OUT2, "> $output_R2") || die "can not write to $output_R2";
if ($full_frag) {
  open(OUT3, "> $output_S") || die "can not write to $output_S";
}

# Cut one reference record.
#   $des     fasta header line (with the leading ">")
#   $seq     raw sequence text, may still contain newlines
#   $p1,$p2  1-based positions of the first letter of R1 / R2 on the reference;
#            $p1 may be < 1 and $p2 may exceed the sequence length, in which
#            case the missing letters are written as N
my $write_cut = sub {
  my ($des, $seq, $p1, $p2) = @_;
  $seq =~ s/\s//g;
  my $len = length($seq);
  my $seq1 = "";
  my $seq2 = "";

  if ($p1>=1) {
    $seq1 = substr($seq, $p1-1, $trim_R1);
  }
  else {
    my $pad = 1 - $p1;  #### add NNN at 5'
    $seq1 = "N" x $pad;
    # a negative length would make substr count from the end of $seq
    $seq1 .= substr($seq, 0, $trim_R1-$pad) if ($trim_R1 > $pad);
  }

  if ($p2 <= $len) {
    my $p2a = $p2 - $trim_R2;         #### 0 - based substr idx
    $seq2 = ($p2a < 0) ? substr($seq, 0, $p2)             #### not long enough
                       : substr($seq, $p2a, $trim_R2);
  }
  else {
    # The R2 read starts beyond the end of the reference. On the forward
    # strand the missing letters are to the right of the tail, so the N's
    # are appended; after reverse_complement they sit at the 5' end of R2.
    my $pad = $p2 - $len;
    my $trim_R2_t = $trim_R2 - $pad;  # letters still taken from the reference
    if ($trim_R2_t > 0) {
      my $p2a = $len - $trim_R2_t;    #### 0 - based substr idx
      $seq2 = ($p2a < 0) ? $seq                           #### not long enough
                         : substr($seq, $p2a, $trim_R2_t);
    }
    $seq2 .= "N" x $pad;
  }
  $seq2 = reverse_complement($seq2);

  ### now have $seq1 $seq2
  print OUT1 "$des loc=$p1 len=", length($seq1), "\n$seq1\n";
  print OUT2 "$des loc=$p2 len=", length($seq2), "\n$seq2\n";
  if ($full_frag) {
    # clamp to the real sequence: the full fragment carries no N padding
    if ($p1 < 1   ) {$p1 = 1;   }
    if ($p2 > $len) {$p2 = $len;}
    my $eff_len = $p2-$p1+1;
    my $frag = substr($seq, $p1-1, $eff_len);
    print OUT3 "$des loc=$p1:$p2 len=$eff_len\n$frag\n";
  }
};

my $des;
$seq = "";
$id = "";

while($ll = <TMP>) {
  if ($ll =~ /^>/) {
    # a new header closes the previous record
    $write_cut->($des, $seq, @{$ref_cut{$id}}) if ($seq and $ref_cut{$id});
    chop($ll);
    $des = $ll;
    $id = substr($ll,1);
    $id =~ s/\s.+$//;
    $seq = "";
  }
  else {
    $seq .= $ll;
  }
}
# last record of the file
$write_cut->($des, $seq, @{$ref_cut{$id}}) if ($seq and $ref_cut{$id});

close(OUT1);
close(OUT2);
if ($full_frag) { close(OUT3); }
close(TMP);
317
# Optional post-clustering (-c): collapse redundant reference segments with
# cd-hit-est and replace the output files by the clustered versions.
if (defined($clstr_cutoff)) {
  my $common_opts = "-d 0 -c $clstr_cutoff -n 10 -p 1 -b 5";

  # paired-end segments
  my $output_R1_tmp = "$output_R1.$$";
  my $output_R2_tmp = "$output_R2.$$";
  my $cmd_line = "$cd_hit_est -i $output_R1 -j $output_R2 $common_opts"
               . " -o $output_R1_tmp -op $output_R2_tmp -G 1 -g 1 -M $cdhit_opt_M -P 1 -l 11 -sc 1 > $output_R1_tmp.log";
  print "running $cmd_line\n";
  $cmd = `$cmd_line`;

  # a missing .clstr file is taken as proof that cd-hit-est did not run
  (-e "$output_R1_tmp.clstr") or die "Can not run $cd_hit_est";
  foreach my $move ([$output_R1_tmp,         $output_R1],
                    [$output_R2_tmp,         $output_R2],
                    ["$output_R1_tmp.clstr", "$output.clstr"]) {
    my ($from, $to) = @{$move};
    $cmd = `mv $from $to`;
  }

  # full fragments, only when -S was given
  if ($full_frag) {
    my $output_S_tmp = "$output_S.$$";
    my $cmd_line = "$cd_hit_est -i $output_S $common_opts"
                 . " -o $output_S_tmp -G 1 -g 1 -M $cdhit_opt_M -l 11 -sc 1 > $output_S_tmp.log";
    print "running $cmd_line\n";
    $cmd = `$cmd_line`;
    (-e "$output_S_tmp.clstr") or die "Can not run $cd_hit_est";
    foreach my $move ([$output_S_tmp,         $output_S],
                      ["$output_S_tmp.clstr", "$output_S.clstr"]) {
      my ($from, $to) = @{$move};
      $cmd = `mv $from $to`;
    }
  }
}

# remove the temporary cd-hit-est-2d session files
$cmd = `rm -f $session*`;
345
# need %FHZ
# open one or more files including zipped files
# above open_files_z may have broken pipe problem
# so this safe sub, open each file individually

# Open one (possibly compressed) file for reading on handle $fh.
# $fh is a handle name (string); .gz goes through "gunzip -c", .bz2 through
# "bzcat". The list form of the pipe open and the 3-argument file open keep
# the shell and open()'s mode parsing away from the file name.
sub _open_z_file {
  my ($fh, $f0) = @_;
  if    ($f0 =~ /\.gz$/ ) { open($fh, "-|", "gunzip", "-c", $f0) || die "can not gunzip -c $f0\n"; }
  elsif ($f0 =~ /\.bz2$/) { open($fh, "-|", "bzcat", $f0)        || die "can not bzcat $f0\n"; }
  else                    { open($fh, "<", $f0)                  || die "can not open $f0\n"; }
  return;
}

# Register @files under handle name $fh in %FHZ and open the first of them.
# The remaining files are opened one after another by read_FHZ.
# Always returns 0; dies when the first file can not be opened.
sub open_files_z_safe {
  my ($fh, @files) = @_;

  $FHZ{$fh} = {
    'files'    => [@files],
    'no'       => scalar(@files),
    'open_idx' => 0,              # index of the file currently open
  };

  _open_z_file($fh, $files[0]);
  return 0;
}
########## END open_files_z_safe


# Return the next line from the file list registered for $fh, moving on to
# the next file whenever the current one is exhausted. Empty files in the
# middle of the list are skipped. Returns undef after the last file.
sub read_FHZ {
  my $fh = shift;

  my $ll = <$fh>;
  return $ll if (defined($ll)); ##### read from existing opened file

  #otherwise, last opened file reaches EOF
  while ($FHZ{$fh}->{open_idx} < $FHZ{$fh}->{no} -1 ) { ### still file not opened yet
    close($fh); #### close last open file

    $FHZ{$fh}->{open_idx}++;
    _open_z_file($fh, $FHZ{$fh}->{files}->[ $FHZ{$fh}->{open_idx} ]);

    $ll = <$fh>;
    return $ll if (defined($ll));
  }

  #### no more file to open, return undef
  return undef;
}
########### END read_FHZ
397
398
########## read_next_fastq
# Read one 4-line fastq record through read_FHZ from the handle named $fh.
# Returns (header, id, sequence, quality, sequence length), where header is
# the "@..." line cut at the first whitespace and id is the header without
# its first character. Returns nothing once read_FHZ has no more lines.
sub read_next_fastq {
  my ($handle) = @_;

  my $header = read_FHZ($handle);
  return unless ($header);
  chop($header);
  $header =~ s/\s.+$//;
  my $read_id = substr($header, 1);

  my $bases = read_FHZ($handle);
  $bases =~ s/\s+$//g;

  read_FHZ($handle);                 # "+" separator line, ignored

  my $quals = read_FHZ($handle);
  $quals =~ s/\s+$//g;

  return ($header, $read_id, $bases, $quals, length($bases));
}
########## END read_next_fastq
412
413
# Return the reverse complement of a nucleotide string.
# Upper- and lower-case letters keep their case. Besides A/C/G/T the IUPAC
# ambiguity codes are complemented as well (M<->K, R<->Y, V<->B, H<->D;
# W, S and N map to themselves) and U is treated like T. Any other
# character is only reversed.
sub reverse_complement {
  my ($in_seq) = @_;
  my $opposite = reverse $in_seq;
  $opposite =~ tr/ACGTUMRWSYKVHDBNacgtumrwsykvhdbn/TGCAAKYWSRMBDHVNtgcaakywsrmbdhvn/;
  return("$opposite");
}
420
421
# Guess the format of a read file from its first character:
# ">" means "fasta", anything else (including an empty file) "fastq".
# .gz and .bz2 files are decompressed first, as open_files_z_safe does,
# so compressed fasta input is no longer reported as fastq.
# Dies when the file can not be opened.
sub input_test {
  my $f = shift;
  my $fh;
  if    ($f =~ /\.gz$/ ) { open($fh, "-|", "gunzip", "-c", $f) || die "can not gunzip -c $f\n"; }
  elsif ($f =~ /\.bz2$/) { open($fh, "-|", "bzcat", $f)        || die "can not bzcat $f\n"; }
  else                   { open($fh, "<", $f)                  || die "can not open $f\n"; }
  my $ll = <$fh>;
  close($fh);

  my $c = defined($ll) ? substr($ll,0,1) : "";
  return ($c eq ">") ? "fasta" : "fastq";
}
########## END input_test
433
434
# usage: return the command-line help text (the here-document below) as one
# string. The top-level code passes it to die() when one of the required
# options -i, -j, -o, -d is missing.
435 sub usage {
436 <<EOD;
437 This script takes a paired-end (PE) read files (Fastq or Fasta) for a 16S dataset, e.g. from V3-V4
438 region, it also takes a Fasta file of full-length 16S reference database, e.g. Greengene.
439 this script identifies the sequencing region on the reference sequencs and it cuts the forward
440 and reverse segments and outputs them in PE fasta files. The output PE reference database can be used
441 to cluster together with 16S datasets
442
443 Options:
444 ======================
445 -i input fasta or fastq file for R1
446 -j input fasta or fastq file for R2
447 -d 16S reference sequence file in fasta format
448 -o output prefix
449 -p lenght of forward sequence in output file
450 -q length of reverse sequence in output file
451 -S also output full fragment
452 -c cutoff for clustering the output PE files to remove redundant reference seqeunces.
453 Suggested cutoffs: 1.00, 0.99, 0.98 and 0.97
454 The script will not cluster the output unless user specifies this cutoff.
455 -M available memory to use, default 16000, means 16000MB. This option will be passed to cd-hit.
456 EOD
457 }
0 #!/usr/bin/perl
1 ################################################################################
2 # NGS workflow by Weizhong Li, http://weizhongli-lab.org
3 ################################################################################
4
5 ########## local variables etc. Please edit
# Site-specific install locations. Both values are interpolated into the job
# command templates further down in this configuration file.
6 $CD_HIT_dir = "/home/oasis/data/etc/git/cdhit";
7 $NGS_prog_trimmomatic = "/home/oasis/data/NGS-ann-project/apps/Trimmomatic/trimmomatic-0.32.jar";
8
9
10 ########## computation resources for execution of jobs
# %NGS_executions maps an execution id to a description of where jobs run.
# Each job below selects one of these ids through its "execution" key.
11 %NGS_executions = ();
# "qsub_1": jobs are submitted with the "qsub" command. The "template" text
# holds "#PBS" and "#$" directive lines that export PATH and the environment
# (presumably for PBS and SGE schedulers respectively -- confirm).
12 $NGS_executions{"qsub_1"} = {
13 "type" => "qsub-pe",
14 "cores_per_node" => 8,
15 "number_nodes" => 64,
16 "user" => "weizhong", #### I will use command such as qstat -u weizhong to query submitted jobs
17 "command" => "qsub",
18 "command_name_opt" => "-N",
19 "command_err_opt" => "-e",
20 "command_out_opt" => "-o",
21 "template" => <<EOD,
22 #!/bin/sh
23 #PBS -v PATH
24 #PBS -V
25
26 #\$ -v PATH
27 #\$ -V
28
29 EOD
30 };
31
32
# "sh_1": execution of type "sh" on a single node with 8 cores; it defines
# no submit command and no template.
33 $NGS_executions{"sh_1"} = {
34 "type" => "sh",
35 "cores_per_node" => 8,
36 "number_nodes" => 1,
37 };
38
# "qc" job: quality control of one paired-end sample.
#   1. Trimmomatic PE on the two sample files (\DATA.0, \DATA.1) with 4
#      threads; CMD_opts[0] ("100") replaces \CMDOPTS.0, the MINLEN value.
#   2. The trimmed paired reads R1.fq / R2.fq are converted to fasta with
#      headers ">Sample|<sample>|<running number> <original header>".
#   3. All intermediate fastq files are removed.
# The \DATA, \SELF, \SAMPLE and \CMDOPTS placeholders are left for the
# workflow driver to substitute; they are not expanded in this file.
39 $NGS_batch_jobs{"qc"} = {
40 "CMD_opts" => ["100"],
41 "execution" => "sh_1", # where to execute
42 "cores_per_cmd" => 4, # number of threads used by command below
43 "no_parallel" => 1, # number of total jobs to run using command below
44 "command" => <<EOD,
45 java -jar $NGS_prog_trimmomatic PE -threads 4 -phred33 \\DATA.0 \\DATA.1 \\SELF/R1.fq \\SELF/R1-s.fq \\SELF/R2.fq \\SELF/R2-s.fq \\
46 SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
47
48 perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
49 perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R2.fq > \\SELF/R2.fa &
50
51 wait
52 rm -f \\SELF/R1.fq \\SELF/R2.fq \\SELF/R1-s.fq \\SELF/R2-s.fq
53 EOD
54 };
55
56
# "otu" job: per-sample OTU clustering of the fasta files written by "qc".
# CMD_opts, by index, as used in the command text below:
#   0, 1  passed as -cx / -cy to the paired-end cd-hit-est runs
#   2     not referenced by the command (the 0.97 cutoff is written literally)
#   3     passed as -c to filter-chimeric-and-small.pl
#   4, 5  R1 and R2 files of the spliced reference database (-i2 / -j2);
#         index 5 must name the R2 file, it used to repeat the R1 path
#   6     passed as -cx to the two single-end chimera clustering runs
$NGS_batch_jobs{"otu"} = {
  "injobs"        => ["qc"],
  "CMD_opts"      => ["150", "100", "0.97", "0.0001", "path_to_spliced_ref_db-R1", "path_to_spliced_ref_db-R2", "75"],
  "execution"     => "sh_1",  # where to execute
  "cores_per_cmd" => 2,       # number of threads used by command below
  "no_parallel"   => 1,       # number of total jobs to run using command below
  "command"       => <<EOD,
#### cluster at 100% PE
$CD_HIT_dir/cd-hit-est -i \\INJOBS.0/R1.fa -j \\INJOBS.0/R2.fa -o \\SELF/seq.nr -op \\SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.nr.log
#### cluster at 99% PE and SE for R1,R2
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr -o \\SELF/seq.chimeric-clstr.R1 -r 0 -cx \\CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.chimeric-clstr.R1.log
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr.2 -o \\SELF/seq.chimeric-clstr.R2 -r 0 -cx \\CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.chimeric-clstr.R2.log
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr -j \\SELF/seq.nr.2 -o \\SELF/seq.99 -op \\SELF/seq.99.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.99.log
$CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c \\CMDOPTS.3 -k \\SELF/seq.nr.clstr \\
-i \\SELF/seq.chimeric-clstr.R1.clstr -j \\SELF/seq.chimeric-clstr.R2.clstr \\
-a \\SELF/seq.99.clstr -f \\SELF/seq.99 -g \\SELF/seq.99.2 -o \\SELF/seq.99f
$CD_HIT_dir/clstr_rev.pl \\SELF/seq.nr.clstr \\SELF/seq.99f.clstr > \\SELF/seq.99f-all.clstr
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.99f -j \\SELF/seq.99f.2 -o \\SELF/seq.97 -op \\SELF/seq.97.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.97.log
$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.97 -j \\SELF/seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5 -o \\SELF/seq.97.ref -op \\SELF/seq.97.ref.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.97.ref.log
$CD_HIT_dir/clstr_rev.pl \\SELF/seq.99f-all.clstr \\SELF/seq.97.clstr > \\SELF/seq.97-all.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.97.ref.clstr > \\SELF/seq.97.reftop.clstr
$CD_HIT_dir/clstr_merge.pl \\SELF/seq.97-all.clstr \\SELF/seq.97.reftop.clstr > \\SELF/OTU.clstr

rm -f \\SELF/seq.chimeric-clstr.R1 \\SELF/seq.chimeric-clstr.R1.log \\SELF/seq.chimeric-clstr.R2 \\SELF/seq.chimeric-clstr.R2.log
rm -f \\SELF/seq.97.ref \\SELF/seq.97.ref.2 \\SELF/seq.97.ref.log
mv \\SELF/seq.99f.log \\SELF/chimeric-small-clusters-list.txt

EOD
};
90
91
# "otu-pooled" job: the 97% clustering and reference annotation steps of the
# "otu" job, run on pooled files in the current directory (see the
# "before running" note inside the command).
# CMD_opts, by index, as used in the command text below:
#   0, 1  passed as -cx / -cy to cd-hit-est and cd-hit-est-2d
#   4, 5  R1 and R2 files of the spliced reference database (-i2 / -j2);
#         index 5 must name the R2 file, it used to repeat the R1 path
#   2, 3, 6  not referenced by this command
$NGS_batch_jobs{"otu-pooled"} = {
  "CMD_opts"      => ["150", "100", "0.97", "0.0001", "path_to_spliced_ref_db-R1", "path_to_spliced_ref_db-R2", "75"],
  "execution"     => "sh_1",  # where to execute
  "cores_per_cmd" => 2,       # number of threads used by command below
  "no_parallel"   => 1,       # number of total jobs to run using command below
  "command"       => <<EOD,
#### before running
#### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt
$CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.log
$CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.ref.log
$CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
$CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
$CD_HIT_dir/usecases/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log

EOD
};
112
113 ##############################################################################################
114 ########## END
115 1;
116
0 #!/usr/bin/perl
1 # =============================== NG-Omics-WF ==================================
2 # _ _ _____ ____ _ __ ________
3 # | \ | |/ ____| / __ \ (_) \ \ / / ____|
4 # | \| | | __ ______| | | |_ __ ___ _ ___ ___ _____\ \ /\ / /| |__
5 # | . ` | | |_ |______| | | | '_ ` _ \| |/ __/ __|______\ \/ \/ / | __|
6 # | |\ | |__| | | |__| | | | | | | | (__\__ \ \ /\ / | |
7 # |_| \_|\_____| \____/|_| |_| |_|_|\___|___/ \/ \/ |_|
8 #
9 # =========================== Next Generation Omics data workflow tools ========
10 #
11 # Workflow tools for next generation genomics, metagenomics, RNA-seq
12 # and other types of omics data analysis,
13 #
14 # Software originally developed since 2010 by Weizhong Li at UCSD
15 # currently at JCVI
16 #
17 # http://weizhongli-lab.org/ngomicswf liwz@sdsc.edu
18 # ==============================================================================
19
20 use Getopt::Std;
21 use POSIX;
22
# Command-line options. -i (workflow configuration) is required together
# with either -s (sample file) or -S (samples given on the command line).
23 getopts("i:R:s:J:Q:r:j:Z:t:S:T:",\%opts);
24 die usage() unless ($opts{i} and ($opts{s} or $opts{S}));
25
26 my $sample_in = $opts{s};
27 my $sample_command_in = $opts{S}; #### ',' delimited samples (the parser below splits on ','), ':' delimited entries, e.g. sample1:R1.fq:R2.fq,sample2:R1.fq:R2.fq or sample1,sample2,sample3
28 my $input_conf = $opts{i};
29 my $this_task = $opts{J}; #### optional task name: log-cpu, list-jobs, snapshot, delete-jobs or write-sh
30 our $G_NGS_root = $opts{r};
31 my $queue_system = $opts{Q}; $queue_system = "SGE" unless $queue_system;
32 my $subset_wfs = $opts{R}; #### ',' delimited keys of %NGS_batch_sets
33 my $subset_jobs = $opts{j}; #### ',' delimited job ids
34 my $second_opt = $opts{Z}; #### argument handed to task_delete_jobs
35 my $opt_file = $opts{t}; #### file with CMDOPT lines overriding the CMD_opts of jobs
36 my $opt_command_in = $opts{T}; #### ',' delimited jobs (the parser below splits on ','), ":" delimited entries, e.g. JobID_A:opt0:opt1:opt2,JobID_B:opt0:opt1
37
# Working directories and files, all below the directory the script was started in.
38 my $pwd = `pwd`; chop($pwd);
39 my $sleep_time_min = 15;
40 my $sleep_time_max = 120;
41 my $log_dir = "$pwd/WF-LOG";
42 my $log_file = "$log_dir/LOG";
43 my $log_fileq = "$log_dir/LOGq";
44 my $sh_dir = "$pwd/WF-sh";
45 my $sh_bundle_dir = "$pwd/WF-sh-bundle";
46 my $subset_flag = 0; #### run only one job, subset of jobs, or jobs in sub workflows
47 my %subset_jobs = ();
48 my %qstat_xml_data = ();
49 my ($i, $j, $k, $ll, $cmd);
50
51 ######## scan through WF configration
52 ######## and generate job list
# The configuration is a Perl file; loading it is expected to define the
# %NGS_executions and %NGS_batch_jobs hashes used below.
53 require $input_conf;
54 my %job_list = (); # as $job_list{$t_job_id}{$t_sample_id} = {};
55 my ($t_sample_id, $t_job_id, $t_execution_id);
56 my ($t_sample, $t_job, $t_execution);
57 task_level_jobs();
# job ids ordered by their 'job_level' value, ties broken alphabetically
58 my @NGS_batch_jobs = sort {($NGS_batch_jobs{$a}->{'job_level'} <=> $NGS_batch_jobs{$b}->{'job_level'}) or ($a cmp $b)} keys %NGS_batch_jobs;
59
# create the log and script directories on first use and open the log for appending
60 $cmd = `mkdir -p $log_dir` unless (-e $log_dir);
61 $cmd = `mkdir -p $sh_dir` unless (-e $sh_dir);
62 $cmd = `mkdir -p $sh_bundle_dir` unless (-e $sh_bundle_dir);
63 open(LOG, ">> $log_file") || die "can not write to $log_file";
64
######## parse NGS_samples
# Samples come either from a file (-s: one sample per line, whitespace
# separated "id data0 data1 ...") or from the command line (-S: samples
# separated by ',', fields by ':').
my %NGS_sample_data = ();   # sample id => [data fields]
my @NGS_samples = ();       # sample ids in input order

# remember one sample and create its working directory if it is missing
my $add_sample = sub {
  my ($id, @data) = @_;
  push(@NGS_samples, $id);
  $NGS_sample_data{$id} = [@data];
  if (not (-e $id)) { $cmd = `mkdir $id`;}
};

if (defined($sample_in)) {
  open(TMP, $sample_in) || die "can not open $sample_in";
  while($ll=<TMP>){
    next if ($ll =~ /^#/);
    next unless ($ll =~ /^\w/);
    # chomp, not chop: a last line without newline must keep its final character
    chomp($ll);
    $add_sample->(split(/\s+/,$ll));
  }
  close(TMP);
}
elsif (defined($sample_command_in)) {
  foreach my $entry (split(/,/, $sample_command_in)) {
    $add_sample->(split(/:/, $entry));
  }
}
else {
  die "no input samples";
}
92
# %CMD_opts: job id => [opt0, opt1, ...]; these replace the CMD_opts of the
# job configuration. Read from the -t file when it exists, otherwise from
# the -T command-line string (jobs separated by ',', fields by ':').
my %CMD_opts = ();
if (defined($opt_file) and (-e $opt_file)) {
  ##format example
  ##CMDOPT JobID_A:opt0:opt1:opt2
  ##CMDOPT JobID_B:opt0:opt1
  ##CMDOPT JobID_C:opt0:opt1:opt2:opt3
  open(TMP, $opt_file) || die "can not open $opt_file";
  while($ll = <TMP>){
    next if ($ll =~ /^#/);
    next unless ($ll =~ /^CMDOPT/);
    # chomp, not chop: a last line without newline must keep its final character
    chomp($ll);
    my ($tag, $opt1) = split(/\s+/, $ll);
    my ($job_id, @opts) = split(/:/, $opt1);
    $CMD_opts{$job_id} = [@opts];
  }
  close(TMP);
}
elsif ($opt_command_in) {
  foreach my $entry (split(/,/, $opt_command_in)) {
    my ($job_id, @opts) = split(/:/, $entry);
    $CMD_opts{$job_id} = [@opts];
  }
}
117
########## processing subset of jobs
# -R selects every job of the named sub-workflows (%NGS_batch_sets),
# -j selects individual jobs and then calls add_subset_jobs_by_dependency().
# Either option switches the run into subset mode.
if ($subset_wfs) {
  $subset_flag = 1;
  foreach my $wf_id (split(/,/, $subset_wfs)) {
    my @wf_jobs = @{ $NGS_batch_sets{$wf_id}->{"jobs"} };
    @subset_jobs{@wf_jobs} = (1) x scalar(@wf_jobs);
  }
}
if ($subset_jobs) {
  $subset_flag = 1;
  my @picked = split(/,/, $subset_jobs);
  @subset_jobs{@picked} = (1) x scalar(@picked);
  add_subset_jobs_by_dependency();
}
write_log("Running subset of jobs: " . join(" ", keys %subset_jobs)) if ($subset_flag);
137
# For every job of the configuration (restricted to %subset_jobs in subset
# mode) and every sample:
#   - check the requested cores/nodes against the job's execution environment;
#     problems are logged and set $verify_flag, which stops the run later
#   - expand the placeholders of the command template
#   - record the job in %job_list
#   - write the job's shell script unless it already exists
my $verify_flag = 0;
foreach $t_job_id (keys %NGS_batch_jobs) {
  if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
  $t_job = $NGS_batch_jobs{$t_job_id};
  $t_execution = $NGS_executions{ $t_job->{"execution"} };

  # a job without "cores_per_cmd" is treated as single-core
  my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
  $t_cores_per_cmd = 1 unless ($t_cores_per_cmd);

  my $pe_parameter = ""; #### setup pe parameters
  if ($t_execution->{'type'} eq "qsub-pe") {
    $pe_parameter = "#\$ -pe orte $t_cores_per_cmd";
  }

  if ($t_job->{"cores_per_cmd"} > $t_execution->{"cores_per_node"} ) {
    $verify_flag = 1;
    write_log("$t_job_id needs $t_job->{\"cores_per_cmd\"} cores, but $t_job->{\"execution\"} only has $t_execution->{\"cores_per_node\"} cores");
  }

  # $cmds_per_node is 0 when one command needs more cores than a node has.
  # That case is already flagged above; avoid dividing by zero so that the
  # logged message, not a Perl error, is what the user gets to see.
  my $cmds_per_node = POSIX::floor( $t_execution->{"cores_per_node"} / $t_cores_per_cmd);
  my $nodes_total   = ($cmds_per_node > 0) ? POSIX::ceil($t_job->{"no_parallel"} / $cmds_per_node)
                                           : $t_job->{"no_parallel"};
  $t_job->{"cmds_per_node"} = $cmds_per_node;
  $t_job->{"nodes_total"}   = $nodes_total;

  if ($t_job->{"nodes_total"} > $t_execution->{"number_nodes"}) {
    $verify_flag = 1;
    write_log("$t_job_id needs $t_job->{\"nodes_total\"} nodes, but $t_job->{\"execution\"} only has $t_execution->{\"number_nodes\"} nodes");
  }

  my @CMD_opts = ();
  @CMD_opts = @{$t_job->{CMD_opts}} if (defined($t_job->{CMD_opts} ));
  @CMD_opts = @{$CMD_opts{$t_job_id}} if (defined($CMD_opts{$t_job_id})); #### command line take over default

  foreach $t_sample_id (@NGS_samples) {
    my @t_commands = split(/\t/, $t_job->{"command"});
    my $t_command = "";
    foreach my $c0 (@t_commands) {
      my $c1 = $c0;
      $c1 =~ s/\\SAMPLE/$t_sample_id/g;
      $c1 =~ s/\\SELF/$t_job_id/g;
      # Numbered placeholders, indices 0..19. The two-digit alternative is
      # tried first so that e.g. \INFILES.12 is not consumed as \INFILES.1
      # followed by a literal "2".
      $c1 =~ s/\\INFILES\.(1\d|\d)/$t_job->{"infiles"}->[$1]/g;
      $c1 =~ s/\\DATA\.(1\d|\d)/$NGS_sample_data{$t_sample_id}->[$1]/g;
      $c1 =~ s/\\INJOBS\.(1\d|\d)/$t_job->{"injobs"}->[$1]/g;
      $c1 =~ s/\\CMDOPTS\.(1\d|\d)/$CMD_opts[$1]/g;
      $t_command .= "$c1\n";
    }


    my @t_infiles = map { "$t_sample_id/$_" } @{$t_job->{"infiles"}};
    my @t_injobs = @{$t_job->{"injobs"}};
    my $t_sh_file = "$sh_dir/$t_job_id.$t_sample_id.sh";
    my $f_start = "$pwd/$t_sample_id/$t_job_id/WF.start.date";
    my $f_complete = "$pwd/$t_sample_id/$t_job_id/WF.complete.date";
    my $f_cpu = "$pwd/$t_sample_id/$t_job_id/WF.cpu";
    $job_list{$t_job_id}{$t_sample_id} = {
      'sample_id' => $t_sample_id,
      'job_id' => $t_job_id,
      'status' => 'wait', #### status can be wait (input not ready), ready (input ready), submitted (submitted or running), completed
      'command' => $t_command,
      'sh_file' => $t_sh_file,
      'infiles' => [@t_infiles],
      'injobs' => [@t_injobs],
      'start_file' => $f_start,
      'complete_file'=> $f_complete,
      'cpu_file' => $f_cpu,
    };

    # shell checks appended after the command: stop before the completion
    # stamp is written when an expected output file is missing or empty
    my $v_command = "";
    foreach my $vf (@{$t_job->{"non_zero_files"}}) {
      $v_command .= "if ! [ -s $t_job_id/$vf ]; then echo \"zero size $t_job_id/$vf\"; exit; fi\n";
    }


    if (not -e $t_sh_file) {
      write_log("Write sh file to $t_sh_file");
      open(TSH, "> $t_sh_file") || die "can not write to $t_sh_file\n";
      print TSH <<EOD;
$t_execution->{"template"}
$pe_parameter

my_host=`hostname`
my_pid=\$\$
my_core=$t_job->{"cores_per_cmd"}
my_queue=$t_job->{"execution"}
my_time_start=`date +%s`;

cd $pwd
cd $t_sample_id
mkdir $t_job_id
if ! [ -f $f_start ]; then date +\%s > $f_start; fi
$t_command
$v_command
date +\%s > $f_complete
#times >> $f_cpu

my_time_end=`date +%s`;
my_time_spent=\$((my_time_end-my_time_start))
echo "sample=$t_sample_id job=$t_job_id host=\$my_host pid=\$my_pid queue=\$my_queue cores=\$my_core time_start=\$my_time_start time_end=\$my_time_end time_spent=\$my_time_spent" >> $f_cpu

EOD
      close(TSH);
      #validate_cmd_line($t_command, $t_sh_file, $t_sample_id);
    }
  } ########## foreach $t_sample_id (@NGS_samples)
} ########## foreach $t_job_id (keys %NGS_batch_jobs)
282
########## Abort when $verify_flag is set (presumably by the validation done
########## earlier in this script -- confirm against the code above).
die if ($verify_flag);

########## One-shot tasks: run the requested task and exit instead of entering
########## the scheduling loop. An empty $this_task falls through to the loop.
if ($this_task) {
    if    ($this_task eq "log-cpu"    ) { task_log_cpu(); }
    elsif ($this_task eq "list-jobs"  ) { task_list_jobs(); }
    elsif ($this_task eq "snapshot"   ) { task_snapshot(); }
    elsif ($this_task eq "delete-jobs") { task_delete_jobs($second_opt); }
    elsif ($this_task eq "write-sh"   ) { } #### nothing more to do for this task
    else                                { die "undefined task $this_task"; }
    exit 0;
}
291
292 ################################################################################################
293 # _____ _ _ _____ _____ _ _ _ _ _
294 # | __ \ | \ | |/ ____|/ ____|| | | | | | (_) | |
295 # | |__) | _ _ __ | \| | | __| (___ | |__ __ _| |_ ___| |__ _ ___ | |__ ___
296 # | _ / | | | '_ \ | . ` | | |_ |\___ \ | '_ \ / _` | __/ __| '_ \ | |/ _ \| '_ \/ __|
297 # | | \ \ |_| | | | | | |\ | |__| |____) || |_) | (_| | || (__| | | | | | (_) | |_) \__ \
298 # |_| \_\__,_|_| |_| |_| \_|\_____|_____/ |_.__/ \__,_|\__\___|_| |_| | |\___/|_.__/|___/
299 # ______ ______ _/ |
300 # |______| |______|__/
301 ########## Run NGS_batch_jobs for each samples http://patorjk.com/software/taag
302 ################################################################################################
303
304
########## Scheduler state. %execution_submitted is recomputed on every pass:
########## check_submitted_job() adds the load of jobs that are still running and
########## the submit sections below add what they start.
my %execution_submitted = (); # number of submitted jobs (qsub) or threads (local sh)
my $sleep_time = $sleep_time_min;

########## Main loop. Each pass:
##########   1. refreshes the status of every job/sample pair,
##########   2. promotes "wait" jobs to "ready" once input files and input jobs are done,
##########   3. submits "ready" jobs per execution type (sh, qsub-pe, qsub),
##########   4. sleeps; the sleep time doubles (up to $sleep_time_max) while nothing is submitted.
########## The loop ends when every selected job/sample pair is "completed".
########## NOTE(review): a pair left in status "error" is never "completed", so the
########## loop keeps polling in that case -- confirm this is intended.
while(1) {
    my $flag_job_done = 1;
    my $cmd; #### output of the most recent external command

    ########## reset execution_submitted to 0
    foreach my $i (keys %NGS_executions) { $execution_submitted{$i} = 0; }

    #### a single "qstat -f -xml" query per pass serves all SGE qsub / qsub-pe jobs
    my $flag_qstat_xml_call = 0;
    foreach my $t_job_id (keys %NGS_batch_jobs) {
        my $t_job = $NGS_batch_jobs{$t_job_id};
        my $t_execution = $NGS_executions{ $t_job->{"execution"} };
        my $exe_type = $t_execution->{type};
        $flag_qstat_xml_call = 1 if (($queue_system eq "SGE") and (($exe_type eq "qsub") or ($exe_type eq "qsub-pe")));
    }
    SGE_qstat_xml_query() if $flag_qstat_xml_call;

    ########## check and update job status for submitted jobs
    foreach my $t_job_id (keys %NGS_batch_jobs) {
        if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
        foreach my $t_sample_id (@NGS_samples) {
            my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
            my $status = $t_sample_job->{'status'};

            next if ($status eq "completed");
            ########## check file system to update job status
            ########## in case this is a restart run
            check_submitted_job($t_job_id, $t_sample_id);
            next if ($t_sample_job->{'status'} eq "completed");
            $flag_job_done = 0;
        }
    }

    if ($flag_job_done) { write_log("job completed!"); last; }

    ########## check and update job status based on dependance
    foreach my $t_job_id (keys %NGS_batch_jobs) {
        if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
        foreach my $t_sample_id (@NGS_samples) {
            my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
            my $status = $t_sample_job->{'status'};

            next unless ($status eq "wait");
            my @t_infiles = @{ $t_sample_job->{'infiles'} };
            my @t_injobs = @{ $t_sample_job->{'injobs'} };
            my $t_ready_flag = 1;

            foreach my $i (@t_infiles) {
                next if (-s $i); #### non-zero size file
                $t_ready_flag = 0;
                last;
            }

            foreach my $i (@t_injobs) {
                next if ( $job_list{$i}{$t_sample_id}->{'status'} eq "completed"); #### injob completed
                $t_ready_flag = 0;
                last;
            }
            if ($t_ready_flag) {
                $t_sample_job->{"status"} = "ready";
                write_log("$t_job_id,$t_sample_id: change status to ready");
            }
        }
    }

    ########## submit local sh jobs
    my $has_submitted_some_jobs = 0;
    foreach my $t_job_id (keys %NGS_batch_jobs) {
        if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
        my $t_job = $NGS_batch_jobs{$t_job_id};
        my $t_execution = $NGS_executions{ $t_job->{"execution"} };
        my $t_execution_id = $t_job->{"execution"};

        next unless ($t_execution->{'type'} eq "sh");
        next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"cores_per_node"} ); #### all cores are used

        foreach my $t_sample_id (@NGS_samples) {
            my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
            my $status = $t_sample_job->{'status'};
            next unless ($status eq "ready");
            next if ( ($execution_submitted{$t_execution_id} + $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"}) > $t_execution->{"cores_per_node"} ); #### no enough available cores
            #### now submitting

            my $t_sh_file = $t_sample_job->{'sh_file'};
            my $t_sh_pid = "$t_sh_file.pids";
            for (my $i=0; $i<$t_job->{"no_parallel"}; $i++) {
                $cmd = `sh $t_sh_file >/dev/null 2>&1 &`;
            }
            #### the (empty) pid file marks the job as submitted for check_submitted_job()
            $cmd = `touch $t_sh_pid`;
            $t_sample_job->{'status'} = "submitted";
            write_log("$t_job_id,$t_sample_id: change status to submitted");
            $execution_submitted{ $t_execution_id } += $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};
            $has_submitted_some_jobs = 1;
        }
    }

    ########## submit qsub-pe jobs, multiple jobs may share same node
    foreach my $t_job_id (keys %NGS_batch_jobs) {
        if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
        my $t_job = $NGS_batch_jobs{$t_job_id};
        my $t_execution = $NGS_executions{ $t_job->{"execution"} };
        my $t_execution_id = $t_job->{"execution"};

        next unless ($t_execution->{'type'} eq "qsub-pe");
        next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"number_nodes"} ); #### resource full

        my $t_cores_per_node = $t_execution->{"cores_per_node"};
        my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
        my $t_cores_per_job = $t_cores_per_cmd * $t_job->{"no_parallel"};
        my $t_nodes_per_job = $t_cores_per_job / $t_cores_per_node;

        foreach my $t_sample_id (@NGS_samples) {
            my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
            my $status = $t_sample_job->{'status'};
            next unless ($status eq "ready");

            my $t_sh_file = $t_sample_job->{'sh_file'};
            my $t_sh_pid = "$t_sh_file.pids";
            open(TID, "> $t_sh_pid") || die "can not write to $t_sh_pid";

            for (my $i=0; $i<$t_job->{"no_parallel"}; $i++) {
                my $t_stderr = "$t_sh_file.$i.stderr";
                my $t_stdout = "$t_sh_file.$i.stdout";
                $cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_file 2>$log_fileq`;
                my $qsub_id = 0;
                if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
                print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
                $execution_submitted{$t_execution_id} += $t_nodes_per_job;
                #### this branch submits the per-sample sh file itself; there is no bundle script here
                write_log("$t_sh_file submitted for sample $t_sample_id, qsubid $cmd");
            }

            close(TID);
            $has_submitted_some_jobs = 1;
            $t_sample_job->{'status'} = "submitted";
        }
    } ########## END foreach $t_job_id (keys %NGS_batch_jobs)

    ########## submit qsub jobs
    foreach my $t_job_id (keys %NGS_batch_jobs) {
        if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
        my $t_job = $NGS_batch_jobs{$t_job_id};
        my $t_execution = $NGS_executions{ $t_job->{"execution"} };
        my $t_execution_id = $t_job->{"execution"};

        next unless ($t_execution->{'type'} eq "qsub");
        next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"number_nodes"} ); #### resource full

        my $t_cores_per_node = $t_execution->{"cores_per_node"};
        my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
        my $t_cores_per_job = $t_cores_per_cmd * $t_job->{"no_parallel"};
        my $t_nodes_per_job = POSIX::ceil($t_cores_per_job / $t_cores_per_node);
        my $t_cmds_per_node = int($t_cores_per_node / $t_cores_per_cmd);
        my $t_jobs_per_node = int($t_cores_per_node / $t_cores_per_job);

        ########## 1. this loop process jobs need 1 or more nodes per sample, ie. bundle within a sample, e.g. blast against refseq
        foreach my $t_sample_id (@NGS_samples) {
            my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
            my $status = $t_sample_job->{'status'};
            next unless ($status eq "ready");
            next unless ($t_jobs_per_node <= 1); #### unless need >= 1 node, including jobs use between (51%-100%) cores per node
            last if ( ($execution_submitted{$t_execution_id} + $t_nodes_per_job) > $t_execution->{"number_nodes"}); #### no enough available queues

            my $t_sh_file = $t_sample_job->{'sh_file'};
            my $t_sh_bundle = "$sh_bundle_dir/$t_job_id.$t_sample_id.$$.sh";
            my $t_stderr = "$t_sh_bundle.stderr";
            my $t_stdout = "$t_sh_bundle.stdout";
            my $t_sh_pid = "$t_sh_file.pids";

            open(TID, "> $t_sh_pid") || die "can not write to $t_sh_pid";
            open(BSH, "> $t_sh_bundle") || die "can not write to $t_sh_bundle";
            print BSH <<EOD;
$t_execution->{"template"}
cd $pwd
EOD
            for (my $i=0; $i<$t_cmds_per_node; $i++) {
                print BSH "sh $t_sh_file &\n";
                print BSH "sleep 3\n";
            }
            print BSH "wait\n";
            close(BSH);

            for (my $i=0; $i<$t_nodes_per_job; $i++) {
                $cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_bundle 2>$log_fileq`;
                my $qsub_id = 0;
                if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
                print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
                $execution_submitted{$t_execution_id}++;
                write_log("$t_sh_bundle submitted for sample $t_sample_id, qsubid $cmd");
            }
            close(TID);
            $has_submitted_some_jobs = 1;
            $t_sample_job->{'status'} = "submitted";
        } ########## END foreach $t_sample_id (@NGS_samples)


        ########## 2. this loop process jobs need less than 1 node per sample, ie. bundle jobs across samples, e.g. qc
        my @t_samples = ();
        my $t_batch_no = 0;

        foreach my $t_sample_id (@NGS_samples) { #### same loop as next, to find out @t_samples and last sample can run
            my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
            my $status = $t_sample_job->{'status'};
            next unless ($status eq "ready");
            next unless ($t_jobs_per_node > 1); #### unless a node can host 2 or more jobs
            last if ( $t_execution->{"number_nodes"} - $execution_submitted{$t_execution_id} <=0);
            push(@t_samples, $t_sample_id);
        }
        my $last_sample_can_run = $t_samples[-1];
        @t_samples = ();

        foreach my $t_sample_id (@NGS_samples) {
            my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
            my $status = $t_sample_job->{'status'};
            next unless ($status eq "ready");
            next unless ($t_jobs_per_node > 1); #### unless a node can host 2 or more jobs
            last if ( $t_execution->{"number_nodes"} - $execution_submitted{$t_execution_id} <=0);
            push(@t_samples, $t_sample_id);

            #### bundle @t_samples to one qsub job
            if ((($#t_samples+1) == $t_jobs_per_node) or ($t_sample_id eq $last_sample_can_run)) {
                my $t_sh_bundle = "$sh_bundle_dir/$t_job_id.samples-$t_batch_no.$$.sh";
                my $t_stderr = "$t_sh_bundle.stderr";
                my $t_stdout = "$t_sh_bundle.stdout";

                open(BSH, "> $t_sh_bundle") || die "can not write to $t_sh_bundle";
                print BSH <<EOD;
$t_execution->{"template"}
cd $pwd
EOD
                foreach my $i (@t_samples) {
                    my $t_sh_file = $job_list{$t_job_id}{$i}->{'sh_file'};
                    for (my $j=0; $j<$t_job->{"no_parallel"}; $j++) {
                        print BSH "sh $t_sh_file &\n";
                        print BSH "sleep 3\n";
                    }
                }
                print BSH "wait\n";
                close(BSH);

                $cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_bundle 2>$log_fileq`;
                my $qsub_id = 0;
                if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}

                #### every sample of the bundle records the same qsub id
                foreach my $i (@t_samples) {
                    my $t_sh_file = $job_list{$t_job_id}{$i}->{'sh_file'};
                    my $t_sh_pid = "$t_sh_file.pids";
                    open(TID, "> $t_sh_pid") || die "can not write to $t_sh_pid";
                    print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
                    write_log("$t_sh_bundle submitted for sample $i, qsubid $cmd");
                    close(TID);
                    $job_list{$t_job_id}{$i}->{'status'} = "submitted";
                }

                $has_submitted_some_jobs = 1;
                $execution_submitted{$t_execution_id}++;
                @t_samples = (); #### clear
                $t_batch_no++;
            }
        } ########## END foreach $t_sample_id (@NGS_samples)
    } ########## END foreach $t_job_id (keys %NGS_batch_jobs)


    #### if has submitted some jobs, reset waiting time, otherwise double waiting time
    print_job_status_summary();
    if ($has_submitted_some_jobs) {
        $sleep_time = $sleep_time_min;
    }
    else {
        $sleep_time = $sleep_time*2;
        $sleep_time = $sleep_time_max if ($sleep_time > $sleep_time_max);
    }
    write_log("sleep $sleep_time seconds");
    sleep($sleep_time);
} ########## END while(1)

task_log_cpu();
################################################################################
########## END Run NGS_batch_jobs for each samples
################################################################################

close(LOG);
593 ##########
594
595
########## Log each message, prefixed with the output of `date`, to both the LOG
########## filehandle and STDERR; one blank line is written to each afterwards.
sub write_log {
    my @messages = @_;
    my $stamp = `date`;
    chop($stamp); #### drop the trailing newline of the `date` output

    for my $message (@messages) {
        my $entry = "$stamp $message\n";
        print LOG $entry;
        print STDERR $entry;
    }
    print LOG "\n";
    print STDERR "\n";
}
########## END write_log
608
########## Run "qstat -f -xml" once and cache the result in the global
########## %qstat_xml_data as  job number => [job name, state].
########## Each <job_list ...> ... </job_list> section contributes one entry, and
########## only when its number, name and state were all found.
sub SGE_qstat_xml_query {
    %qstat_xml_data = (); #### global
    my $xml = `qstat -f -xml`;

    #### dummy entry: keeps the hash non-empty whenever the output has a <queue_info
    #### element (check_submitted_job tests %qstat_xml_data for truth)
    $qstat_xml_data{"NULL"} = ["NULL","NULL"] if ($xml =~ /<queue_info/);

    my @xml_lines = split(/\n/, $xml);
    my $inside_job = 0;
    my ($id, $name, $state);

    foreach my $n (2 .. $#xml_lines) { #### skip first 2 lines
        my $line = $xml_lines[$n];

        unless ($inside_job) {
            next unless ($line =~ /<job_list/);
            $inside_job = 1;
            ($id, $name, $state) = ();
        }

        if ($line =~ /<\/job_list/) { #### end of this job section
            $inside_job = 0;
            if (defined($id) and defined($name) and defined($state)) {
                $qstat_xml_data{$id} = [$name, $state];
            }
            next;
        }

        $id    = $1 if ($line =~ /<JB_job_number>(\d+)/);
        $name  = $1 if ($line =~ /<JB_name>([^<]+)/);
        $state = $1 if ($line =~ /<state>([^<]+)/);
    }

    #### output ended inside a job section: keep what was collected
    if ($inside_job and defined($id) and defined($name) and defined($state)) {
        $qstat_xml_data{$id} = [$name, $state];
    }
}
634
########## check submitted job by checking pids, or qsub ids
########## update job status from wait|ready -> submitted if pid file exists (in case of restart of this script)
########## update job status from wait|ready|submitted -> completed if sh calls or qsub calls finished
##########    these pids or qsub ids are done
########## A finished job whose WF files fail validate_job_files() is marked "error".
########## Jobs that are still running are added to the global %execution_submitted.
########## Arguments: job id, sample id. Returns nothing; dies on an unknown execution type.
sub check_submitted_job {
    my ($t_job_id, $t_sample_id) = @_;
    my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
    my $t_job = $NGS_batch_jobs{$t_job_id};
    my $t_execution = $NGS_executions{ $t_job->{"execution"} };

    my ($i, $j, $ll, $cmd);

    my $t_sh_file = $t_sample_job->{'sh_file'};
    my $t_sh_pid = "$t_sh_file.pids";

    # status won't change unless there is a pid file
    return unless (-e $t_sh_pid);

    my $status = $t_sample_job->{'status'};
    if (($status eq "wait") or ($status eq "ready")) {
        $t_sample_job->{'status'} = "submitted";
        write_log("$t_job_id,$t_sample_id: change status to submitted");
    }

    my $exe_type = $t_execution->{type};

    if ($exe_type eq "sh") {
        #### a local job counts as running while its sh file shows up in the process list
        $cmd = `ps -ef | grep "$t_sh_file" | grep -v grep`;
        if ($cmd =~ /\w/) { # still running
            $execution_submitted{ $t_job->{"execution"} } += $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};
        }
        elsif (validate_job_files($t_job_id, $t_sample_id)) {
            $t_sample_job->{'status'} = "completed";
            write_log("$t_job_id,$t_sample_id: change status to completed");
        }
        else {
            $t_sample_job->{'status'} = "error";
            write_log("$t_job_id,$t_sample_id: change status to error");
        }
        return;
    }
    elsif (($exe_type eq "qsub") or ($exe_type eq "qsub-pe")) {
        my @pids = ();
        open(CHECK, $t_sh_pid) || die "Can not open $t_sh_pid\n";
        while($ll = <CHECK>) {
            #### chomp, not chop: a last line without newline must keep its final digit
            chomp($ll); next unless ($ll =~ /\w/);
            push(@pids, $ll);
        }
        close(CHECK);

        my $finish_flag = 1;
        foreach $i (@pids) {
            if (($queue_system eq "SGE") and %qstat_xml_data) {
                #### cached "qstat -f -xml" data: the id is listed while the job is queued or running
                if (defined($qstat_xml_data{$i})) {
                    $t_sample_job->{'status'} = "running" if (($qstat_xml_data{$i}->[1] eq "r") and ($t_sample_job->{'status'} eq "submitted"));
                    $finish_flag = 0;
                    $execution_submitted{ $t_job->{"execution"} } ++;
                }
            }
            elsif ($queue_system eq "SGE") {
                #### no cached data: ask qstat about this single id
                $cmd = `qstat -j $i | grep job_number`;
                if ($cmd =~ /$i/) {
                    $finish_flag = 0;
                    $execution_submitted{ $t_job->{"execution"} } ++;
                }
            }
            else {
                #### other queue systems: the first number of the matching qstat line must be the id
                $cmd = `qstat -r $i | grep $i`;
                $j = (split(/\D/,$cmd))[0];
                if ($j == $i) { # this job is running
                    $finish_flag = 0;
                    $execution_submitted{ $t_job->{"execution"} } ++;
                }
            }
        }
        if ($finish_flag == 1) {
            if (validate_job_files($t_job_id, $t_sample_id)) {
                $t_sample_job->{'status'} = "completed";
                write_log("$t_job_id,$t_sample_id: change status to completed");
            }
            else {
                $t_sample_job->{'status'} = "error";
                write_log("$t_job_id,$t_sample_id: change status to error");
            }
        }
        return;
    }
    else {
        die "unknown execution type: $exe_type\n";
    }
}
########## END sub check_submitted_job
727
728
# The start, complete and cpu files recorded for a job/sample pair all need to
# exist with non-zero size. Returns 1 when they do, 0 otherwise.
sub validate_job_files {
    my ($t_job_id, $t_sample_id) = @_;
    my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};

    foreach my $file_key ('start_file', 'complete_file', 'cpu_file') {
        return 0 unless (-s $t_sample_job->{$file_key});
    }
    return 1; #### pass
}
########## END validate_job_files
742
743
########## Print one summary line to STDERR: the number of job/sample pairs and,
########## for each status (sorted by name), how many pairs currently have it.
########## Jobs outside %subset_jobs are skipped when $subset_flag is set.
sub print_job_status_summary {
    my %count_of = ();
    my $job_total = 0;

    foreach my $job_id (keys %NGS_batch_jobs) {
        next if ($subset_flag and not $subset_jobs{$job_id});
        foreach my $sample_id (@NGS_samples) {
            my $sample_job = $job_list{$job_id}{$sample_id};
            $count_of{ $sample_job->{'status'} }++;
            $job_total++;
        }
    }

    my $summary = "total jobs: $job_total,";
    $summary .= "$_: $count_of{$_}," foreach (sort keys %count_of);
    print STDERR "$summary\n";
}
########## END print_job_status_summary
767
768
########## Report, on STDERR, paths mentioned in a command text that do not exist.
########## For every line of $t_command:
##########   - the first word is checked when it contains a "/",
##########   - every later word containing a "/" is split on , ; > < | and each piece
##########     is checked; pieces not starting with "/" or "." are looked up under
##########     $t_sample_id/.
########## $t_sh_file is only used in the message.
sub validate_cmd_line {
    my ($t_command, $t_sh_file, $t_sample_id) = @_;

    my @warn_path = ();
    foreach my $cmd_line (split(/\n/, $t_command)) {
        my ($key_cmd, @opts) = split(/\s+/, $cmd_line);
        push(@warn_path, $key_cmd) if (($key_cmd =~ /\//) and (not -e $key_cmd));

        foreach my $opt (grep {/\//} @opts) {
            foreach my $piece (split(/,|;|>|<|\|/, $opt)) {
                my $path = ($piece =~ /^[\/.]/) ? $piece : "$t_sample_id/$piece";
                push(@warn_path, $path) unless (-e $path);
            }
        }
    }

    if (@warn_path) {
        print STDERR "File or program doesn't exist in $t_sh_file: ", join(" ", @warn_path), "\n";
    }

}
########## END validate_cmd_line
796
########## Extend the global %subset_jobs with every job that a selected job
########## depends on ("injobs" in %NGS_batch_jobs), directly or indirectly.
########## Added or re-visited dependencies get the value 1.
sub add_subset_jobs_by_dependency {
    my @pending = keys %subset_jobs; #### jobs whose injobs still have to be looked at

    while (@pending) {
        my $job_id = shift(@pending);
        my $job = $NGS_batch_jobs{$job_id};
        my @in_jobs = @{$job->{"injobs"}};

        foreach my $in_job (@in_jobs) {
            #### only jobs seen for the first time need their own injobs followed
            push(@pending, $in_job) unless (exists $subset_jobs{$in_job});
            $subset_jobs{$in_job} = 1;
        }
    }
}
########## END add_subset_jobs_by_dependency
816
817
########## Assign a "job_level" to every job in the global %NGS_batch_jobs:
########## jobs without injobs get level 1, any other job gets one more than the
########## highest level among its injobs. Passes repeat until nothing changes.
########## A job none of whose injobs ever receives a level keeps an undefined level.
sub task_level_jobs {
    my %level_of = ();

    my $changed = 1;
    while ($changed) {
        $changed = 0;

        foreach my $job_id (keys %NGS_batch_jobs) {
            my $job = $NGS_batch_jobs{$job_id};
            my @in_jobs = @{$job->{"injobs"}};
            my $new_level;

            if (@in_jobs) {
                #### highest level among the injobs that are already levelled
                my $deepest;
                foreach my $in_job (@in_jobs) {
                    next unless defined ($level_of{$in_job});
                    $deepest = $level_of{$in_job} if ($level_of{$in_job} > $deepest);
                }
                next unless (defined($deepest));

                $new_level = $deepest + 1; #### one more level
                next if (defined($level_of{$job_id}) and ($new_level <= $level_of{$job_id}));
            }
            else {
                next if (defined($level_of{$job_id}));
                $new_level = 1;
            }

            $level_of{$job_id} = $new_level;
            $changed = 1;
        }
    }

    foreach my $job_id (sort keys %NGS_batch_jobs) {
        $NGS_batch_jobs{$job_id}->{"job_level"} = $level_of{$job_id};
    }
}
########## END task_level_jobs
862
########## Print a sample x job status table to STDOUT.
########## When $this_task is set, job states are refreshed first through
########## check_submitted_job() (with one SGE qstat query beforehand if needed).
########## Output: a legend, the job ids written vertically (one column per entry of
########## @NGS_batch_jobs), then one row per sample with one status symbol per job.
sub task_snapshot {
    if ($this_task) {
        my $need_qstat_xml = 0;
        foreach my $job_id (keys %NGS_batch_jobs) {
            my $job = $NGS_batch_jobs{$job_id};
            my $execution = $NGS_executions{ $job->{"execution"} };
            my $exe_type = $execution->{type};
            next unless ($queue_system eq "SGE");
            $need_qstat_xml = 1 if (($exe_type eq "qsub") or ($exe_type eq "qsub-pe"));
        }
        SGE_qstat_xml_query() if $need_qstat_xml;

        foreach my $sample_id (@NGS_samples) {
            foreach my $job_id (keys %NGS_batch_jobs) {
                check_submitted_job($job_id, $sample_id);
            }
        }
    }

    #### widest sample id and widest job id, used to lay out the header
    my $max_len_sample = 0;
    foreach my $sample_id (@NGS_samples) {
        my $len = length($sample_id);
        $max_len_sample = $len if ($len > $max_len_sample);
    }
    my $max_len_job = 0;
    foreach my $job_id (@NGS_batch_jobs) {
        my $len = length($job_id);
        $max_len_job = $len if ($len > $max_len_job);
    }

    print join("\n",
        "Job status:",
        ".\twait",
        "-\tsubmitted",
        "r\trunning",
        "+\tcompleted",
        "!\terror",
    ), "\n";

    #### header: job ids written top to bottom and bottom-aligned, so shorter ids
    #### are padded with blanks in the upper rows
    my $sample_pad = ' ' x $max_len_sample;
    for (my $row = $max_len_job - 1; $row >= 0; $row--) {
        my $text = "$sample_pad\t";
        foreach my $job_id (@NGS_batch_jobs) {
            my $reversed = scalar reverse($job_id);
            my $char = ($row < length($job_id)) ? substr($reversed, $row, 1) : " ";
            $text .= " $char";
        }
        print "$text\n";
    }

    #### any status without a symbol of its own (e.g. "ready") is shown as "_"
    my %symbol_of = (
        "completed" => "+",
        "submitted" => "-",
        "running"   => "r",
        "wait"      => ".",
        "error"     => "!",
    );
    foreach my $sample_id (@NGS_samples) {
        my $text = "$sample_id\t";
        foreach my $job_id (@NGS_batch_jobs) {
            my $sample_job = $job_list{$job_id}{$sample_id};
            my $status = $sample_job->{'status'};
            $text .= " " . (exists($symbol_of{$status}) ? $symbol_of{$status} : "_");
        }
        print "$text\n";
    }
}
########## END task_snapshot
926
########## Print one line per entry of @NGS_batch_jobs to STDOUT:
########## job id, its input jobs and its job level, TAB separated.
sub task_list_jobs {
    foreach my $job_id (@NGS_batch_jobs) {
        my $job = $NGS_batch_jobs{$job_id};
        my @in_jobs = @{$job->{"injobs"}};
        my $in_job_list = join(",", @in_jobs);

        print "$job_id\tIn_jobs:[$in_job_list]\tJob_level:$job->{'job_level'}\n";
    }
}
########## END task_list_jobs
939
######## Return 1 when $file1 was modified strictly later than $file2 (mtime),
######## otherwise 0. Also returns 0 when either file does not exist.
sub file1_after_file2 {
    my ($file1, $file2) = @_;

    return 0 unless ((-e $file1) and (-e $file2));

    my ($mtime1, $mtime2) = map { (stat($_))[9] } ($file1, $file2);
    return 1 if ($mtime1 > $mtime2);
    return 0;
}
######## END file1_after_file2
953
######## Return 1 when $file1 was modified at the same time as or later than
######## $file2 (mtime), otherwise 0. Also returns 0 when either file does not exist.
sub file1_same_or_after_file2 {
    my ($file1, $file2) = @_;

    return 0 unless ((-e $file1) and (-e $file2));

    my ($mtime1, $mtime2) = map { (stat($_))[9] } ($file1, $file2);
    return 1 if ($mtime1 >= $mtime2);
    return 0;
}
######## END file1_same_or_after_file2
967
968
########## Write a shell script (NGS-<pid>.sh in the current directory) holding the
########## commands that remove selected jobs; nothing is deleted by this sub itself.
########## $opt is "mode:value":
##########   jobids:id1,id2      the listed jobs
##########   run_after:filename  submitted jobs whose .pids file is as new as or newer
##########                       than filename
########## For each sample the selection is extended with every submitted job that
########## depends, directly or indirectly, on a selected job.
########## Dies on an unknown mode, a missing run_after file, or an unwritable script.
sub task_delete_jobs {
    my $opt = shift;
    #### limit 2: only the first ":" separates mode and value, so a file name may contain ":"
    my ($mode, $c) = split(/:/, $opt, 2);
    my $tmp_sh = "NGS-$$.sh";

    open(TMPSH, "> $tmp_sh") || die "can not write to file $tmp_sh";
    print TMPSH "#Please execute the following commands\n";
    foreach my $t_sample_id (@NGS_samples) {
        my %job_to_delete_ids = ();
        if ($mode eq "jobids") {
            %job_to_delete_ids = map {$_, 1} split(/,/,$c);
        }
        elsif ($mode eq "run_after") {
            die "file $c doesn't exist!" unless (-e $c);
            foreach my $t_job_id (keys %NGS_batch_jobs) {
                my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
                my $t_sh_file = $t_sample_job->{'sh_file'};
                my $t_sh_pid = "$t_sh_file.pids";
                next unless (-e $t_sh_pid); #### unless the job is submitted
                #$job_to_delete_ids{$t_job_id} = 1 if (file1_same_or_after_file2( $t_sample_job->{'start_file'} , $c));
                $job_to_delete_ids{$t_job_id} = 1 if (file1_same_or_after_file2( $t_sh_pid , $c));

            }
        }
        else {
            die "unknown option for deleting jobs: $opt";
        }

        # now %job_to_delete_ids are jobs need to be deleted
        # next find all jobs that depends on them, recrusively
        my $no_jobs_to_delete = scalar keys %job_to_delete_ids;
        while(1) {
            foreach my $t_job_id (keys %NGS_batch_jobs) {
                my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
                my $t_sh_file = $t_sample_job->{'sh_file'};
                my $t_sh_pid = "$t_sh_file.pids";
                next unless (-e $t_sh_pid); #### unless the job is submitted
                my @t_injobs = @{ $t_sample_job->{'injobs'} };
                foreach my $t_job_id_2 (@t_injobs) {
                    $job_to_delete_ids{$t_job_id} = 1 if ($job_to_delete_ids{$t_job_id_2});
                }
            }
            last if ($no_jobs_to_delete == (scalar keys %job_to_delete_ids)); #### no more depending jobs
            $no_jobs_to_delete = scalar keys %job_to_delete_ids;
        }

        if ($no_jobs_to_delete) {
            print TMPSH "#jobs to be deleted for $t_sample_id: ", join(",", keys %job_to_delete_ids), "\n";
            print "#jobs to be deleted for $t_sample_id: ", join(",", keys %job_to_delete_ids), "\n";
            foreach my $t_job_id (keys %job_to_delete_ids) {
                my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
                my $t_sh_file = $t_sample_job->{'sh_file'};
                my $t_sh_pid = "$t_sh_file.pids";
                print TMPSH "\\rm -rf $pwd/$t_sample_id/$t_job_id\n";
                print TMPSH "\\rm $t_sh_pid\n";
                print TMPSH "\\rm $t_sh_file.*.std*\n";

                #### find the qsub ids to be deleted; the pid file may be missing (job
                #### never submitted) or empty (local sh jobs only touch it), in which
                #### case no qdel line is written
                my $qids = "";
                if (open(my $pid_fh, "<", $t_sh_pid)) {
                    local $/;
                    my $content = <$pid_fh>;
                    close($pid_fh);
                    $qids = $content if (defined($content));
                }
                $qids =~ s/\s+/ /g;
                print TMPSH "qdel $qids\n" if ($qids =~ /\S/);
            }
        }
    }
    close(TMPSH);
    print "The script is not delete the file, please run $tmp_sh to delete files!!!\n\n";
}
########## END task_delete_jobs
1037
########## For every sample write $pwd/<sample>/WF.cpu, a TAB separated table with
########## one row per job (cores, wall seconds, CPU seconds, both also as h/m/s
########## text) and a final "total" row.
##########   - Wall time of a job is complete stamp minus start stamp; a job whose
##########     start or complete file is missing or not numeric gets 0 and does not
##########     influence the total.
##########   - Total wall time is latest complete stamp minus earliest start stamp.
##########   - CPU time is summed from lines of the job's cpu file that start with
##########     "<minutes>m<seconds>", the format printed by the shell `times` builtin.
##########     NOTE(review): the sh template in this script has `times` commented out
##########     and appends "key=value" lines instead, which this pattern does not
##########     match -- confirm whether those lines should be parsed here.
########## Jobs outside %subset_jobs are skipped when $subset_flag is set.
sub task_log_cpu {
    #### read an epoch-seconds stamp; undef when the file is missing, empty or not a number
    my $read_stamp = sub {
        my ($file) = @_;
        return unless (defined($file) and (-s $file));
        open(my $fh, "<", $file) or return;
        my $text = do { local $/; <$fh> };
        close($fh);
        $text =~ s/\s//g;
        return ($text =~ /^\d+$/) ? $text : undef;
    };

    foreach my $t_sample_id (@NGS_samples) {
        my $f_cpu = "$pwd/$t_sample_id/WF.cpu";
        open(CPUOUT, "> $f_cpu") || die "Can not open $f_cpu";
        print CPUOUT "#job_name\tCores\tWall(s)\tWall_time\tCPU(s)\tCPU_time\n";
        my $min_start; #### earliest start stamp seen for this sample
        my $max_end;   #### latest complete stamp seen for this sample
        my $sum_cpu = 0;
        foreach my $t_job_id (keys %NGS_batch_jobs) {
            if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
            my $t_job = $NGS_batch_jobs{$t_job_id};
            my $t_core = $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};

            my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
            my $f_job_cpu = $t_sample_job->{'cpu_file'};
            my $t_start = $read_stamp->($t_sample_job->{'start_file'});
            my $t_end = $read_stamp->($t_sample_job->{'complete_file'});

            if (defined($t_start)) {
                $min_start = $t_start if ((not defined($min_start)) or ($t_start < $min_start));
            }
            if (defined($t_end)) {
                $max_end = $t_end if ((not defined($max_end)) or ($t_end > $max_end));
            }

            my $t_wall = 0;
            $t_wall = int($t_end - $t_start) if (defined($t_start) and defined($t_end));
            $t_wall = 0 unless ($t_wall>0);

            my $t_cpu = 0;
            if (open(TCPU, $f_job_cpu)) {
                while(my $ll = <TCPU>) {
                    chomp($ll);
                    if ($ll =~ /^(\d+)m(\d+)/) {
                        $t_cpu += $1 * 60 + $2; #### minutes plus whole seconds
                    }
                }
                close(TCPU);
            }
            $sum_cpu += $t_cpu;

            my $t_walls = time_str1($t_wall);
            my $t_cpus = time_str1($t_cpu);
            print CPUOUT "$t_job_id\t$t_core\t$t_wall\t$t_walls\t$t_cpu\t$t_cpus\n";
        }
        my $t_wall = 0;
        $t_wall = ($max_end - $min_start) if (defined($min_start) and defined($max_end));
        $t_wall = 0 unless ($t_wall>0);
        my $t_walls = time_str1($t_wall);
        my $sum_cpus= time_str1($sum_cpu);
        print CPUOUT "total\t-\t$t_wall\t$t_walls\t$sum_cpu\t$sum_cpus\n";
        close(CPUOUT);
    }
}
######### END task_log_cpu
1096
########## Format a number of seconds as "<hours>h<minutes>m<seconds>s",
########## e.g. 3661 -> "1h1m1s". Hours are not capped at 24.
sub time_str1 {
    my $total = shift;

    my $hours     = int($total / 3600);
    my $remainder = $total % 3600;
    my $minutes   = int($remainder / 60);
    my $seconds   = $remainder % 60;

    return join("", $hours, "h", $minutes, "m", $seconds, "s");
}
########## END time_str1;
1108
1109
1110
1111
1112
1113
########## Return the command-line help text as one string ($0 is filled in with the
########## script name); nothing is printed here.
sub usage {
    return <<EOD;

# =============================== NG-Omics-WF ==================================
# _ _ _____ ____ _ __ ________
# | \\ | |/ ____| / __ \\ (_) \\ \\ / / ____|
# | \\| | | __ ______| | | |_ __ ___ _ ___ ___ _____\\ \\ /\\ / /| |__
# | . ` | | |_ |______| | | | '_ ` _ \\| |/ __/ __|______\\ \\/ \\/ / | __|
# | |\\ | |__| | | |__| | | | | | | | (__\\__ \\ \\ /\\ / | |
# |_| \\_|\\_____| \\____/|_| |_| |_|_|\\___|___/ \\/ \\/ |_|
#
# =========================== Next Generation Omics data workflow tools ========

To run workflow:
$0 -s sample_file -i workflow_file

Options:

-i workflow configuration file, required

-s sample data file, required unless -S is present
File format example
#Sample data file example, TAB or space delimited for following lines
Sample_ID1 sample_data_0 sample_data_1
Sample_ID2 sample_data_0 sample_data_1
Sample_ID3 sample_data_0 sample_data_1

-S sample data from command line, required unless -s is present
format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1

-j run sub sets of jobs, optional, the workflow will run all jobs by default
e.g. -j qc or -j qc,fastqc

-t parameter file, optional, replace default parameters in workflow configuration file
File format example
#parameter file example, TAB or space delimited for following lines
CMDOPT JobID_A:opt0:opt1:opt2
CMDOPT JobID_B:opt0:opt1

-T parameter from command line
format: JobID_A:opt0:opt1:opt2,JobID_B:opt0:opt1

-r root directory of NGS-tools

-J optional tasks
write-sh: write sh files and quit
log-cpu: gathering cpu time for each run for each sample
list-jobs: list jobs
snapshot: snapshot current job status
delete-jobs: delete jobs, must supply jobs delete syntax by option -Z
e.g. -J delete-jobs -Z jobids:assembly,blast ---delete assembly,blast and all jobs that depend on them
-J delete-jobs -Z run_after:filename ---delete submitted jobs whose .pids file is as new as or newer than this file, and all depending jobs

-Z secondary parameter used by other options, such as -J

-Q queue system, default SGE
can be PBS, SGE

Question and comments:
http://weizhongli-lab.org/ngomicswf liwz\@sdsc.edu

EOD
}
1177
1178
1179
1180 ############################################################################################
1181 # _______ ________ _________ ___________________ ________ .____ _________
1182 # \ \ / _____/ / _____/ \__ ___/\_____ \ \_____ \ | | / _____/
1183 # / | \/ \ ___ \_____ \ ______ | | / | \ / | \| | \_____ \
1184 #/ | \ \_\ \/ \ /_____/ | | / | \/ | \ |___ / \
1185 #\____|__ /\______ /_______ / |____| \_______ /\_______ /_______ \/_______ /
1186 # \/ \/ \/ \/ \/ \/ \/
1187 ############################################################################################
1188
0 CD-HIT usecases: CD-HIT-OTU-MiSeq (http://cd-hit.org)
1
2 Please also check https://github.com/weizhongli/cdhit/wiki,
3 which offers most up-to-date documents.
4
5
6 ================================================================================================
7 Introduction of CD-HIT-OTU-MiSeq
8 ================================================================================================
9 This use case is developed for clustering 16S rDNA sequences sequenced with MiSeq
10 platform into OTUs for microbiome studies.
11 In recent years, Illumina MiSeq sequencers became dominant in 16S rDNA sequencing. The
12 Paired End (PE) reads need to be assembled first. However, many reads cannot be accurately
13 assembled because of the poor quality at the 3’ ends of both PE reads in the overlapping region.
14 As a result, many sequences are discarded in the analysis. CD-HIT-OTU-MiSeq has unique
15 features to cluster MiSeq 16S sequences.
16
17 * The package can cluster PE reads without joining them into contigs.
18 * Users can choose a high quality portion of the PE reads for analysis
19 (e.g. first 200 / 150 bases from forward / reverse reads), according to base quality profile.
20 * We implemented a tool that can splice out the target region (e.g. V3-V4) from a full-length
21 16S reference database into the PE sequences. CD-HIT-OTU-MiSeq can cluster the spliced PE
22 reference database together with samples, so we can derive Operational Taxonomic Units (OTUs)
23 and annotate these OTUs concurrently.
24 * Chimeric sequences are effectively identified through a de novo approach.
25
26 The most important unique feature of CD-HIT-OTU-MiSeq is to only use high quality region at
27 the 5’ ends of R1 and R2 reads. For example, the effective clustering read length can be 200 bases
28 for R1 and 150 bases for R2. The effective portions of PE reads are clustered together with
29 spliced PE sequences from the reference database to derive OTUs (Figure).
30
31
32 ================================================================================================
33 Installation
34 ================================================================================================
35 1. Install CD-HIT package
36 * download current CD-HIT at https://github.com/weizhongli/cdhit/releases,
37 for example cd-hit-v4.6.2-2015-0511.tar.gz
38 * unpack the file with “tar xvf cd-hit-v4.6.2-2015-0511.tar.gz --gunzip”
39 * change dir by “cd cd-hit-v4.6.2-2015-0511”
40 * compile the programs by “make” with multi-threading (default),
41 or by “make openmp=no” without multi-threading (on old systems without OpenMP)
42 * cd cd-hit-auxtools
43 * compile cd-hit-auxtools by “make”
44 * CD-HIT-OTU-MiSeq scripts are inside a folder like cd-hit-v4.6.2-2015-0511/usecases/Miseq-16S
45
46
47 2. Install Trimmomatic
48 CD-HIT-OTU-MiSeq uses Trimmomatic for sequence quality control. It can be downloaded from
49 http://www.usadellab.org/cms/?page=trimmomatic or https://github.com/timflutre/trimmomatic.
50 We also have a copy at http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
51
52
53 3. Modify NG-Omics-Miseq-16S.pl
54 Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
55 $CD_HIT_dir = "PATH_to_cd-hit";
56 $NGS_prog_trimmomatic = "PATH_to_trimmomatic/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
57
58 4. Download reference dataset
59 Reference database can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
60 The reference database Greengene-13-5-99.fasta.gz was re-formatted from original Greengene database,
61 so that sequences with more specific annotations are at the beginning of the file. Please gunzip after
62 download.
63
64 You can also download Greengene directly. You should download Greengene from
65 http://greengenes.secondgenome.com/downloads, or ftp://greengenes.microbio.me/.
66 Please download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file.
67 You may find gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta.
68
69 There is a script: usecases/Miseq-16S/greengene-ann1.pl, please run this script to re-format greengene:
70 PATH_to_cd-hit/usecases/Miseq-16S/greengene-ann1.pl -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o Greengene-13-5-99.fasta
71
72 5. Download sample datasets
73 Sample datasets can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
74 The Miseq-otu-example.tar.gz contains two Miseq 16S samples. You can download and unpack to test.
75
76
77 ================================================================================================
78 Usage of CD-HIT-OTU-MiSeq
79 ================================================================================================
80 1. Prepare fastq files and sample file
81 Most projects have multiple samples sequenced at the same variable regions.
82 After your samples are sequenced, your sequencing center should give you two paired-end fastq files
83 for each sample. Put them in a working directory in a similar way as the testing datasets,
84 where the R1.fq and R2.fq are placed in a folder for each sample. The folder name is the sample name.
85 So in the working directory, you should have files:
86
87 sample_name_1/R1.fq
88 sample_name_1/R2.fq
89 sample_name_2/R1.fq
90 sample_name_2/R2.fq
91 ...
92 sample_name_N/R1.fq
93 sample_name_N/R2.fq
94
95
96 2. Prepare sample file
97 Next is to prepare a SAMPLE_file, a text file, in the working directory. The file should look like:
98
99 sample_name_1 R1.fq R2.fq
100 sample_name_2 R1.fq R2.fq
101 ...
102 sample_name_N R1.fq R2.fq
103
104
105 3. Prepare reference database
106 We implemented a tool that can splice out the target amplicon region (e.g. V3-V4) from a
107 full-length 16S rRNA reference sequence database, such as Greengene, RDP and Silva,
108 into PE sequences. If there are multiple samples in a project sequenced with the same
109 amplicon of same variable region, only one spliced reference database is needed.
110 Please run:
111
112 Path_to_cd-hit_dir/usecases/Miseq-16S/16S-ref-db-PE-splice.pl -i sample_name_1/R1.fq -j sample_name_2/R2.fq -d Greengene-13-5-99.fasta -o gg_13_5-PE99.150-100 -p 150 -q 100 -c 0.99
113
114 Where Greengene-13-5-99.fasta is our re-formatted Greengene sequence file.
115 -p 150 specify the effective clustering read length for R1 to be 150
116 -q 100 specify the effective clustering read length for R2 to be 100
117 -p and -q options need to be consistent with the parameters used for OTU clustering in step 4
118 see the next section for suggestions on choosing the effective clustering read length
119
120 This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE99.150-100-R2.
121
122
123 4. Run sequence QC and OTU clustering for each sample
124 In the working directory, run
125
126 PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
127
128 where: 150 and 100 are the effective length,
129 see the next section for suggestions on choosing the effective clustering read length
130 0.97 is the OTU clustering cutoff,
131 0.0001 is the abundance cutoff,
132 75 is the length for chimeric checking at each R1 and R2 read
133 PATH_to-gg_13_5-PE99.150-100-R1 and PATH_to-gg_13_5-PE99.150-100-R2 need to be full path
134 e.g. /home/user/myproj/PATH_to-gg_13_5-PE99.150-100-R1
135
136 This command will generate shell scripts for QC and for OTU for each sample.
137 The scripts will be in WF-sh folder. You can first run all the qc.sample_name.sh and after all
138 these jobs finished you then run all otu.sample_name.sh
139
140 NG-Omics-WF.pl https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
141 tool developed in our group. It is not fully released yet, since we need more time to document
142 this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples.
143 First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36 to match the
144 number of CPU cores of your computer, then run
145
146 nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
147
148 After the job has finished, the OTU results will be in the sample_name/otu folder. Important files include:
149 OTU.clstr: file lists all clusters and sequences
150 chimeric-small-clusters-list.txt: list of chimeric reads and low abundance reads not used
151
152
153 5. Pool all samples together
154 If you have multiple samples, you don't just want to stop here. It is important
155 to pool all samples together and re-run OTU clustering so that all samples can be
156 compared, run
157
158 PATH_to_cd-hit-dir/usecases/pool_samples.pl -s SAMPLE_file -o pooled
159
160 This will pool sequences from all samples. We can handle a hundred or more samples without problems.
161
162
163 6. Cluster pooled samples, run
164
165 PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
166
167 This command will generate a script WF-sh/otu-pooled.pooled.sh, you can
168 run this sh script. When it is finished, OTUs will be in the pooled directory:
169 OTU.clstr: file listing all clusters and sequences from all samples in CD-HIT format
170 OTU.txt: spreadsheet listing the number of sequences in each OTU for each sample; it also shows the annotation for each OTU.
171 chimeric-small-clusters-list.txt: list of chimeric reads and low abundance reads not used
172
173
174 ================================================================================================
175 Choose effective clustering read length
176 ================================================================================================
177 The key of this method is to use the high quality portion of reads from both R1 and R2, so how
178 to choose effective clustering read length depends on the actual quality of the PE reads. In our
179 paper five pairs of effective clustering read lengths (225, 175), (200, 150), (175, 125),
180 (150, 100) and (125, 75) were selected for samples sequenced at V34 or V45.
181 Two pairs of effective clustering read lengths (150, 100) and (125, 75) were used for
182 samples of V4 region. All these settings gave good results.
183
184 You can try some different settings and compare the results. Also, programs such as FASTQC
185 (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) can be used to scan the raw reads
186 to help choose the effective clustering read length of R1 and R2.
187
188
189
190
191 ================================================================================================
192 Other topics
193 ================================================================================================
194
195 Questions, comments to the author Weizhong Li, liwz@sdsc.edu
196
197
198
199
0 #!/usr/bin/perl
1
# Driver script for a staged OTU clustering pipeline.
#
# Flow, as visible in the code below:
#   stage 0     cd-hit-dup on the input(s); cluster files are merged and sorted
#   stage 1     clustering of stage-0 representatives at 0.9925
#   stage 2     clustering of stage-1 representatives at 0.985
#   stage pre-3 split representatives into "big"/"small" sets using $cutoff and
#               drop ids listed in the cd-hit-dup second cluster file ($clstr2)
#   stage 3     clustering at $otu_cutoff, producing $dir/OTU.clstr and the OTU table
# Every external command goes through nice_run(); summary counts are written to
# $dir/OTU.log unless the -Z (debug) option is set.
2 use Getopt::Std;
3 my $script_name = $0;
# Derive the directory holding this script from $0: strip the file name, then
# the trailing "/" (chop), and fall back to "./" when $0 had no directory part.
4 my $script_dir = $0;
5 $script_dir =~ s/[^\/]+$//;
6 chop($script_dir);
7 $script_dir = "./" unless ($script_dir);
8
9 getopts("i:j:o:p:c:s:t:m:e:Z:a:f:d:R:",\%opts);
# -i (input) and -o (output dir) are the only mandatory options.
10 die usage() unless ($opts{i} and $opts{o});
11
12 my $input = $opts{i};
# NOTE(review): $input2 stays undefined when -j is not given, yet it is always
# interpolated after "-i2" in the stage-0 cd-hit-dup command -- confirm intended.
13 my $input2 = $opts{j};
14 my $dir = $opts{o};
15 my $abs_cutoff = $opts{a}; $abs_cutoff = 0.00005 unless ($abs_cutoff); #5e-5
16 my $otu_cutoff = $opts{c}; $otu_cutoff = 0.97 unless ($otu_cutoff);
17 my $chimera_f = $opts{m}; $chimera_f = "true" unless ($chimera_f);
# When set, nice_run() only prints commands and the LOG statistics are skipped.
18 my $debug_mode = $opts{Z};
19 my $fast_mode = $opts{f}; #### use cd-hit-dup for stage 1 and 2 clustering
# Extra text appended verbatim to the cd-hit-est command lines below.
20 my $cdhit_opt = $opts{d};
# Restart level: each stage below runs only if $restart_n <= its stage number.
21 my $restart_n = $opts{R}; $restart_n = 0 unless (defined($restart_n));
22 my $LOGf = "$dir/OTU.log";
# NOTE(review): both binaries are checked here, but only $cd_hit_dup is used
# (stage 0). Later stages call "$script_dir/cd-hit-auxtools/cd-hit-dup" and
# "$script_dir/cd-hit/cd-hit-est" instead -- confirm which layout is intended.
23 my $cd_hit_dup = "$script_dir/../../cd-hit-auxtools/cd-hit-dup"; die "no $cd_hit_dup" unless (-e $cd_hit_dup);
24 my $cd_hit_est = "$script_dir/../../cd-hit-est"; die "no $cd_hit_est" unless (-e $cd_hit_est);
25
26 my ($i, $j, $k, $str, $cmd, $ll);
27 $cmd = `mkdir -p $dir`;
28 open(LOG, "> $LOGf") || die "can not write to $LOGf";
# Common path prefix for all intermediate files of this run.
29 my $f2 = "$dir/seq";
30
31 ################################################################################
32 #### Stage 0 ----------- clustering at 100% - stage 0
33 ################################################################################
34 my $clstr = "$f2.dup.clstr";
35 my $clstr2 = "$f2.dup2.clstr";
36 if ($restart_n <= 0) {
37 nice_run("$cd_hit_dup -i $input -i2 $input2 -o $f2.dup -o2 $f2.dup.2 -u 100 -d 0 -m false -f $chimera_f > $f2.dup2.log");
38 nice_run("cat $f2.dup.clstr $f2.dup2.clstr > $f2-stage0.clstr.tmp");
39 nice_run("$script_dir/cd-hit/clstr_sort_by.pl < $f2-stage0.clstr.tmp > $f2-stage0.clstr; rm -f $f2-stage0.clstr.tmp");
40 nice_run("$script_dir/clstr_sort_rep.pl $f2-stage0.clstr $input > $f2-stage0-rep.fa");
# The commented-out commands below are the author's notes on alternative
# cd-hit-dup / cd-hit-est invocations; none of them is executed.
41 #
42 # /home/oasis/data/etc/git/cdhit/cd-hit-auxtools/cd-hit-dup -i qc/R1.fa -i2 qc/R2.fa -o otu/seq.dup -o2 otu/seq.dup.2 -u 100 -d 0 -f true > otu/seq.dup.log # no work
43 # /home/oasis/data/etc/git/cdhit/cd-hit-auxtools/cd-hit-dup -i qc/R1.fa -i2 qc/R2.fa -o otu/seq.dup -o2 otu/seq.dup.2 -u 100 -d 0 > otu/seq.dup.log
44 #
45 # what if cd-hit-est
46 # /home/oasis/data/etc/git/cdhit/cd-hit-est -i qc/R1.fa -j qc/R2.fa -o otu/seq.nr -op otu/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 -cx 100 -cy 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.log
47 # /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr -o otu/seq.nr.R1 -r 0 -cx 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.R1.log
48 # /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr.2 -o otu/seq.nr.R2 -r 0 -cx 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.R2.log
49
50 # /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr -j otu/seq.nr.2 -o otu/seq.99 -op otu/seq.99.2 -P 1 -r 0 -cx 100 -cy 100 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.99.log
51 # /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.99 -j otu/seq.99.2 -o otu/seq.97 -op otu/seq.97.2 -P 1 -r 0 -cx 100 -cy 100 -c 0.97 -n 10 -G 1 -b 5 -T 1 -M 8000 -d 0 -p 1 > otu/seq.97.log
52 # do not sort 99.clstr, always trust cd-hit-dup ordered sequences
53 # /home/oasis/data/etc/git/cdhit/clstr_rev.pl otu/seq.nr.clstr otu/seq.99.clstr | /home/oasis/data/etc/git/cdhit/clstr_sort_by.pl > otu/seq.99-full.clstr
54 # /home/oasis/data/etc/git/cdhit/clstr_rev.pl otu/seq.99-full.clstr otu/seq.97.clstr | /home/oasis/data/etc/git/cdhit/clstr_sort_by.pl > otu/seq.97-full.clstr
55 #
56 # combine ref
57 # /home/oasis/data/etc/git/cdhit/cd-hit-est -i seq.99.wref.R1 -o seq.97.wref.R1only -r 0 -cx 100 -c 0.97 -n 10 -b 5 -T 1 -M 8000 -d 1 -p 1 -G 0 -A 50 -g 1
58 #
59 }
# Stage-0 statistics: sequences in the input and clusters in both cd-hit-dup
# cluster files.
# NOTE(review): $no_clstr is declared with "my" inside this block, so the later
# assignments to $no_clstr / $no_seq outside it go to package variables; this
# only works because the script does not "use strict".
60 if (not $debug_mode) {
61 my $no1 = count_seqs_from_fasta_file($input);
62 my $no_clstr = count_clstrs_from_clstr_file($clstr);
63 my $no_clstr2 = count_clstrs_from_clstr_file($clstr2);
64 print LOG "Number_contigs\t$no1\n";
65 print LOG "Number_unique_contigs\t$no_clstr\n";
66 print LOG "Number_unique_chimaric_contigs\t$no_clstr2\n";
67 }
68
69 ################################################################################
70 #### Stage 1 ---------- clustering at 99.25% #### distance 0.75%
71 ################################################################################
# $seq_n is the number of ">" header lines in the input; $cutoff turns the
# relative abundance cutoff into an absolute cluster size (used in stage pre-3).
72 my $seq_n = `grep -c "^>" $input`; $seq_n =~ s/\D//g;
73 my $cutoff = int($seq_n * $abs_cutoff);
74 my $c1 = 0.9925;
75 if ($restart_n <= 1) {
76 if ($fast_mode) {
77 nice_run("$script_dir/cd-hit-auxtools/cd-hit-dup -i $f2-stage0-rep.fa -o $f2-stage1 -d 0 -m false -e 3 > $f2-stage1.log");
78 }
79 else {
80 nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage0-rep.fa -o $f2-stage1 -c $c1 -n 10 -l 11 -p 1 -d 0 -g 1 -b 3 $cdhit_opt > $f2-stage1.log");
81 }
# Combine the stage-0 and stage-1 cluster files into $f2-stage1-all.clstr and
# extract its representatives for the next stage.
82 nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage0.clstr $f2-stage1.clstr | $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage1-all.clstr");
83 nice_run("$script_dir/clstr_sort_rep.pl $f2-stage1-all.clstr $f2-stage1 > $f2-stage1-rep.fa");
84 }
85 if (not $debug_mode) {
86 $no_clstr = count_clstrs_from_clstr_file("$f2-stage1.clstr");
87 print LOG "Stage1 clustering at $c1\n";
88 print LOG "Number_clusters_stage1\t$no_clstr\n";
89 }
90
91 ################################################################################
92 #### Stage 2 ---------- clustering at 98.50% #### distance 1.50%
93 ################################################################################
94 $c1 = 0.985;
95 if ($restart_n <= 2) {
96 if ($fast_mode) {
97 nice_run("$script_dir/cd-hit-auxtools/cd-hit-dup -i $f2-stage1-rep.fa -o $f2-stage2 -d 0 -m false -e 6 > $f2-stage2.log");
98 }
99 else {
100 nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage1-rep.fa -o $f2-stage2 -c $c1 -n 10 -l 11 -p 1 -d 0 -g 1 -b 3 $cdhit_opt > $f2-stage2.log");
101 }
102 nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage1-all.clstr $f2-stage2.clstr | $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage2-all.clstr");
103 nice_run("$script_dir/clstr_sort_rep.pl $f2-stage2-all.clstr $f2-stage2 > $f2-stage2-rep.fa");
104 }
105 if (not $debug_mode) {
106 $no_clstr = count_clstrs_from_clstr_file("$f2-stage2.clstr");
107 print LOG "Stage2 clustering at $c1\n";
108 print LOG "Number_clusters_stage2\t$no_clstr\n";
109 }
110
111
112 ################################################################################
113 #### Stage pre-3 ---------- filtering
114 ################################################################################
115
# Split the stage-2 representatives into "-big" (ids selected by
# clstr_select_rep.pl with the size range $cutoff..999999999) and "-small"
# (everything else). Presumably the range is inclusive -- verify in
# clstr_select_rep.pl.
116 if ($restart_n <= 3) {
117 nice_run("$script_dir/clstr_select_rep.pl size $cutoff 999999999 < $f2-stage2-all.clstr > $f2-stage2-rep-big.ids");
118 nice_run("$script_dir/fetch_fasta_by_ids.pl $f2-stage2-rep-big.ids $f2-stage2-rep.fa > $f2-stage2-rep-big.fa");
119 nice_run("$script_dir/fetch_fasta_exclude_ids.pl $f2-stage2-rep-big.ids $f2-stage2-rep.fa > $f2-stage2-rep-small.fa");
120
# If cd-hit-dup produced a non-empty second cluster file, remove the ids listed
# there from both sets; otherwise just rename the files to their "-good" names.
121 if (-s $clstr2) {
122 nice_run("$script_dir/clstr_select_rep.pl size 1 999999999 < $clstr2 > $dir/chimaric.ids"); ## save chimaric ids
123 nice_run("$script_dir/fetch_fasta_exclude_ids.pl $dir/chimaric.ids $f2-stage2-rep-big.fa > $f2-stage2-rep-big-good.fa"); ## exclude chimaric reads from $t1-pri-rep.fa
124 nice_run("rm -f $f2-stage2-rep-big.fa");
125
126 nice_run("$script_dir/fetch_fasta_exclude_ids.pl $dir/chimaric.ids $f2-stage2-rep-small.fa > $f2-stage2-rep-small-good.fa");
127 nice_run("rm -f $f2-stage2-rep-small.fa");
128 }
129 else {
130 nice_run("mv $f2-stage2-rep-big.fa $f2-stage2-rep-big-good.fa");
131 nice_run("mv $f2-stage2-rep-small.fa $f2-stage2-rep-small-good.fa");
132 }
133 }
134
135 if (not $debug_mode) {
136 print LOG "Min_clstr_size\t$cutoff\n";
137 my $no_seq = count_seqs_from_fasta_file("$f2-stage2-rep-big-good.fa");
138 print LOG "Number_clstrs_above_min_size\t$no_seq\n";
139 }
140
141 ################################################################################
142 #### Stage 3 ---------- clustering at 97%
143 ################################################################################
# The actual threshold is the user-supplied -c value (default 0.97).
144 $c1 = $otu_cutoff;
145 if ($restart_n <= 3) {
146 nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage2-rep-big-good.fa -o $f2-stage3 -c $c1 -n 8 -l 11 -p 1 -d 0 -g 1 -b 5 $cdhit_opt > $f2-stage3.log");
147 nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage2-all.clstr $f2-stage3.clstr | $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage3-all.clstr");
148 nice_run("$script_dir/clstr_sort_rep.pl $f2-stage3-all.clstr $f2-stage3 > $f2-stage3-rep.fa");
149 nice_run("mv -f $f2-stage3-all.clstr $dir/OTU.clstr");
150 nice_run("$script_dir/cd-hit-otu-table-faa.pl -i $dir/OTU.clstr -s $f2-stage3-rep.fa -o $dir/OTU-dist.txt -f $dir/OTU.fa");
151 }
152
# Final statistics: OTU count, sequences in OTUs, and the CPU time of this
# process plus its children as reported by times().
153 if (not $debug_mode) {
154 $no_clstr = count_clstrs_from_clstr_file("$dir/OTU.clstr");
155 $no_seq = count_seqs_from_clstr_file("$dir/OTU.clstr");
156 print LOG "OTU clustering at $c1\n";
157 print LOG "Number_OTUs\t$no_clstr\n";
158 print LOG "Number_seqs_in_OTUs\t$no_seq\n";
159 my ($tu,$ts,$cu,$cs)=times(); my $tt=$tu+$ts+$cu+$cs;
160 print LOG "Total_CPU_time\t$tt\n";
161 }
162 close(LOG);
163
164
# Build the command-line help text for this script.
#
# Returns the help text as one string; the caller hands it to die() when the
# mandatory -i / -o options are missing. The text interpolates the file-level
# $script_name.
#
# The restart table lists the thresholds actually used by the stages
# (0.9925 for stage 1, 0.985 for stage 2).
sub usage {
    return <<"EOF";
Usage:
$script_name -i contig_fasta_file -o output_dir -a abundance_cutoff -c OTU_cutoff -m check_chimera_flag

Options:
    -i input fasta file of contig
    -j second input fasta file, passed to cd-hit-dup as -i2
    -o output dir
    -c OTU cutoff, default 0.97
    -m whether to perform chimera checking (true/false), default true
    -a abundance cutoff, default 0.00005
       small clusters < this size will be considered as noise and will be removed
       if total input sequence is 50,000, then clusters < 2 (i.e. singletons) are removed
    -f 1 or 0, default 0
       if set to 1, then use cd-hit-dup instead of cd-hit-est for stage 1 and 2 clustering
       which is very fast
    -d extra options appended to the cd-hit-est command lines
    -Z debug flag, if set, commands are printed but not run
    -R restart flag, if re-run at different abundance cutoff value or something,
       with this parameter, program can skip the first n step and restart at certain step
       values:
       0 default, start from the scratch cd-hit-dup
       1 cd-hit-est at 99.25
       2 cd-hit-est at 98.50
       3 filtering and cd-hit-est at 97%

EOF
}
191 ###### END usage
192
# Echo a shell command to STDERR and run it, unless debug mode is on.
#
# Argument: the complete shell command line as one string.
# Returns:  the command's captured standard output, or undef in debug mode
#           (file-level $debug_mode true), in which case nothing is executed.
#
# The result variable is declared unconditionally: combining "my" with a
# statement modifier ("my $x = ... unless ...") has undefined behaviour in Perl.
# A non-zero exit status is reported on STDERR but does not stop the pipeline.
sub nice_run {
    my $str = shift;
    print STDERR "$str\n";

    my $cmd;
    unless ($debug_mode) {
        $cmd = `$str`;
        warn "command exited with status $?: $str\n" if ($? != 0);
    }
    return $cmd;
}
199 ##########
200
# Count the clusters in a .clstr file, i.e. the lines that start with ">".
#
# Argument: path of the cluster file.
# Returns:  the number of matching lines; an empty string (after a warning) if
#           the file cannot be opened, matching what the former shell "grep -c"
#           call yielded in that case.
#
# The file is read directly instead of interpolating the path into a shell
# command, so paths containing spaces or shell metacharacters work.
sub count_clstrs_from_clstr_file {
    my $clstr = shift;
    my $n = 0;
    open(my $fh, '<', $clstr) or do {
        warn "can not open $clstr: $!\n";
        return "";
    };
    while (my $line = <$fh>) {
        $n++ if ($line =~ /^>/);
    }
    close($fh);
    return $n;
}
207
# Count the member lines in a .clstr file, i.e. every line that does NOT start
# with ">" (blank lines included, as with the former "grep -cv").
#
# Argument: path of the cluster file.
# Returns:  the number of such lines; an empty string (after a warning) if the
#           file cannot be opened, matching what the former shell call yielded.
#
# The file is read directly instead of interpolating the path into a shell
# command, so paths containing spaces or shell metacharacters work.
sub count_seqs_from_clstr_file {
    my $clstr = shift;
    my $n = 0;
    open(my $fh, '<', $clstr) or do {
        warn "can not open $clstr: $!\n";
        return "";
    };
    while (my $line = <$fh>) {
        $n++ unless ($line =~ /^>/);
    }
    close($fh);
    return $n;
}
214
# Count the sequences in a FASTA file, i.e. the header lines starting with ">".
#
# Argument: path of the FASTA file.
# Returns:  the number of header lines; an empty string (after a warning) if
#           the file cannot be opened, matching what the former shell "grep -c"
#           call yielded in that case.
#
# The file is read directly instead of interpolating the path into a shell
# command, so paths containing spaces or shell metacharacters work.
sub count_seqs_from_fasta_file {
    my $faa = shift;
    my $n = 0;
    open(my $fh, '<', $faa) or do {
        warn "can not open $faa: $!\n";
        return "";
    };
    while (my $line = <$fh>) {
        $n++ if ($line =~ /^>/);
    }
    close($fh);
    return $n;
}
221
0 #!/usr/bin/perl
1 #
# Convert a CD-HIT .clstr file into a tab-delimited OTU table.
#
# Options:
#   -i  input cluster file (default "OTU.clstr")
#   -o  output table       (default "OTU.txt")
#
# Each cluster becomes one row "OTU<n>". Member ids of the form
# "Sample|<sample_id>|..." are counted per sample; any other member id is taken
# as the annotation of that cluster (the last one seen wins). If an annotation
# looks like a Greengenes header ("|k__Bacteria;.p__"), seven taxonomy columns
# are added before the final Annotation column.
use Getopt::Std;

my %opts;
getopts("i:s:S:o:f:j:", \%opts);

my $input  = $opts{i}; $input  = "OTU.clstr" unless ($input);
my $output = $opts{o}; $output = "OTU.txt"   unless ($output);

my %count      = ();   # $count{OTU number}{sample id} = number of member sequences
my %sample_id  = ();   # set of all sample ids seen
my %OTU_2_ann  = ();   # OTU number => annotation (id of a non-sample member)
my $tree_flag  = 0;    #### for greengene header format
# >4360486|k__Bacteria;.p__Firmicutes;.c__Clostridia;.o__Clostridiales;.f__Lachnospiraceae;.g__Roseburia;.s__faecis

open(my $in_fh, '<', $input) || die "can not open $input";
my $OTU = 0;
while (my $ll = <$in_fh>) {
    if ($ll =~ /^>/) {          # cluster header line
        $OTU++;
        next;
    }

    chomp($ll);                 # chomp, not chop: never eat a real character
    die "format error $ll" unless ($ll =~ /\d+(aa|nt), >(.+)\.\.\./);
    my $id = $2;

    if ($id =~ /^Sample\|([^\|]+)\|/) {
        my $sample = $1;
        $sample_id{$sample} = 1;
        $count{$OTU}{$sample}++;
    }
    else {
        $OTU_2_ann{$OTU} = $id;
        $tree_flag = 1 if ($id =~ /\|k__Bacteria;.p__/);
    }
}
close($in_fh);

my @sample_ids = sort keys %sample_id;

open(my $out_fh, '>', $output) || die "can not write $output";

# Header row.
print $out_fh "OTU";
foreach my $sample (@sample_ids) {
    print $out_fh "\t$sample";
}
if ($tree_flag) {
    print $out_fh "\t", join("\t", qw/Kingdom Phylum Class Order Family Genus Species/);
}
print $out_fh "\tAnnotation\n";

# One row per cluster, numbered from 1 in file order.
for (my $i = 1; $i <= $OTU; $i++) {
    my $ann = "None";
    if ($OTU_2_ann{$i}) { $ann = $OTU_2_ann{$i}; }

    print $out_fh "OTU$i";
    foreach my $sample (@sample_ids) {
        my $k = $count{$i}{$sample} ? $count{$i}{$sample} : 0;
        print $out_fh "\t$k";
    }

    if ($tree_flag) {
        # Pull the name after each rank prefix (k__, p__, ... s__); a missing
        # rank yields an empty column.
        my @taxa = ();
        foreach my $rank (qw/k p c o f g s/) {
            push(@taxa, ($ann =~ /${rank}__(\w+)/) ? $1 : "");
        }
        print $out_fh "\t", join("\t", @taxa);
    }
    print $out_fh "\t$ann\n";
}
close($out_fh);
80
81
0 #!/usr/bin/perl
1
# Filter a pre-clustering result: drop clusters that are too small or whose
# representative looks chimeric, then write the surviving clusters and the
# matching R1/R2 representative sequences.
#
# Inputs (all required unless noted):
#   -k  seq.nr.clstr     duplicate-level clusters; gives each representative's size
#   -i  R1-only clusters   -j  R2-only clusters
#   -a  seq.99.clstr     the pre-clustering to be filtered
#   -f / -g              R1 / R2 fasta files of that pre-clustering
#   -o  output prefix    writes <o> (R1), <o>.2 (R2), <o>.clstr and <o>.log
#   -c  abundance cutoff (optional, default 0.0001)
#
# A representative is a chimera candidate when its R1 and R2 reads fall into
# different R1-only / R2-only clusters, both of those clusters' representatives
# are at least twice as abundant as the candidate, and those representatives
# belong to different clusters of the -a file.
#
# Input files are opened with three-argument open; the original two-argument
# form would also have treated names such as "cmd |" as pipes.
use Getopt::Std;
my $script_name = $0;
my $script_dir  = $0;
$script_dir =~ s/[^\/]+$//;
chop($script_dir);                      # drop the trailing "/" left by the substitution
$script_dir = "./" unless ($script_dir);

my %opts;
getopts("k:i:j:o:p:c:s:t:m:e:Z:a:f:d:R:g:",\%opts);
die usage() unless ($opts{k} and $opts{i} and $opts{j} and $opts{a} and $opts{f} and $opts{g} and $opts{o});

my $input0     = $opts{k}; ## nr.clstr
my $input      = $opts{i}; ## R1 only clstr
my $input2     = $opts{j}; ## R2 only clstr
my $clstr_99   = $opts{a}; ## seq.99.clstr #### can be any 2nd -preclustering e.g. 98.5%
my $seq_99     = $opts{f}; ## seq.99 - fasta file R1
my $seq_992    = $opts{g}; ## seq.99 - fasta file R2
my $output     = $opts{o}; ## seq.99f
my $abs_cutoff = $opts{c}; $abs_cutoff = 0.0001 unless ($abs_cutoff);
my $output_2   = "$output.2";     ## seq.99f.2 -- R2
my $output_cls = "$output.clstr"; ## seq.99f.clstr
my $output_log = "$output.log";   ## seq.99f.log

# Parse one member line of a .clstr file.
# Returns (id, is_representative) or an empty list if the line is not a member
# line. chomp (not chop) is used so that a final line without a newline keeps
# its trailing "*" representative marker.
my $parse_member = sub {
    my ($line) = @_;
    chomp($line);
    return unless ($line =~ /\d+(aa|nt), >(.+)\.\.\./);
    my $id = $2;
    my $is_rep = ($line =~ /\*$/) ? 1 : 0;
    return ($id, $is_rep);
};

#### read seq.nr.clstr: size and member list of every duplicate-level cluster
my $num_total_seq = 0;
my %seq_nr_size;   # representative id => number of members counted
my %seqs_of_nr;    # representative id => [member ids]
open(my $log_fh, '>', $output_log) || die "can not open $output_log";
open(my $nr_fh, '<', $input0) || die "can not open $input0";
{
    my $rep = "";
    while (my $line = <$nr_fh>) {
        if ($line =~ /^>/) {
            $rep = "";
            next;
        }
        my ($id, $is_rep) = $parse_member->($line);
        next unless (defined($id));
        $num_total_seq++;
        if ($is_rep) { $rep = $id; $seq_nr_size{$rep} = 0; $seqs_of_nr{$rep} = []; }
        next unless ($rep);     # members listed before the representative are not counted
        $seq_nr_size{$rep}++;
        push(@{$seqs_of_nr{$rep}}, $id);
    }
}
close($nr_fh);

#### read the R1-only and R2-only cluster files: member id => representative id
my %seq_R1_clstr;
my %seq_R2_clstr;
foreach my $f (($input, $input2)) {
    my $target = ($f eq $input) ? \%seq_R1_clstr : \%seq_R2_clstr;
    open(my $fh, '<', $f) || die "can not open $f";
    my $rep = "";
    while (my $line = <$fh>) {
        if ($line =~ /^>/) {
            $rep = "";
            next;
        }
        my ($id, $is_rep) = $parse_member->($line);
        next unless (defined($id));
        $rep = $id if ($is_rep);
        $target->{$id} = $rep if ($rep);
    }
    close($fh);
}

#### open $clstr_99 first time: member id => zero-based cluster index
my %rep_2_otu = ();
my $OTU = -1;
open(my $otu_fh, '<', $clstr_99) || die "can not open $clstr_99";
while (my $line = <$otu_fh>) {
    if ($line =~ /^>/) {
        $OTU++;
        next;
    }
    my ($id) = $parse_member->($line);
    $rep_2_otu{$id} = $OTU if (defined($id));
}
close($otu_fh);

my %chimeric_ids = ();
#### those ids are candidates, if they are recurited by other non-chimeric clusters,
#### then they are not chimeric anymore
foreach my $id (keys %seq_R1_clstr) {
    my $rep1 = $seq_R1_clstr{$id};
    my $rep2 = $seq_R2_clstr{$id};

    next if ($rep1 eq $rep2);
    next unless ($seq_nr_size{$rep1} >= $seq_nr_size{$id}*2);
    next unless ($seq_nr_size{$rep2} >= $seq_nr_size{$id}*2);
    next if ($rep_2_otu{$rep1} eq $rep_2_otu{$rep2});
    $chimeric_ids{$id} = 1;
}

#### parse seq.99.clstr
my $cutoff_clstr_size = int($num_total_seq * $abs_cutoff);
$cutoff_clstr_size = 1 unless ($cutoff_clstr_size >= 1); #### singleton will be removed
#print LOG "cutoff_clstr_size\t$cutoff_clstr_size\n";

open(my $c99_fh, '<', $clstr_99) || die "can not open $clstr_99";
open(my $cls_fh, '>', $output_cls) || die "can not write to $output_cls";
my %good_ids = ();

# Decide the fate of one finished cluster: keep it (write its text, remember
# its representative) or log every underlying sequence as chimeric / small.
my $flush_cluster = sub {
    my ($rep, $clstr_size, $clstr_txt, $members) = @_;
    return unless ($clstr_txt);

    if (($clstr_size > $cutoff_clstr_size) and (not $chimeric_ids{$rep})) {
        print $cls_fh $clstr_txt;
        $good_ids{$rep} = 1;
        return;
    }

    my $is_chimeric = $chimeric_ids{$rep};
    foreach my $nr_rep (@{$members}) {
        foreach my $seq_id ( @{ $seqs_of_nr{$nr_rep} } ) {
            if ($is_chimeric) {
                print $log_fh "$seq_id\tChimeric_cluster\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\tOTU1:$rep_2_otu{$seq_R1_clstr{$rep}}\tOTU2:$rep_2_otu{$seq_R2_clstr{$rep}}\n";
            }
            else {
                print $log_fh "$seq_id\tSmall_cluster\t$rep\t$clstr_size\n";
            }
        }
    }
};

{
    my $clstr_txt     = "";   # raw text of the cluster being read
    my $clstr_size    = 0;    # sum of duplicate-level sizes of its members
    my $rep           = "";
    my @seqs_this_cls = ();

    while (my $line = <$c99_fh>) {
        if ($line =~ /^>/) {
            $flush_cluster->($rep, $clstr_size, $clstr_txt, \@seqs_this_cls);
            $clstr_size    = 0;
            $clstr_txt     = $line;
            $rep           = "";
            @seqs_this_cls = ();
            next;
        }
        $clstr_txt .= $line;
        my ($id, $is_rep) = $parse_member->($line);
        next unless (defined($id));
        $clstr_size += $seq_nr_size{$id};
        $rep = $id if ($is_rep);
        push(@seqs_this_cls, $id);
    }
    # the last cluster has no following header line to trigger its flush
    $flush_cluster->($rep, $clstr_size, $clstr_txt, \@seqs_this_cls);
}
close($c99_fh);
close($cls_fh);

#### copy the fasta records of kept representatives, R1 then R2
foreach my $f (($seq_99, $seq_992)) {
    my $fout = ($f eq $seq_99) ? $output : $output_2;

    open(my $in_fh, '<', $f) || die "can not open $f";
    open(my $out_fh, '>', $fout) || die "can not write to $fout";

    my $flag = 0;
    while (my $line = <$in_fh>) {
        if ($line =~ /^>/) {
            my $gi = substr($line, 1);
            chomp($gi);
            $gi =~ s/\s.+$//;   # keep only the id, drop the description
            $flag = ( $good_ids{$gi} ) ? 1 : 0;
        }
        print $out_fh $line if ($flag);
    }

    close($in_fh);
    close($out_fh);
}


close($log_fh);
215
# Build the command-line help text for this script.
#
# Returns the help text as one string; the caller hands it to die() when a
# required option is missing. The text interpolates the file-level
# $script_name.
#
# usage() is called before the file-level $abs_cutoff has been assigned, so
# interpolating it directly printed an empty default. Fall back to the same
# literal default the option parsing uses (0.0001) when it is still undefined.
sub usage {
    my $default_cutoff = defined($abs_cutoff) ? $abs_cutoff : 0.0001;
    return <<"EOF";
Usage:
$script_name -k seq.nr.clstr -i seq.nr.R1.clstr -j seq.nr.R2.clstr -c 0.0001 -a seq.99.clstr -f seq.99 -g seq.99.2 -o seq.99f

Options:
    -k input seq.nr.clstr
    -i input seq.nr.R1.clstr
    -j input seq.nr.R2.clstr
    -a input seq.99.clstr
    -f input seq.99
    -g input seq.99.2
    -o output
    -c abundance cutoff, default $default_cutoff
       clusters with size <= int(total input sequences * cutoff), at least 1,
       will be considered as noise and will be removed
       if total input sequence is 10,000, then singletons are removed

EOF
}
235 ###### END usage
236
0 #!/usr/bin/perl
1
2 use Getopt::Std;
#### Script name and directory, both taken from $0.
my $script_name = $0;
my $script_dir  = $0;
$script_dir =~ s{[^/]+$}{};                   ## drop the file-name part
chop($script_dir);                            ## drop the trailing "/"
if (not $script_dir) { $script_dir = "./"; }

#### Command-line options; -i -j -a -f -g -o are all mandatory.
getopts("k:i:j:o:p:c:s:t:m:e:Z:a:f:d:R:g:", \%opts);
foreach my $required (qw/i j a f g o/) {
  die usage() unless ($opts{$required});
}

my ($input,      ## R1 only clstr
    $input2,     ## R2 only clstr
    $clstr_99,   ## seq.97f-full.clstr #### can be any 2nd -preclustering e.g. 98.5%
    $seq_99,     ## seq.99 - fasta file R1
    $seq_992,    ## seq.99 - fasta file R2
    $output)     ## seq.99f
  = @opts{qw/i j a f g o/};

#### small cluster will be checked for chimeric; any false value falls back to 0.01
my $abs_cutoff = $opts{c} || 0.01;

#### derived output names: seq.99f.2 (R2), seq.99f.clstr, seq.99f.log
my ($output_2, $output_cls, $output_log) = map { "$output.$_" } qw/2 clstr log/;
22
my ($i, $j, $k, $str, $cmd, $ll);

#### First pass over the pre-clustering .clstr file ($clstr_99):
####   $num_total_seq      number of members whose ID starts with "Sample"
####   $seq_nr_size{$rep}  number of members counted for representative $rep
####   $seqs_of_rep{$rep}  array ref with the IDs of those members
#### A member is attributed to a representative only once the cluster's "*"
#### line has been seen; members listed before it are not counted.
my $num_total_seq = 0;
my %seq_nr_size;
my %seqs_of_rep;
open(LOG, ">", $output_log) || die "can not open $output_log: $!";
open(TMP, "<", $clstr_99)   || die "can not open $clstr_99: $!";
{
  my $rep = "";
  while ($ll = <TMP>) {
    if ($ll =~ /^>/) {           ## ">Cluster N" header starts a new cluster
      $rep = "";
      next;
    }
    #### strip the line ending only; chop would delete the "*" marker from a
    #### final line that has no newline, and a CR would defeat the /\*$/ test
    $ll =~ s/[\r\n]+\z//;
    next unless ($ll =~ /\d+(aa|nt), >(.+)\.\.\./);
    my $id = $2;
    $num_total_seq++ if ($id =~ /^Sample/);
    if ($ll =~ /\*$/) {          ## representative line
      $rep = $id;
      $seq_nr_size{$rep} = 0;
      $seqs_of_rep{$rep} = [];
    }
    if ($rep) {
      $seq_nr_size{$rep}++;
      push(@{ $seqs_of_rep{$rep} }, $id);
    }
  }
}
close(TMP);
50
51
#### Map every "Sample" sequence to the representative of its cluster in the
#### R1-only clustering (%seq_R1_clstr) and the R2-only clustering
#### (%seq_R2_clstr). Members listed before the "*" line of their cluster are
#### not recorded, because the representative is not known yet.
my %seq_R1_clstr;
my %seq_R2_clstr;
foreach my $f (($input, $input2)) {
  #### choose the destination map once per file instead of once per line
  my $rep_of = ($f eq $input) ? \%seq_R1_clstr : \%seq_R2_clstr;

  open(TMP, "<", $f) || die "can not open $f: $!";
  my $rep = "";

  while ($ll = <TMP>) {
    if ($ll =~ /^>/) {           ## new cluster
      $rep = "";
      next;
    }
    #### strip the line ending only; chop would delete the "*" marker from a
    #### final line that has no newline, and a CR would defeat the /\*$/ test
    $ll =~ s/[\r\n]+\z//;
    next unless ($ll =~ /\d+(aa|nt), >(.+)\.\.\./);
    my $id = $2;
    $rep = $id if ($ll =~ /\*$/);
    $rep_of->{$id} = $rep if ($rep and ($id =~ /^Sample/));
  }
  close(TMP);
}
79
#### Absolute size limit: clusters larger than this are never tested.
#### It is the relative cutoff times the number of sample sequences, at least 1.
my $cutoff_clstr_size = int($num_total_seq * $abs_cutoff);
if (not ($cutoff_clstr_size >= 1)) { $cutoff_clstr_size = 1; }
#print LOG "cutoff_clstr_size\t$cutoff_clstr_size\n";

#### Chimeric candidates, keyed by representative ID.
#### These IDs are only candidates: if they are recruited by other
#### non-chimeric clusters, they are not chimeric anymore.
my %chimeric_ids = ();
CANDIDATE:
foreach my $cand (grep { /^Sample/ } keys %seq_nr_size) {
  my ($rep1, $rep2) = ($seq_R1_clstr{$cand}, $seq_R2_clstr{$cand});

  #### both reads must have a parent, the parents must differ, and neither
  #### parent may be the candidate itself
  next CANDIDATE if (not $rep1 or not $rep2);
  next CANDIDATE if ($rep1 eq $rep2 or $rep1 eq $cand or $rep2 eq $cand);

  my $size = $seq_nr_size{$cand};
  next CANDIDATE if ($size > $cutoff_clstr_size);

  #### a parent with a known cluster size must be at least twice as large
  foreach my $parent ($rep1, $rep2) {
    my $parent_size = $seq_nr_size{$parent};
    next CANDIDATE if (defined($parent_size) and $parent_size < $size * 2);
  }

  $chimeric_ids{$cand} = 1;
}
102
#### parse seq.97fwref.clstr
#### do chimeric checking for sample-only clusters
#### Second pass over $clstr_99: every cluster holding at least one "Sample"
#### sequence is either written to $output_cls (its representative is then
#### recorded in %good_ids) or, when its representative is in %chimeric_ids,
#### reported to LOG one line per member. Reference-only clusters are dropped.
open(TMP, "<", $clstr_99)   || die "can not open $clstr_99: $!";
open(OUT, ">", $output_cls) || die "can not write to $output_cls: $!";
my %good_ids = ();
{
  my $clstr_txt  = "";   ## raw text of the cluster being read
  my $clstr_size = 0;    ## number of member lines in it
  my $rep        = "";   ## its representative ID
  my $refonly    = 1;    ## stays 1 until a "Sample" member is seen

  #### Emit the finished cluster; shared by the header branch and end-of-file.
  my $flush_cluster = sub {
    return unless ($clstr_txt);
    return if ($refonly);
    if (not $chimeric_ids{$rep}) {
      print OUT $clstr_txt;
      $good_ids{$rep} = 1;
    }
    else {
      foreach my $member ( @{ $seqs_of_rep{$rep} } ) {
        print LOG "Chimeric_cluster\t$member\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\n";
      }
    }
  };

  while ($ll = <TMP>) {
    if ($ll =~ /^>/) {
      $flush_cluster->();
      $clstr_size = 0;
      $clstr_txt  = $ll;
      $rep        = "";
      $refonly    = 1;
    }
    else {
      $clstr_txt .= $ll;
      #### strip the line ending only; chop would delete the "*" marker from
      #### a final line without newline, and a CR would defeat the /\*$/ test
      (my $line = $ll) =~ s/[\r\n]+\z//;
      if ($line =~ /\d+(aa|nt), >(.+)\.\.\./) {
        my $id = $2;
        $clstr_size++;
        $rep     = $id if ($line =~ /\*$/);
        $refonly = 0   if ($id =~ /^Sample/);
      }
    }
  }
  $flush_cluster->();    ## the last cluster has no header line after it
}
close(TMP);
close(OUT) || die "can not write to $output_cls: $!";
163
#### Write the filtered FASTA files.
#### Each input FASTA ($seq_99 = R1, then $seq_992 = R2) is copied to its
#### output file ($output or $output_2); a record is copied only when the ID
#### on its header line is a key of %good_ids.
foreach my $f (($seq_99, $seq_992)) {
  my $fout = ($f eq $seq_99) ? $output : $output_2;

  open(TMP, "<", $f)    || die "can not open $f: $!";
  open(OUT, ">", $fout) || die "can not write to $fout: $!";

  my $flag = 0;   #### true while inside a record that is kept
  while ($ll = <TMP>) {
    if ($ll =~ /^>/) {
      my $gi = substr($ll, 1);
      #### chomp (not chop): a final header without newline keeps its last char
      chomp($gi);
      #### ID is the text before the first whitespace; ".*" (not ".+") so a
      #### lone trailing blank or CR is removed as well
      $gi =~ s/\s.*$//;
      $flag = ( $good_ids{$gi} ) ? 1 : 0;
    }
    print OUT $ll if ($flag);
  }

  close(TMP);
  close(OUT) || die "can not write to $fout: $!";
}
184
185 close(LOG);
186
#### Build the command-line help text.
#### Takes no arguments and returns the text as one string.
#### The script calls usage() before $abs_cutoff has been assigned, so the
#### documented default (0.01) is substituted whenever the variable is still
#### undefined; otherwise the help would read "default " with no value.
sub usage {
  my $cutoff_default = defined($abs_cutoff) ? $abs_cutoff : 0.01;
  return <<EOF;
Usage:
$script_name -i seq.nr.R1.clstr -j seq.nr.R2.clstr -c 0.0001 -a seq.97f-full.clstr -f seq.99 -g seq.99.2 -o seq.99f

Options:
-i input seq.nr.R1.clstr
-j input seq.nr.R2.clstr
-a input seq.97f-full.clstr
-f input seq.99
-g input seq.99.2
-o output cluster without chimeric cluster, without ref-only cluster
-c abundance cutoff, default $cutoff_default
small clusters < this size will be checked for chimeric and be removed if is chimeric
if total input sequence is 50,000, then clusters < 2 (i.e. singletons) are checked

EOF
}
205 ###### END usage
206
0 #!/usr/bin/perl
1
2 use Getopt::Std;
#### Filter a .clstr stream (files named on the command line, or STDIN).
#### For every cluster the header line and all member lines whose ID starts
#### with "Sample" are printed unchanged; of the remaining member lines only
#### the one with the highest trailing "/NN.NN%" identity is printed, after
#### the sample lines. Lines that do not look like a member line are reported
#### on STDERR.
my $ll;

my $clstr      = "";   ## header plus "Sample" member lines of the current cluster
my $best_ref   = "";   ## best-scoring non-sample member line seen so far
my $best_score = 0;    ## identity (percent) of $best_ref

#### Print the finished cluster; shared by the header branch and end-of-input.
my $print_cluster = sub {
  return unless ($clstr);
  print $clstr;
  print $best_ref if ($best_ref);
};

while ($ll = <>) {
  if ($ll =~ /^>/) {
    $print_cluster->();
    $clstr      = $ll;
    $best_ref   = "";
    $best_score = 0;
  }
  elsif ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
    my $id = $2;
    if ($id =~ /^Sample/) {
      $clstr .= $ll;
    }
    #### identity is the number between the last "/" and the final "%";
    #### "\s*" tolerates CRLF input, and the class no longer admits "|"
    elsif ($ll =~ /\/([\d.]+)%\s*$/) {
      my $iden = $1;
      if ($iden > $best_score) {
        $best_score = $iden;
        $best_ref   = $ll;
      }
    }
  }
  else {
    print STDERR "format err: $ll";
  }
}

$print_cluster->();    ## the last cluster has no header line after it
0 #!/usr/bin/perl
1
2 use Getopt::Std;
#### Filter a .clstr stream (files named on the command line, or STDIN):
#### a cluster is printed unchanged when at least one of its member IDs
#### starts with "Sample"; reference-only clusters are dropped.
my $ll;

my $clstr   = "";   ## raw text of the cluster being read
my $refonly = 1;    ## stays 1 until a "Sample" member is seen

while ($ll = <>) {
  if ($ll =~ /^>/) {
    print $clstr unless ($refonly);
    $clstr   = $ll;
    $refonly = 1;
  }
  else {
    $clstr .= $ll;
    if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
      my $id = $2;
      $refonly = 0 if ($id =~ /^Sample/);
    }
  }
}

#### The last cluster is not followed by another header line, so it has to be
#### flushed here; without this the final cluster of the input was always lost.
print $clstr unless ($refonly);

33
0 #!/usr/bin/perl
1 ## =========================== NGS tools ==========================================
2 ## NGS tools for metagenomic sequence analysis
3 ## May also be used for other type NGS data analysis
4 ##
5 ## Weizhong Li, UCSD
6 ## liwz@sdsc.edu
7 ## http://weizhongli-lab.org/
8 ## ================================================================================
9
10 use Getopt::Std;
getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
die usage() unless ($opts{i} and $opts{j} and $opts{o});

my $file1  = $opts{i};   ## annotation table: ID, whitespace, annotation text
my $fasta  = $opts{j};   ## input FASTA
my $output = $opts{o};   ## output FASTA
my ($ll, $id);

#### Load the annotation of every ID; blanks inside the annotation become "."
#### so the annotation can be glued to the ID in the FASTA header.
my %id_2_ann;
open(TMP, "<", $file1) || die "can not open $file1: $!";
while ($ll = <TMP>) {
  #### strip the line ending only: chop would eat the last character of the
  #### annotation on a final line without newline, and keep a CR on CRLF input
  $ll =~ s/[\r\n]+\z//;
  my ($ann_id, $txt) = split(/\s+/, $ll, 2);
  next unless (defined($ann_id) and defined($txt));   ## nothing to annotate
  $txt =~ s/ /./g;
  $id_2_ann{$ann_id} = $txt;
}
close(TMP);

#### Read the FASTA file. A header is recognised by ">" followed by digits;
#### the record key becomes "ID|annotation" when an annotation exists, else
#### just the ID. Every other line is appended (newline included) to the
#### record that is currently open.
my %id_2_seq = ();
$id = "";
open(TMP, "<", $fasta) || die "can not open $fasta: $!";
while ($ll = <TMP>) {
  if ($ll =~ /^>(\d+)/) {
    $id = $1;
    my $ann = $id_2_ann{$id};
    $id = "$id|$ann" if ($ann);
  }
  else {
    $id_2_seq{$id} .= $ll;
  }
}
close(TMP);

#### Longest header first.
#### NOTE(review): this orders by the length of the header string, not of the
#### sequence -- confirm that is intended. Ties are broken alphabetically so
#### the output no longer depends on hash ordering.
my @ids = sort { length($b) <=> length($a) or $a cmp $b } keys %id_2_seq;

open(OUT, ">", $output) || die "can not write to $output: $!";
foreach $id (@ids) {
  print OUT ">$id\n$id_2_seq{$id}";
}
close(OUT) || die "can not write to $output: $!";
56
57
58
#### Build the help text of this script.
#### Takes no arguments and returns the text as one string; $0 is interpolated
#### into the example command line.
sub usage {
  my $help_text = <<"EOD";
This script formats Greengene FASTA file for CD-HIT-OTU-MiSeq. You should download Greengene sequences
from http://greengenes.secondgenome.com/downloads, or ftp://greengenes.microbio.me/.
download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file. You may find
gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta

Run this script as $0 -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o gg_13_5_processed.fasta

Options:
======================
-i path for gg_13_5_otus/taxonomy/99_otu_taxonomy.txt
-j path for gg_13_5_otus/rep_set/99_otus.fasta
-o output FASTA file of formatted Greengene reference DB
EOD
  return $help_text;
}
0 #!/usr/bin/perl
1 #
2 use Getopt::Std;
getopts("s:S:o:f:j:",\%opts);

die usage() unless ($opts{s} or $opts{S});

my $output = $opts{o};
$output = "pooled" unless ($output);
my $sample_in = $opts{s};
my $sample_command_in = $opts{S}; #### ',' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq,sample2:R1.fq:R2.fq or sample1,sample2,sample3
my $job = $opts{j};
$job = "otu" unless ($job);

my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;

my ($i, $j, $k, $cmd, $ll);

#### Create the output directory with the builtin instead of a shell command,
#### so names with blanks or shell metacharacters are safe and failure is seen.
unless (-e $output) {
  mkdir($output) || die "can not create dir $output: $!";
}

#### Refuse to run when a pooled file already exists: files are appended to.
foreach $i (@file_list) {
  if (-e "$output/$i") {
    die "output dir $output & file $output/$i already exist, please remove all files from $output and re-run\n";
  }
}

######## parse NGS_samples
my @NGS_samples = ();
if (defined($sample_in)) {
  open(TMP, "<", $sample_in) || die "can not open $sample_in: $!";
  while ($ll = <TMP>) {
    next if ($ll =~ /^#/);
    next unless ($ll =~ /^\w/);
    chomp($ll);
    my ($id, @data) = split(/\s+/, $ll);   ## first column is the sample ID
    push(@NGS_samples, $id);
  }
  close(TMP);
}
elsif (defined($sample_command_in)) {
  foreach my $entry (split(/,/, $sample_command_in)) {
    my ($id, @data) = split(/:/, $entry);
    push(@NGS_samples, $id);
  }
}
else {
  die "no input samples";
}

#### Append the contents of $source to $target (created when missing), without
#### going through a shell. Returns 1 on success, or 0 after a warning on
#### STDERR when $source can not be read.
sub append_file {
  my ($source, $target) = @_;
  my ($in, $out);
  unless (open($in, "<", $source)) {
    print STDERR "Warning, can not read $source: $!\n";
    return 0;
  }
  open($out, ">>", $target) || die "can not write to $target: $!";
  binmode($in);
  binmode($out);
  my $buf;
  while (read($in, $buf, 65536)) {
    print $out $buf;
  }
  close($in);
  close($out) || die "can not write to $target: $!";
  return 1;
}

#### Pool every result file: concatenate <sample>/<job>/<file> of all samples,
#### in sample order, into <output>/<file>.
foreach $i (@file_list) {
  my $target = "$output/$i";
  foreach $j (@NGS_samples) {
    my $source = "$j/$job/$i";
    if (-e $source) {
      print STDERR "cat $source >> $target\n";
      append_file($source, $target);
    }
    else {
      print STDERR "Warning, $source missing\n";
    }
  }
}
61
#### Build the help text of this script.
#### Takes no arguments and returns the text as one string; $0 is interpolated
#### into the example command line.
sub usage {
  my $help_text = <<"EOD";
$0 -s sample_file -o output_dir
-s sample data file, required unless -S is present
File format example
#Sample data file example, TAB or space delimited for following lines
Sample_ID1 sample_data_0 sample_data_1
Sample_ID2 sample_data_0 sample_data_1
Sample_ID3 sample_data_0 sample_data_1

-S sample data from command line, required unless -s is present
format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1

EOD
  return $help_text;
}
77