Commit 08d8190e044b037e8c64797551ee7c44c38b303a - cd-hit

New upstream version 4.6.8 Sascha Steinbiss 6 years ago

28 changed file(s) with 6180 addition(s) and 320 deletion(s). Raw diff Collapse all Expand all

+109

-0

FET.pl less more

	0	#!/usr/bin/perl
	1
	2	use Storable;
	3	use strict;
	4	use Text::NSP::Measures::2D::Fisher::right;
	5
	6	my $clstr_file = shift;
	7	my $anno_file = shift;
	8	my $store_file = shift;
	9
	10	my @cls_list = ();
	11	my @fun_list = ();
	12	my $cur_cls = "";
	13	my %cls2rep = ();
	14	my @cur_anno = ();
	15
	16
	17	open(TMP, $clstr_file) \|\| die;
	18	while(my $ll = <TMP>) { # read .clstr files
	19	if ($ll =~ /^>/) { # the begin of a cluster
	20	$cur_cls = $ll;
	21	$cur_cls =~ s/^>(.*?)\s$/$1/;
	22	# print "$cur_cls\|\n";
	23	}
	24	else{
	25	chop($ll);
	26	if ($ll =~ /^(\d+)\s+(\d+)(aa\|nt),\s+>(.+)\.\.\./) {
	27	my @tmp = split(/\\|\\|/,$4);
	28	if ($#tmp == 0){
	29	@cur_anno = ();
	30	}
	31	else{
	32	@cur_anno = split(/,/, pop(@tmp));
	33	}
	34	# print $cur_cls.$cur_anno[0]."\|\n";
	35	push(@cls_list, $cur_cls);
	36	push(@fun_list, [@cur_anno]);
	37	if ($ll =~ /^(\d+)\s+(\d+)(aa\|nt),\s+>(.+)\.\.\.(.)\$/){
	38	# print "$4\n";
	39	$cls2rep{$cur_cls} = $4;
	40	# print "$cur_cls\t$4\n";
	41	}
	42	}
	43	}
	44	}
	45
	46	#print join("\n", @cls_list[0..10]);
	47	@cls_list = map {$cls2rep{$_}} @cls_list;
	48	#print join("\n", @cls_list[0..10]);
	49	#print "\n";
	50	#foreach my $i (0..10){
	51	# print join("\t",@{$fun_list[$i]});
	52	# print "\n";
	53	#}
	54	#print join("\n", @fun_list[0..10]);
	55	#exit(1);
	56	my %cls_size = ();
	57	my %cls_anno = ();
	58	my %anno_size = ();
	59	my $M = $#fun_list+1;
	60	#print $#fun_list."\t".$M."\n";
	61	#print $#cls_list."\t".$M."\n";
	62	foreach my $i (0..$#fun_list){
	63	$cls_size{$cls_list[$i]}++;
	64	if ($#{$fun_list[$i]} >= 0) { # have annotation
	65	foreach my $anno (@{$fun_list[$i]}){
	66	# print "$i\t$cls_list[$i]\t$anno\n";
	67	$anno_size{$anno}++;
	68	$cls_anno{$cls_list[$i]}{$anno}++;
	69	}
	70	}
	71	}
	72
	73	#while (my ($a,$b) = each %anno_size){
	74	# print "$a\t$b\n";
	75	#}
	76
	77	#print "COG0171\t".$anno_
	78
	79	my %resu = ();
	80	while(my ($cls, $cls_s) = each %cls_size){
	81	my @tmp = ();
	82	# $resu{$cls} = [];
	83	while (my ($anno,$anno_s) = each %{$cls_anno{$cls}}){
	84	# print "$cls\t$cls_s\t$anno\t$anno_s\t$anno_size{$anno}";
	85	# print "\n";
	86	my $pvalue = calculateStatistic(n11=>$anno_s, n1p=>$cls_s, np1=>$anno_size{$anno}, npp=>$M);
	87	# anno_term, anno_size, clsper, anno_total, backper, enrichment, pvalue
	88	push @tmp, [$anno, $anno_s, $anno_s/$cls_s, $anno_size{$anno}, $anno_size{$anno}/$M, $anno_s$M/($cls_s$anno_size{$anno}), $pvalue];
	89	# push $resu{$cls}, [sort{$a[0] <=> $b[0]} @tmp];
	90	}
	91	@tmp = sort {$$a[6] <=> $$b[6]} @tmp;
	92	$resu{$cls} = [@tmp];
	93	}
	94
	95	store \%resu, $store_file;
	96	open(TMP, "> $anno_file") \|\| die;
	97	print TMP "ClsName\tClsSize\tAnno_term\tAnno_size\tClsPer\tAnno_total\tSeq_total\tBackPer\tEnrichment\tPvalue\n";
	98	while(my ($cls, $info) = each %resu){
	99	foreach my $a (@{$info}){ #[$pvalue, $enrichment, $anno_s, $anno]
	100	print TMP join("\t",($cls, $cls_size{$cls}, $a->[0], $a->[1], $a->[2], $a->[3],
	101	$M, $a->[4], $a->[5], $a->[6]))."\n";
	102	# print "$cls\t".join("\t",@{$a})."\n";
	103	}
	104	# print "$cls\t$#{$info}\n";
	105	}
	106	close(TMP)
	107
	108

-3

README less more

13	13	please download legacy BLAST (not BLAST+) and install the executables in your $PATH
14	14
15	15
16		For more information, please visit http://cd-hit.org or please read the cdhit-users-guide.pdf.
17		Most up-to-date documents are available at http://weizhongli-lab.org/cd-hit/wiki/doku.php?id=cd-hit_user_guide.
	16	For more information, please visit http://cd-hit.org
18	17
19		cd-hit was originally hosted at Google Code, some of the old releases are still available from https://code.google.com/p/cdhit/.
	18	Most up-to-date documents are available at https://github.com/weizhongli/cdhit/wiki
20	19
21	20	cd-hit is also available as a web server; visit http://cd-hit.org for the web server address.

+486

-110

cdhit-common.c++ less more

211	211	{
212	212	int intval = atoi( value );
213	213	if (strcmp(flag, "-i" ) == 0) input = value;
	214	else if (strcmp(flag, "-j" ) == 0) input_pe = value;
214	215	else if (strcmp(flag, "-o" ) == 0) output = value;
	216	else if (strcmp(flag, "-op") == 0) output_pe = value;
215	217	else if (strcmp(flag, "-M" ) == 0) max_memory = atoll(value) * 1000000;
216	218	else if (strcmp(flag, "-l" ) == 0) min_length = intval;
217	219	else if (strcmp(flag, "-c" ) == 0) cluster_thd = atof(value), useIdentity = true;

222	224	else if (strcmp(flag, "-s" ) == 0) diff_cutoff = atof(value);
223	225	else if (strcmp(flag, "-S" ) == 0) diff_cutoff_aa = intval;
224	226	else if (strcmp(flag, "-B" ) == 0) store_disk = intval;
	227	else if (strcmp(flag, "-P" ) == 0) PE_mode = intval;
	228	else if (strcmp(flag, "-cx") == 0) trim_len = intval;
	229	else if (strcmp(flag, "-cy") == 0) trim_len_R2 = intval;
	230	else if (strcmp(flag, "-ap") == 0) align_pos = intval;
	231	else if (strcmp(flag, "-sc") == 0) sort_output = intval;
	232	else if (strcmp(flag, "-sf") == 0) sort_outputf = intval;
225	233	else if (strcmp(flag, "-p" ) == 0) print = intval;
226	234	else if (strcmp(flag, "-g" ) == 0) cluster_best = intval;
227	235	else if (strcmp(flag, "-G" ) == 0) global_identity = intval;

279	287	{
280	288	if( SetOptionCommon( flag, value ) ) return true;
281	289	if (strcmp(flag, "-i2" ) == 0) input2 = value;
	290	else if (strcmp(flag, "-j2" ) == 0) input2_pe = value;
282	291	else if (strcmp(flag, "-s2") == 0) diff_cutoff2 = atof(value);
283	292	else if (strcmp(flag, "-S2") == 0) diff_cutoff_aa2 = atoi(value);
284	293	else return false;

350	359	if ((cluster_thd > 1.0) \|\| (cluster_thd < 0.4)) bomb_error("invalid clstr");
351	360	}
352	361
	362	if (input.size() == 0) bomb_error("no input file");
	363	if (output.size() == 0) bomb_error("no output file");
	364	if (PE_mode) {
	365	if (input_pe.size() == 0) bomb_error("no input file for R2 sequences in PE mode");
	366	if (output_pe.size() == 0) bomb_error("no output file for R2 sequences in PE mode");
	367	}
	368	if (isEST && (align_pos==1)) option_r = 0;
	369
353	370	if (band_width < 1 ) bomb_error("invalid band width");
354	371	if (NAA < 2 \|\| NAA > NAA_top_limit) bomb_error("invalid word length");
355	372	if (des_len < 0 ) bomb_error("too short description, not enough to identify sequences");

359	376	if( has2D ){
360	377	if ((diff_cutoff2<0) \|\| (diff_cutoff2>1)) bomb_error("invalid value for -s2");
361	378	if (diff_cutoff_aa2<0) bomb_error("invalid value for -S2");
	379	if (PE_mode) {
	380	if (input2_pe.size() == 0) bomb_error("no input file for R2 sequences for 2nd db in PE mode");
	381	}
362	382	}
363	383	if (global_identity == 0) print = 1;
364	384	if (short_coverage < long_coverage) short_coverage = long_coverage;

467	487	seq[j] = 0;
468	488	} // END void format_seq
469	489
	490	void strrev(char *p)
	491	{
	492	char *q = p;
	493	while(q && *q) ++q;
	494	for(--q; p < q; ++p, --q)
	495	p = p ^ *q,
	496	q = p ^ *q,
	497	p = p ^ *q;
	498	}
470	499
471	500	////For smiple len1 <= len2, len2 is for existing representative
472	501	////walk along all diag path of two sequences,

1457	1486	distance = 2.0;
1458	1487	if( other.data ){
1459	1488	size = bufsize = other.size;
	1489	size_R2 = 0;
1460	1490	data = new char[size+1];
1461	1491	//printf( "data: %p %p\n", data, other.data );
1462	1492	data[size] = 0;

1470	1500	identifier[len] = 0;
1471	1501	}
1472	1502	}
	1503
	1504	// back to back merge for PE
	1505	// R1 -> XXXXXXABC ------------------- NMLYYYYYY <--R2
	1506	// >R1 >R2
	1507	// XXXXXXABC YYYYYYLMN =====> Merge into
	1508	// >R12
	1509	// NMLYYYYYYXXXXXXABC
	1510	Sequence::Sequence( const Sequence & other, const Sequence & other2, int mode )
	1511	{
	1512	int i;
	1513	if (mode != 1) bomb_error("unknown mode");
	1514
	1515	//printf( "new: %p %p\n", this, & other );
	1516	memcpy( this, & other, sizeof( Sequence ) );
	1517	distance = 2.0;
	1518
	1519	if( other.data && other2.data ){
	1520	size = bufsize = (other.size + other2.size);
	1521	size_R2 = other2.size;
	1522	data = new char[size+1];
	1523	//printf( "data: %p %p\n", data, other.data );
	1524	data[size] = 0;
	1525	data[size_R2] = 0;
	1526	memcpy( data, other2.data, size_R2); // copy R2 first
	1527	strrev( data ); // reverse R2 on data
	1528	memcpy( data+size_R2, other.data, size-size_R2 ); // copy R1 to end of R2
	1529	//for (i=0; i<size; i++) data[i] = other.data[i];
	1530	des_begin2 = other2.des_begin;
	1531	tot_length2= other2.tot_length;
	1532	}
	1533	else if ( other.data \|\| other2.data ) {
	1534	bomb_error("Not both PE sequences have data");
	1535	}
	1536
	1537	if( other.identifier ){ // only use R1
	1538	int len = strlen( other.identifier );
	1539	identifier = new char[len+1];
	1540	memcpy( identifier, other.identifier, len );
	1541	identifier[len] = 0;
	1542	}
	1543	}
	1544
	1545
1473	1546	Sequence::~Sequence()
1474	1547	{
1475	1548	//printf( "delete: %p\n", this );

1528	1601	}
1529	1602	}
1530	1603	if( size ) data[size] = 0;
	1604	}
	1605	void Sequence::trim(int trim_len) {
	1606	if (trim_len >= size) return;
	1607	size = trim_len;
	1608	if (size) data[size]=0;
1531	1609	}
1532	1610	void Sequence::ConvertBases()
1533	1611	{

1599	1677	}
1600	1678	}
1601	1679
	1680	// by liwz
	1681	// disable swap option
	1682	// change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
	1683	// where des_begin is the FILE pointer of sequence record start
	1684	// tot_length is the total bytes of sequence record
1602	1685	void SequenceDB::Read( const char *file, const Options & options )
1603	1686	{
1604		Sequence one;
1605		Sequence dummy;
1606		Sequence des;
1607		Sequence *last = NULL;
1608		FILE *swap = NULL;
1609		FILE *fin = fopen( file, "rb" );
1610		char *buffer = NULL;
1611		char *res = NULL;
1612		size_t swap_size = 0;
1613		int option_l = options.min_length;
1614		if( fin == NULL ) bomb_error( "Failed to open the database file" );
1615		if( options.store_disk ) swap = OpenTempFile( temp_dir );
1616		Clear();
1617		dummy.swap = swap;
1618		buffer = new char[ MAX_LINE_SIZE+1 ];
1619
1620		while (not feof( fin ) \|\| one.size) { /* do not break when the last sequence is not handled */
1621		buffer[0] = '>';
1622		if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL && one.size == 0) break;
1623		if( buffer[0] == '+' ){
1624		int len = strlen( buffer );
1625		int len2 = len;
1626		while( len2 && buffer[len2-1] != '\n' ){
1627		if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1628		len2 = strlen( buffer );
1629		len += len2;
1630		}
1631		one.des_length2 = len;
1632		dummy.des_length2 = len;
1633		fseek( fin, one.size, SEEK_CUR );
1634		}else if (buffer[0] == '>' \|\| buffer[0] == '@' \|\| (res==NULL && one.size)) {
1635		if ( one.size ) { // write previous record
1636		one.dat_length = dummy.dat_length = one.size;
1637		if( one.identifier == NULL \|\| one.Format() ){
1638		printf( "Warning: from file \"%s\",\n", file );
1639		printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1640		if( one.identifier ) printf( "%s\n", one.identifier );
1641		printf( "%s\n", one.data );
1642		one.size = 0;
1643		}
1644		one.index = dummy.index = sequences.size();
1645		if( one.size > option_l ) {
1646		if ( swap ) {
1647		swap_size += one.size;
1648		// so that size of file < MAX_BIN_SWAP about 2GB
1649		if ( swap_size >= MAX_BIN_SWAP) {
1650		dummy.swap = swap = OpenTempFile( temp_dir );
1651		swap_size = one.size;
1652		}
1653		dummy.size = one.size;
1654		dummy.offset = ftell( swap );
1655		dummy.des_length = one.des_length;
1656		sequences.Append( new Sequence( dummy ) );
1657		one.ConvertBases();
1658		fwrite( one.data, 1, one.size, swap );
1659		}else{
1660		//printf( "==================\n" );
1661		sequences.Append( new Sequence( one ) );
1662		//printf( "------------------\n" );
1663		//if( sequences.size() > 10 ) break;
1664		}
1665		//if( sequences.size() >= 10000 ) break;
1666		}
1667		}
1668		one.size = 0;
1669		one.des_length2 = 0;
1670
1671		int len = strlen( buffer );
1672		int len2 = len;
1673		des.size = 0;
1674		des += buffer;
1675		while( len2 && buffer[len2-1] != '\n' ){
1676		if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1677		des += buffer;
1678		len2 = strlen( buffer );
1679		len += len2;
1680		}
1681		size_t offset = ftell( fin );
1682		one.des_begin = dummy.des_begin = offset - len;
1683		one.des_length = dummy.des_length = len;
1684
1685		int i = 0;
1686		if( des.data[i] == '>' \|\| des.data[i] == '@' \|\| des.data[i] == '+' ) i += 1;
1687		if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1688		if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1689		while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
1690		des.data[i] = 0;
1691		one.identifier = dummy.identifier = des.data;
1692		} else {
1693		one += buffer;
1694		}
1695		}
	1687	Sequence one;
	1688	Sequence des;
	1689	FILE *fin = fopen( file, "rb" );
	1690	char *buffer = NULL;
	1691	char *res = NULL;
	1692	int option_l = options.min_length;
	1693	if( fin == NULL ) bomb_error( "Failed to open the database file" );
	1694	Clear();
	1695	buffer = new char[ MAX_LINE_SIZE+1 ];
	1696
	1697	while (not feof( fin ) \|\| one.size) { /* do not break when the last sequence is not handled */
	1698	buffer[0] = '>';
	1699	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL && one.size == 0) break;
	1700	if( buffer[0] == '+' ){
	1701	int len = strlen( buffer );
	1702	int len2 = len;
	1703	while( len2 && buffer[len2-1] != '\n' ){
	1704	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
	1705	len2 = strlen( buffer );
	1706	len += len2;
	1707	}
	1708	one.tot_length += len;
	1709
	1710	// read next line quality score
	1711	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
	1712	len = strlen( buffer );
	1713	len2 = len;
	1714	while( len2 && buffer[len2-1] != '\n' ){
	1715	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
	1716	len2 = strlen( buffer );
	1717	len += len2;
	1718	}
	1719	one.tot_length += len;
	1720	}else if (buffer[0] == '>' \|\| buffer[0] == '@' \|\| (res==NULL && one.size)) {
	1721	if ( one.size ) { // write previous record
	1722	if( one.identifier == NULL \|\| one.Format() ){
	1723	printf( "Warning: from file \"%s\",\n", file );
	1724	printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
	1725	if( one.identifier ) printf( "%s\n", one.identifier );
	1726	printf( "%s\n", one.data );
	1727	one.size = 0;
	1728	}
	1729	one.index = sequences.size();
	1730	if( one.size > option_l ) {
	1731	if (options.trim_len > 0) one.trim(options.trim_len);
	1732	sequences.Append( new Sequence( one ) );
	1733	}
	1734	}
	1735	one.size = 0;
	1736	one.tot_length = 0;
	1737
	1738	int len = strlen( buffer );
	1739	int len2 = len;
	1740	des.size = 0;
	1741	des += buffer;
	1742	while( len2 && buffer[len2-1] != '\n' ){
	1743	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
	1744	des += buffer;
	1745	len2 = strlen( buffer );
	1746	len += len2;
	1747	}
	1748	size_t offset = ftell( fin );
	1749	one.des_begin = offset - len;
	1750	one.tot_length += len; // count first line
	1751
	1752	int i = 0;
	1753	if( des.data[i] == '>' \|\| des.data[i] == '@' \|\| des.data[i] == '+' ) i += 1;
	1754	if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
	1755	if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
	1756	while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
	1757	des.data[i] = 0;
	1758	one.identifier = des.data;
	1759	} else {
	1760	one.tot_length += strlen(buffer); one += buffer;
	1761	}
	1762	}
1696	1763	#if 0
1697		int i, n = 0;
1698		for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
1699		cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
1700		int i;
1701		scanf( "%i", & i );
	1764	int i, n = 0;
	1765	for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
	1766	cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
	1767	int i;
	1768	scanf( "%i", & i );
1702	1769	#endif
1703		one.identifier = dummy.identifier = NULL;
1704		delete[] buffer;
1705		fclose( fin );
	1770	one.identifier = NULL;
	1771	delete[] buffer;
	1772	fclose( fin );
	1773	}
	1774
	1775	// PE reads liwz, disable swap option
	1776	void SequenceDB::Read( const char file, const char file2, const Options & options )
	1777	{
	1778	Sequence one, two;
	1779	Sequence des;
	1780	FILE *fin = fopen( file, "rb" );
	1781	FILE *fin2= fopen( file2,"rb" );
	1782	char *buffer = NULL;
	1783	char *buffer2= NULL;
	1784	char *res = NULL;
	1785	char *res2= NULL;
	1786	int option_l = options.min_length;
	1787	if( fin == NULL ) bomb_error( "Failed to open the database file" );
	1788	if( fin2== NULL ) bomb_error( "Failed to open the database file" );
	1789	Clear();
	1790	buffer = new char[ MAX_LINE_SIZE+1 ];
	1791	buffer2= new char[ MAX_LINE_SIZE+1 ];
	1792
	1793	while (((not feof( fin )) && (not feof( fin2)) ) \|\| (one.size && two.size)) { /* do not break when the last sequence is not handled */
	1794	buffer[0] = '>'; res =fgets( buffer, MAX_LINE_SIZE, fin );
	1795	buffer2[0]= '>'; res2=fgets( buffer2, MAX_LINE_SIZE, fin2 );
	1796
	1797	if ( (res == NULL) && (res2 != NULL)) bomb_error( "Paired input files have different number sequences" );
	1798	if ( (res != NULL) && (res2 == NULL)) bomb_error( "Paired input files have different number sequences" );
	1799	if ( (one.size == 0 ) && (two.size > 0)) bomb_error( "Paired input files have different number sequences" );
	1800	if ( (one.size > 0 ) && (two.size == 0)) bomb_error( "Paired input files have different number sequences" );
	1801	if ( (res == NULL) && (one.size == 0)) break;
	1802
	1803	if( buffer[0] == '+' ){ // fastq 3rd line
	1804	// file 1
	1805	int len = strlen( buffer );
	1806	int len2 = len;
	1807	while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
	1808	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
	1809	len2 = strlen( buffer );
	1810	len += len2;
	1811	}
	1812	one.tot_length += len;
	1813
	1814	// read next line quality score
	1815	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
	1816	len = strlen( buffer );
	1817	len2 = len;
	1818	while( len2 && buffer[len2-1] != '\n' ){
	1819	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
	1820	len2 = strlen( buffer );
	1821	len += len2;
	1822	}
	1823	one.tot_length += len;
	1824
	1825	// file 2
	1826	len = strlen( buffer2 );
	1827	len2 = len;
	1828	while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
	1829	if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
	1830	len2 = strlen( buffer2 );
	1831	len += len2;
	1832	}
	1833	two.tot_length += len;
	1834
	1835	// read next line quality score
	1836	if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) bomb_error("can not read quality score after");
	1837	len = strlen( buffer2 );
	1838	len2 = len;
	1839	while( len2 && buffer2[len2-1] != '\n' ){
	1840	if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
	1841	len2 = strlen( buffer2 );
	1842	len += len2;
	1843	}
	1844	two.tot_length += len;
	1845
	1846	}else if (buffer[0] == '>' \|\| buffer[0] == '@' \|\| (res==NULL && one.size)) {
	1847	if ( one.size && two.size ) { // write previous record
	1848	if( one.identifier == NULL \|\| one.Format() ){
	1849	printf( "Warning: from file \"%s\",\n", file );
	1850	printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
	1851	if( one.identifier ) printf( "%s\n", one.identifier );
	1852	printf( "%s\n", one.data );
	1853	one.size=0; two.size=0;
	1854	}
	1855	if( two.identifier == NULL \|\| two.Format() ){
	1856	printf( "Warning: from file \"%s\",\n", file2 );
	1857	printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
	1858	if( two.identifier ) printf( "%s\n", two.identifier );
	1859	printf( "%s\n", two.data );
	1860	one.size=0; two.size = 0;
	1861	}
	1862	one.index = sequences.size();
	1863	if( (one.size + two.size)> option_l ) {
	1864	if (options.trim_len > 0) one.trim(options.trim_len);
	1865	if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
	1866	sequences.Append( new Sequence( one, two, 1 ) );
	1867	}
	1868	}
	1869	// R1
	1870	one.size = 0;
	1871	one.tot_length = 0;
	1872
	1873	int len = strlen( buffer );
	1874	int len2 = len;
	1875	des.size = 0;
	1876	des += buffer;
	1877	while( len2 && buffer[len2-1] != '\n' ){
	1878	if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
	1879	des += buffer;
	1880	len2 = strlen( buffer );
	1881	len += len2;
	1882	}
	1883	size_t offset = ftell( fin );
	1884	one.des_begin = offset - len; // offset of ">" or "@"
	1885	one.tot_length += len; // count first line
	1886
	1887	int i = 0;
	1888	if( des.data[i] == '>' \|\| des.data[i] == '@' \|\| des.data[i] == '+' ) i += 1;
	1889	if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
	1890	if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
	1891	while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
	1892	des.data[i] = 0; // find first non-space letter
	1893	one.identifier = des.data;
	1894
	1895	// R2
	1896	two.size = 0;
	1897	two.tot_length = 0;
	1898
	1899	len = strlen( buffer2 );
	1900	len2 = len;
	1901	while( len2 && buffer2[len2-1] != '\n' ){
	1902	if ( (res=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
	1903	len2 = strlen( buffer2 );
	1904	len += len2;
	1905	}
	1906	offset = ftell( fin2 );
	1907	two.des_begin = offset - len;
	1908	two.tot_length += len; // count first line
	1909	two.identifier = des.data;
	1910	} else {
	1911	one.tot_length += strlen(buffer); one += buffer;
	1912	two.tot_length+= strlen(buffer2); two+= buffer2;
	1913	}
	1914	}
	1915	#if 0
	1916	int i, n = 0;
	1917	for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
	1918	cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
	1919	int i;
	1920	scanf( "%i", & i );
	1921	#endif
	1922	one.identifier = NULL;
	1923	two.identifier = NULL;
	1924	delete[] buffer;
	1925	fclose( fin );
	1926	delete[] buffer2;
	1927	fclose( fin2 );
1706	1928	}
1707	1929
1708	1930	#if 0

1826	2048	n = sequences.size();
1827	2049	for (i=0; i<n; i++){
1828	2050	Sequence *seq = sequences[i];
1829		int qs = seq->des_length2 ? seq->des_length2 + seq->dat_length : 0;
1830	2051	fseek( fin, seq->des_begin, SEEK_SET );
1831	2052
1832	2053	seg_size += seq->size;

1838	2059	seg_size = seq->size;
1839	2060	}
1840	2061
1841		count = (seq->des_length + seq->dat_length + qs) / MAX_LINE_SIZE;
1842		rest = (seq->des_length + seq->dat_length + qs) % MAX_LINE_SIZE;
	2062	count = seq->tot_length / MAX_LINE_SIZE;
	2063	rest = seq->tot_length % MAX_LINE_SIZE;
1843	2064	//printf( "count = %6i, rest = %6i\n", count, rest );
1844	2065	for (j=0; j<count; j++){
1845	2066	if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );

1867	2088	std::sort( sorting.begin(), sorting.end() );
1868	2089	for (i=0; i<n; i++){
1869	2090	Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
1870		int qs = seq->des_length2 ? seq->des_length2 + seq->dat_length : 0;
1871	2091	fseek( fin, seq->des_begin, SEEK_SET );
1872	2092
1873		count = (seq->des_length + seq->dat_length + qs) / MAX_LINE_SIZE;
1874		rest = (seq->des_length + seq->dat_length + qs) % MAX_LINE_SIZE;
	2093	count = seq->tot_length / MAX_LINE_SIZE;
	2094	rest = seq->tot_length % MAX_LINE_SIZE;
1875	2095	//printf( "count = %6i, rest = %6i\n", count, rest );
1876	2096	for (j=0; j<count; j++){
1877	2097	if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );

1886	2106	fclose( fout );
1887	2107	delete []buf;
1888	2108	}
	2109	// liwz PE output
	2110	void SequenceDB::WriteClusters( const char db, const char db_pe, const char newdb, const char newdb_pe, const Options & options )
	2111	{
	2112	FILE *fin = fopen( db, "rb" );
	2113	FILE *fout = fopen( newdb, "w+" );
	2114	FILE *fin_pe = fopen( db_pe, "rb" );
	2115	FILE *fout_pe = fopen( newdb_pe, "w+" );
	2116	int i, j, n = rep_seqs.size();
	2117	int count, rest;
	2118	char *buf = new char[MAX_LINE_SIZE+1];
	2119	vector<uint64_t> sorting( n );
	2120	if( fin == NULL \|\| fout == NULL ) bomb_error( "file opening failed" );
	2121	if( fin_pe == NULL \|\| fout_pe == NULL ) bomb_error( "file opening failed" );
	2122	for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) \| rep_seqs[i];
	2123	std::sort( sorting.begin(), sorting.end() );
	2124
	2125	//sort fasta / fastq
	2126	int *clstr_size;
	2127	int *clstr_idx1;
	2128	if (options.sort_outputf) {
	2129	clstr_size = new int[n];
	2130	clstr_idx1 = new int[n];
	2131	for (i=0; i<n; i++) {
	2132	clstr_size[i] = 0;
	2133	clstr_idx1[i] = i;
	2134	}
	2135
	2136	int N = sequences.size();
	2137	for (i=0; i<N; i++) {
	2138	int id = sequences[i]->cluster_id;
	2139	if (id < 0) continue;
	2140	if (id >=n) continue;
	2141	clstr_size[id]++;
	2142	}
	2143	quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
	2144	}
	2145
	2146	for (i=0; i<n; i++){
	2147	Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
	2148	if (options.sort_outputf) seq = sequences[ rep_seqs[ clstr_idx1[i] ] ];
	2149	//R1
	2150	fseek( fin, seq->des_begin, SEEK_SET );
	2151
	2152	count = seq->tot_length / MAX_LINE_SIZE;
	2153	rest = seq->tot_length % MAX_LINE_SIZE;
	2154	//printf( "count = %6i, rest = %6i\n", count, rest );
	2155	for (j=0; j<count; j++){
	2156	if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
	2157	fwrite( buf, 1, MAX_LINE_SIZE, fout );
	2158	}
	2159	if( rest ){
	2160	if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
	2161	fwrite( buf, 1, rest, fout );
	2162	}
	2163
	2164	//R2
	2165	fseek( fin_pe, seq->des_begin2, SEEK_SET );
	2166
	2167	count = seq->tot_length2 / MAX_LINE_SIZE;
	2168	rest = seq->tot_length2 % MAX_LINE_SIZE;
	2169	//printf( "count = %6i, rest = %6i\n", count, rest );
	2170	for (j=0; j<count; j++){
	2171	if( fread( buf, 1, MAX_LINE_SIZE, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
	2172	fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
	2173	}
	2174	if( rest ){
	2175	if( fread( buf, 1, rest, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
	2176	fwrite( buf, 1, rest, fout_pe );
	2177	}
	2178
	2179	}
	2180	fclose( fin );
	2181	fclose( fout );
	2182	fclose( fin_pe );
	2183	fclose( fout_pe );
	2184	delete []buf;
	2185	}
	2186
1889	2187	void SequenceDB::WriteExtra1D( const Options & options )
1890	2188	{
1891	2189	string db_clstr = options.output + ".clstr";
1892	2190	string db_clstr_bak = options.output + ".bak.clstr";
1893		int i, k, N = sequences.size();
	2191	int i, i0, k, N = sequences.size();
1894	2192	vector<long long> sorting( N );
1895	2193	for (i=0; i<N; i++) sorting[i] = ((long long)sequences[i]->index << 32) \| i;
1896	2194	std::sort( sorting.begin(), sorting.end() );

1917	2215	}
1918	2216
1919	2217	fout = fopen( db_clstr.c_str(), "w+" );
1920		for (i=0; i<M; i++) {
	2218
	2219	if (options.sort_output) {
	2220	int *clstr_size = new int[M];
	2221	int *clstr_idx1 = new int[M];
	2222
	2223	for (i=0; i<M; i++) {
	2224	clstr_size[i] = (int)clusters[i].size();
	2225	clstr_idx1[i] = i;
	2226	}
	2227	quick_sort_idxr(clstr_size, clstr_idx1, 0, M-1);
	2228
	2229	for (i=0; i<M; i++) {
	2230	i0 = clstr_idx1[i];
	2231	fprintf( fout, ">Cluster %i\n", i );
	2232	for (k=0; k<(int)clusters[i0].size(); k++)
	2233	sequences[ clusters[i0][k] ]->PrintInfo( k, fout, options, buf );
	2234	}
	2235	}
	2236	else {
	2237	for (i=0; i<M; i++) {
1921	2238	fprintf( fout, ">Cluster %i\n", i );
1922	2239	for (k=0; k<(int)clusters[i].size(); k++)
1923	2240	sequences[ clusters[i][k] ]->PrintInfo( k, fout, options, buf );
1924		}
	2241	}
	2242
	2243	}
	2244
1925	2245	delete []buf;
1926	2246	}
1927	2247	void SequenceDB::WriteExtra2D( SequenceDB & other, const Options & options )

3283	3603	}
3284	3604	} // make_comp_short_word_index
3285	3605
3286
	3606	//quick_sort_idx calling (a, idx, 0, no-1)
	3607	//sort a with another array idx
	3608	//so that idx rearranged
	3609	int quick_sort_idx (int a, int idx, int lo0, int hi0 ) {
	3610	int lo = lo0;
	3611	int hi = hi0;
	3612	int mid;
	3613	int tmp;
	3614
	3615	if ( hi0 > lo0) {
	3616	mid = a[ ( lo0 + hi0 ) / 2 ];
	3617
	3618	while( lo <= hi ) {
	3619	while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++;
	3620	while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--;
	3621	if( lo <= hi ) {
	3622	tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
	3623	tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
	3624	lo++; hi--;
	3625	}
	3626	} // while
	3627
	3628	if( lo0 < hi ) quick_sort_idx(a, idx, lo0, hi );
	3629	if( lo < hi0 ) quick_sort_idx(a, idx, lo, hi0 );
	3630	} // if ( hi0 > lo0)
	3631	return 0;
	3632	} // quick_sort_idx
	3633
	3634
	3635	//decreasing can not use reverse of quick_sort_idx due to tie
	3636	//quick_sort_idxr calling (a, idx, 0, no-1)
	3637	//sort a with another array idx
	3638	//so that idx rearranged
	3639	int quick_sort_idxr (int a, int idx, int lo0, int hi0 ) {
	3640	int lo = lo0;
	3641	int hi = hi0;
	3642	int mid;
	3643	int tmp;
	3644
	3645	if ( hi0 > lo0) {
	3646	mid = a[ ( lo0 + hi0 ) / 2 ];
	3647
	3648	while( lo <= hi ) {
	3649	while( ( lo < hi0 ) && ( a[lo] > mid ) ) lo++;
	3650	while( ( hi > lo0 ) && ( a[hi] < mid ) ) hi--;
	3651	if( lo <= hi ) {
	3652	tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
	3653	tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
	3654	lo++; hi--;
	3655	}
	3656	} // while
	3657
	3658	if( lo0 < hi ) quick_sort_idxr(a, idx, lo0, hi );
	3659	if( lo < hi0 ) quick_sort_idxr(a, idx, lo, hi0 );
	3660	} // if ( hi0 > lo0)
	3661	return 0;
	3662	} // quick_sort_idxr
3287	3663
3288	3664	/////////////////////////// END ALL ////////////////////////
3289	3665

+30

-8

cdhit-common.h less more

38	38	#include<vector>
39	39	#include<map>
40	40
41		#define CDHIT_VERSION "4.6"
	41	#define CDHIT_VERSION "4.7"
42	42
43	43	#ifndef MAX_SEQ
44	44	#define MAX_SEQ 655360

279	279	int frag_size;
280	280	int option_r;
281	281	int threads;
	282	int PE_mode; // -P
	283	int trim_len; // -cx
	284	int trim_len_R2; // -cy
	285	int align_pos; // -ap for alignment position
282	286
283	287	size_t max_entries;
284	288	size_t max_sequences;

292	296	bool backupFile;
293	297
294	298	string input;
	299	string input_pe;
295	300	string input2;
	301	string input2_pe;
296	302	string output;
	303	string output_pe;
	304
	305	int sort_output; // -sc
	306	int sort_outputf; // -sf
297	307
298	308	Options(){
299	309	backupFile = false;

331	341	frag_size = 0;
332	342	des_len = 20;
333	343	threads = 1;
	344	PE_mode = 0;
	345	trim_len = 0;
	346	trim_len_R2 = 0;
	347	align_pos = 0;
	348	sort_output = 0;
	349	sort_outputf = 0;
334	350	max_entries = 0;
335	351	max_sequences = 1<<20;
336	352	mem_limit = 100000000;

357	373	// length of the sequence:
358	374	int size;
359	375	int bufsize;
	376	int size_R2; // size = size.R1 + size.R2 for back-to-back merged seq
360	377
361	378	//uint32_t stats;
362	379

368	385	int offset;
369	386
370	387	// stream offset of the description string in the database:
371		size_t des_begin;
372		// length of the description:
373		int des_length;
374		// length of the description in quality score part:
375		int des_length2;
376		// length of data in fasta file, including line wrapping:
377		int dat_length;
	388	size_t des_begin, des_begin2;
	389	// total record length
	390	int tot_length, tot_length2;
378	391
379	392	char *identifier;
380	393

388	401
389	402	Sequence();
390	403	Sequence( const Sequence & other );
	404	Sequence( const Sequence & other, const Sequence & other2, int mode );
391	405	~Sequence();
392	406
393	407	void Clear();

402	416	int Format();
403	417
404	418	void ConvertBases();
	419	void trim(int trim_len);
405	420
406	421	void SwapIn();
407	422	void SwapOut();

543	558	~SequenceDB(){ Clear(); }
544	559
545	560	void Read( const char *file, const Options & options );
	561	void Read( const char *file, const char *file2, const Options & options );
546	562	void WriteClusters( const char *db, const char *newdb, const Options & options );
	563	void WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
547	564	void WriteExtra1D( const Options & options );
548	565	void WriteExtra2D( SequenceDB & other, const Options & options );
549	566	void DivideSave( const char *db, const char *newdb, int n, const Options & options );

589	606	int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
590	607	int band_left, int band_center, int band_right, WorkingBuffer & buffer);
591	608
	609	void strrev(char *p);
592	610	int print_usage_2d (char *arg);
593	611	int print_usage_est (char *arg);
594	612	int print_usage_div (char *arg);

605	623	int calc_ann_list(int len, char *seqi, int NAA, int& aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, bool est=false);
606	624
607	625	float current_time();
	626
	627	//some functions from very old cd-hit
	628	int quick_sort_idx(int *a, int *idx, int lo0, int hi0 );
	629	int quick_sort_idxr(int *a, int *idx, int lo0, int hi0 );

+15

-2

cdhit-est-2d.c++ less more

48	48	string db_in;
49	49	string db_in2;
50	50	string db_out;
	51	string db_in_pe;
	52	string db_in2_pe;
	53	string db_out_pe;
	54
51	55
52	56	options.cluster_thd = 0.95;
53	57	options.NAA = 10;

66	70	options.Validate();
67	71
68	72	db_in = options.input;
	73	db_in_pe = options.input_pe;
69	74	db_in2 = options.input2;
	75	db_in2_pe = options.input2_pe;
70	76	db_out = options.output;
	77	db_out_pe = options.output_pe;
	78
71	79
72	80	InitNAA( MAX_UAA );
73	81	options.NAAN = NAAN_array[options.NAA];

79	87	make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
80	88	}
81	89
82		seq_db.Read( db_in.c_str(), options );
	90	if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
	91	else {seq_db.Read( db_in.c_str(), options );}
83	92	cout << "total seq in db1: " << seq_db.sequences.size() << endl;
84	93
85		seq_db2.Read( db_in2.c_str(), options );
	94	if ( options.PE_mode ) { seq_db2.Read( db_in2.c_str(), db_in2_pe.c_str(), options );}
	95	else { seq_db2.Read( db_in2.c_str(), options );}
86	96	cout << "total seq in db2: " << seq_db2.sequences.size() << endl;
87	97
88	98	seq_db.SortDivide( options );

92	102	cout << "writing non-redundant sequences from db2" << endl;
93	103	seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options );
94	104
	105	if ( options.PE_mode ) { seq_db2.WriteClusters( db_in2.c_str(), db_in2_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
	106	else { seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options ); }
	107
95	108	seq_db2.WriteExtra2D( seq_db, options );
96	109	cout << "program completed !" << endl << endl;
97	110	end_time = current_time();

+11

-4

cdhit-est.c++ less more

42	42	{
43	43	string db_in;
44	44	string db_out;
	45	string db_in_pe;
	46	string db_out_pe;
45	47
46	48	options.cluster_thd = 0.95;
47	49	options.NAA = 10;

59	61	if (options.SetOptions( argc, argv, false, true ) == 0) print_usage_est(argv[0]);
60	62	options.Validate();
61	63
62		db_in = options.input;
63		db_out = options.output;
	64	db_in = options.input;
	65	db_in_pe = options.input_pe;
	66	db_out = options.output;
	67	db_out_pe = options.output_pe;
64	68
65	69	InitNAA( MAX_UAA );
66	70	seq_db.NAAN = NAAN_array[options.NAA];

70	74	make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
71	75	}
72	76
73		seq_db.Read( db_in.c_str(), options );
	77	if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
	78	else {seq_db.Read( db_in.c_str(), options );}
	79
74	80	cout << "total seq: " << seq_db.sequences.size() << endl;
75	81	seq_db.SortDivide( options );
76	82	seq_db.DoClustering( options );
77	83
78	84	printf( "writing new database\n" );
79		seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options );
	85	if ( options.PE_mode ) { seq_db.WriteClusters( db_in.c_str(), db_in_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
	86	else { seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options ); }
80	87
81	88	// write a backup clstr file in case next step crashes
82	89	seq_db.WriteExtra1D( options );

+55

-12

cdhit-utility.c++ less more

6	6
7	7	// information
8	8	char cd_hit_ver[] = "\t\t====== CD-HIT version " CDHIT_VERSION " (built on " __DATE__ ") ======";
9		char cd_hit_ref1[] = "\"Clustering of highly homologous sequences to reduce thesize of large protein database\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik. Bioinformatics, (2001) 17:282-283";
10		char cd_hit_ref2[] = "\"Tolerating some redundancy significantly speeds up clustering of large protein databases\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik. Bioinformatics, (2002) 18:77-82";
11		char cd_hit_ref3[] = "\"Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences\", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659";
12		char cd_hit_ref4[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artificial and natural duplicates in pyrosequencing reads of metagenomic data. BMC Bioinformatics (2010) 11:187";
	9	char cd_hit_ref1[] = "\"CD-HIT: a fast program for clustering and comparing large sets of protein or nucleotide sequences\", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659";
	10	char cd_hit_ref2[] = "\"CD-HIT: accelerated for clustering the next generation sequencing data\", Limin Fu, Beifang Niu, Zhengwei Zhu, Sitao Wu & Weizhong Li. Bioinformatics, (2012) 28:3150-3152";
	11	char cd_hit_ref3[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artificial and natural duplicates in pyrosequencing reads of metagenomic data. BMC Bioinformatics (2010) 11:187";
13	12	//
14	13
15	14	char contacts[] =

19	18	" If you find cd-hit useful, please kindly cite:\n\n";
20	19
21	20	char txt_option_i[] = "\tinput filename in fasta format, required\n";
	21	char txt_option_j[] =
	22	"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
	23	\t -i R1.fq -j R2.fq -o output_R1 -op output_R2 or\n \
	24	\t -i R1.fa -j R2.fa -o output_R1 -op output_R2 \n";
22	25	char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required\n";
23	26	char txt_option_i2[] = "\tinput filename for db2 in fasta format, required\n";
	27	char txt_option_j2[] =
	28	"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
	29	\t -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or\n \
	30	\t -i db1-R1.fa -j db1-R2.fa -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 \n";
24	31	char txt_option_o[] = "\toutput filename, required\n";
	32	char txt_option_op[] = "\toutput filename for R2 reads if input are paired end (PE) files\n";
25	33	char txt_option_c[] =
26	34	"\tsequence identity threshold, default 0.9\n \
27	35	\tthis is the default cd-hit's \"global sequence identity\" calculated as:\n \

87	95	char txt_option_B[] =
88	96	"\t1 or 0, default 0, by default, sequences are stored in RAM\n \
89	97	\tif set to 1, sequence are stored on hard drive\n \
90		\tit is recommended to use -B 1 for huge databases\n";
	98	\t!! No longer supported !!\n";
	99	char txt_option_P[] =
	100	"\tinput paired end (PE) reads, default 0, single file\n \
	101	\tif set to 1, please use -i R1 -j R2 to input both PE files\n";
	102	char txt_option_cx[] =
	103	"\tlength to keep after trimming the tail of sequence, default 0, not trimming\n \
	104	\tif set to 50, the program only uses the first 50 letters of input sequence\n";
	105	char txt_option_cy[] =
	106	"\tlength to keep after trimming the tail of R2 sequence, default 0, not trimming\n \
	107	\tif set to 50, the program only uses the first 50 letters of input R2 sequence\n \
	108	\te.g. -cx 100 -cy 80 for paired end reads\n";
	109	char txt_option_ap[] =
	110	"\talignment position constrains, default 0, no constrain\n \
	111	\tif set to 1, the program will force sequences to align at beginings\n \
	112	\twhen set to 1, the program only does +/+ alignment\n";
91	113	char txt_option_uL[] =
92	114	"\tmaximum unmatched percentage for the longer sequence, default 1.0\n \
93	115	\tif set to 0.1, the unmatched region (excluding leading and tailing gaps)\n \

107	129	\tif set to 0, only +/+ strand alignment\n";
108	130	char txt_option_bak[] =
109	131	"\twrite backup cluster file (1 or 0, default 0)\n";
	132	char txt_option_sc[] =
	133	"\tsort clusters by size (number of sequences), default 0, output clusters by decreasing length\n \
	134	\tif set to 1, output clusters by decreasing size\n";
	135	char txt_option_sf[] =
	136	"\tsort fasta/fastq by cluster size (number of sequences), default 0, no sorting\n \
	137	\tif set to 1, output sequences by decreasing cluster size\n";
110	138
111	139	char txt_option_mask[] = "\tmasking letters (e.g. -mask NX, to mask out both 'N' and 'X')\n";
112	140	char txt_option_match[] = "\tmatching score, default 2 (1 for T-U and N-N)\n";

144	172	cout << " -B" << txt_option_B;
145	173	cout << " -p" << txt_option_p;
146	174	cout << " -g" << txt_option_g;
	175	cout << " -sc"<< txt_option_sc;
	176	cout << " -sf"<< txt_option_sf;
147	177	cout << " -bak" << txt_option_bak;
148	178	cout << " -h\tprint this help\n\n";
149	179	cout << contacts;

189	219	cout << " Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
190	220	cout << " If you find cd-hit useful, please kindly cite:\n\n";
191	221	cout << " " << cd_hit_ref1 << "\n";
192		cout << " " << cd_hit_ref3 << "\n\n\n";
	222	cout << " " << cd_hit_ref2 << "\n\n\n";
193	223	exit(1);
194	224	} // END print_usage_2d
195	225

198	228	cout << cd_hit_ver << "\n\n" ;
199	229	cout << "Usage: "<< arg << " [Options] \n\nOptions\n\n";
200	230	cout << " -i" << txt_option_i;
201		cout << " -o" << txt_option_o;
	231	cout << " -j" << txt_option_j;
	232	cout << " -o" << txt_option_o;
	233	cout << " -op" << txt_option_op;
202	234	cout << " -c" << txt_option_c;
203	235	cout << " -G" << txt_option_G;
204	236	cout << " -b" << txt_option_b;

218	250	cout << " -uS" << txt_option_uS;
219	251	cout << " -U" << txt_option_U;
220	252	cout << " -B" << txt_option_B;
	253	cout << " -P" << txt_option_P;
	254	cout << " -cx"<< txt_option_cx;
	255	cout << " -cy"<< txt_option_cy;
	256	cout << " -ap"<< txt_option_ap;
221	257	cout << " -p" << txt_option_p;
222	258	cout << " -g" << txt_option_g;
223	259	cout << " -r" << txt_option_r;

227	263	cout << " -gap" << txt_option_gap;
228	264	cout << " -gap-ext" << txt_option_gap_ext;
229	265	cout << " -bak" << txt_option_bak;
	266	cout << " -sc"<< txt_option_sc;
	267	cout << " -sf"<< txt_option_sf;
230	268	cout << " -h\tprint this help\n\n";
231	269	cout << contacts;
232	270	cout << " " << cd_hit_ref1 << "\n";
233		cout << " " << cd_hit_ref3 << "\n\n\n";
	271	cout << " " << cd_hit_ref2 << "\n\n\n";
234	272	exit(1);
235	273	} // END print_usage_est
236	274

240	278	cout << "Usage: "<< arg << " [Options] \n\nOptions\n\n";
241	279	cout << " -i" << txt_option_i_2d;
242	280	cout << " -i2"<< txt_option_i2;
243		cout << " -o" << txt_option_o;
	281	cout << " -j, -j2"<< txt_option_j2;
	282	cout << " -o" << txt_option_o;
	283	cout << " -op" << txt_option_op;
244	284	cout << " -c" << txt_option_c;
245	285	cout << " -G" << txt_option_G;
246	286	cout << " -b" << txt_option_b;

262	302	cout << " -uS" << txt_option_uS;
263	303	cout << " -U" << txt_option_U;
264	304	cout << " -B" << txt_option_B;
	305	cout << " -P" << txt_option_P;
	306	cout << " -cx"<< txt_option_cx;
	307	cout << " -cy"<< txt_option_cy;
265	308	cout << " -p" << txt_option_p;
266	309	cout << " -g" << txt_option_g;
267	310	cout << " -r" << txt_option_r;

274	317	cout << " -h\tprint this help\n\n";
275	318	cout << contacts;
276	319	cout << " " << cd_hit_ref1 << "\n";
277		cout << " " << cd_hit_ref3 << "\n\n\n";
	320	cout << " " << cd_hit_ref2 << "\n\n\n";
278	321	exit(1);
279	322	} // END print_usage_est_2d
280	323

325	368	cout << " Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
326	369	cout << " If you find cd-hit useful, please kindly cite:\n\n";
327	370	cout << " " << cd_hit_ref1 << "\n";
328		cout << " " << cd_hit_ref3 << "\n";
329		cout << " " << cd_hit_ref4 << "\n\n\n";
	371	cout << " " << cd_hit_ref2 << "\n";
	372	cout << " " << cd_hit_ref3 << "\n\n\n";
330	373	exit(1);
331	374	}
332	375

+89

-0

clstr_list.pl less more

	0	#!/usr/bin/perl
	1
	2	use Storable;
	3	use strict;
	4	#my $sort_by_what = shift;
	5	# $sort_by_what = "no" unless $sort_by_what;
	6
	7	my $clstr_file = shift;
	8	my $store_file = shift;
	9
	10	my %clstr = (); # an array of hashes for all the cluster
	11	my $rep_len = 0;
	12	my $rep_acc = "";
	13	my @cur_sequences = (); # array of hashes for all sequences in a cluster
	14	my $ll = "";
	15	my @record = ();
	16
	17	open(TMP, $clstr_file) \|\| die;
	18	while($ll = <TMP>) { # read .clstr files
	19	if ($ll =~ /^>/) { # the begin of a cluster
	20	if (scalar(@cur_sequences)) { # not the first cluster, therefore collect the information of last clstr
	21	#@cur_sequences = sort {$$b{"seq_len"} <=> $$a{"seq_len"}} @cur_sequences;
	22	@cur_sequences = sort {$$b[1] <=> $$a[1]} @cur_sequences;
	23	@record = ($rep_acc, $rep_len, 1, [@cur_sequences], "");
	24	$clstr{$rep_acc} = [@record];
	25	}
	26	@cur_sequences=();
	27	}
	28	else { # the sequence line
	29	chop($ll);
	30	if ($ll =~ /^(\d+)\s+(\d+)(aa\|nt),\s+>(.+)\.\.\./) {
	31	@record = ($4, $2, 0, [], "");
	32	if ($ll =~ /\*$/) { # representative sequence or not
	33	$rep_acc = $record[0];
	34	$rep_len = $record[1];
	35	$record[4] = "100%";
	36	}
	37	# elsif ($ll =~ / at (\d.+)$/ ) {
	38	elsif ($ll =~ / at (.+\d.+)$/ ) {# because cd-hit-est have strand info
	39	$record[4] = $1;
	40	}
	41	}
	42	push(@cur_sequences, [@record]);
	43	}
	44	}
	45	if (scalar(@cur_sequences)) {
	46	#@cur_sequences = sort {$$b{"seq_len"} <=> $$a{"seq_len"}} @cur_sequences;
	47	@cur_sequences = sort {$$b[1] <=> $$a[1]} @cur_sequences;
	48	@record = ($rep_acc, $rep_len, 1, [@cur_sequences], "");
	49	$clstr{$rep_acc} = [@record];
	50	}
	51	close(TMP);
	52
	53	if (-e $store_file){ # already have a cluster file
	54	my %old_clstr = %{retrieve($store_file)};
	55	foreach my $rep_acc (keys %clstr){
	56	my $seqs = $clstr{$rep_acc}[3]; # $seqs a reference to the sequences;
	57	my $tmp_size = scalar(@{$seqs}); # how many sequences in a top level cluster, each sequence should be a representative sequence for lower level cluster
	58	#print "$rep_acc, $tmp_size\n";
	59	my $i;
	60	for $i (0..($tmp_size-1)){
	61	my $seq = $$seqs[$i];
	62	if ($old_clstr{$$seq[0]}){
	63	$clstr{$rep_acc}[3][$i][3] = [@{$old_clstr{$$seq[0]}[3]}];
	64	$clstr{$rep_acc}[3][$i][2] = 1;
	65	}
	66	}
	67	}
	68	}
	69
	70	store \%clstr, $store_file;
	71
	72	#~ my $size = scalar(keys %clstr);
	73	#~ print "$size\n";
	74
	75	#~ my $acc = 'D8F4YGO02FSTQP|range|2:370|frame|2|len|123';
	76
	77	#~ my $temp = $clstr{$acc}[1];
	78	#~ print "$temp\n";
	79
	80	#~ my $temp = scalar(@{$clstr{$acc}[3]});
	81	#~ print "$temp\n";
	82
	83	#~ my $x;
	84	#~ for $x (@{$clstr{$acc}[3]} ){
	85	#~ my $tmp_1 = scalar(@{$x->[3]});
	86	#~ print "$x->[2], $x->[4], $x->[0], $x->[1], $tmp_1\n";
	87	#~ }
	88

+51

-0

clstr_list_sort.pl less more

	0	#!/usr/bin/perl
	1
	2	use Storable;
	3	use strict;
	4
	5	my $input_file = shift;
	6	my $output_file = shift;
	7	my $sort_by_what = shift;
	8	$sort_by_what = "no" unless $sort_by_what;
	9
	10	my @clstr = values %{retrieve($input_file)};
	11
	12
	13	if ($sort_by_what eq "no") {
	14
	15	### Added by liwz sort by No. sequences instead of No. nodes
	16	my %rep2size = ();
	17	my $clstr_no = scalar(@clstr);
	18	my ($i);
	19
	20
	21	for ($i=0; $i<$clstr_no; $i++){
	22	my $node_size = 0;
	23	foreach my $seq1 (@{$clstr[$i][3]}) {
	24	if ($$seq1[2]) { # can be futher expanded
	25	foreach my $seq2(@{$$seq1[3]}) {
	26	if ($$seq2[2]) { $node_size += scalar(@{$$seq2[3]}); }
	27	else { $node_size++; }
	28	}
	29	}
	30	else {
	31	$node_size++;
	32	}
	33	}
	34	$rep2size{ $clstr[$i][0] } = $node_size;
	35	}
	36	### END
	37
	38	#@clstr = sort {scalar(@{$b->[3]}) <=> scalar(@{$a->[3]})} @clstr;
	39	@clstr = sort {$rep2size{$b->[0]} <=> $rep2size{$a->[0]}} @clstr;
	40	}
	41	elsif ($sort_by_what eq "len") {
	42	@clstr = sort {$b->[1] <=> $a->[1]} @clstr;
	43	}
	44	elsif ($sort_by_what eq "des") {
	45	@clstr = sort {$a->[0] cmp $b->[0]} @clstr;
	46	}
	47
	48	store \@clstr, $output_file;
	49
	50

doc/cd-hit-otu-miseq-Figure-1.png less more

Binary diff not shown

doc/cdhit-user-guide.pdf less more

Binary diff not shown

+355

-83

doc/cdhit-user-guide.wiki less more

3	3
4	4	[[http://cd-hit.org]]
5	5
6		Program developed by Weizhong Li's lab at UCSD [[http://weizhong-lab.ucsd.edu]] and JCVI [[http://jcvi.org]] [[liwz@sdsc.edu]]
	6	Program developed by Weizhong Li's lab at UCSD [[http://weizhongli-lab.org]] and JCVI [[http://jcvi.org]] [[liwz@sdsc.edu]]
7	7
8	8	===== Introduction =====
9	9

61	61
62	62	Reduced alphabet (to be implemented): This is for protein clustering. In reduced alphabet, a group of exchangeable residues are reduced to a single residue (I/V/L==>I, S/T==>S, D/E==>D, K/R==>K, F/Y==>F), and then conservative mutations would appear as identities in sequence alignments. It improves the short word filter for clustering at low sequence identity below 50%.
63	63
64		Gapped word (to be implemented): Short word filter using gapped word allows mismatch within a word such as “ACE” vs “AME”, “ACFE” vs “AMYE”, and “AACTT” vs “AAGTT”, which can be written as “101”, “1001” and “11011”. At low identity cutoff, a gapped word is more efficient than an ungapped word for filtering.
	64	Gapped word (to be implemented): Short word filter using gapped word allows mismatch within a word such as “ACE” vs “AME”, “ACFE” vs “AMYE”, and “AACTT” vs “AAGTT”, which can be written as “101”, “1001” and “11011”. At low identity cutoff, a gapped word is more efficient than an ungapped word for filtering.
65	65
66	66
67	67

94	94	It can be copied under the GNU General Public License version 2 (GPLv2).
95	95
96	96	Most CD-HIT programs were written in C++. Installing CD-HIT package is very simple:
97		* download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.2-2015-0511.tar.gz
98		* unpack the file with " tar xvf cd-hit-v4.6.2-2015-0511.tar.gz --gunzip"
99		* change dir by "cd cd-hit-v4.6.2-2015-0511"
	97	* download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.6-2016-0711.tar.gz
	98	* unpack the file with " tar xvf cd-hit-v4.6.6-2016-0711.tar.gz --gunzip"
	99	* change dir by "cd cd-hit-v4.6.6-2016-0711"
100	100	* compile the programs by "make" with multi-threading (default), or by "make openmp=no" without multi-threading (on old systems without OpenMP)
101	101	* cd cd-hit-auxtools
102	102	* compile cd-hit-auxtools by "make"

106	106	CD-HIT clusters proteins into clusters that meet a user-defined similarity threshold, usually a sequence identity. Each cluster has one representative sequence. The input is a protein dataset in fasta format and the output are two files: a fasta file of representative sequences and a text file of list of clusters.
107	107
108	108	Basic command:
109		cd-hit -i nr -o nr100 -c 1.00 -n 5 -M 16000 –d 0 -T 8
110		cd-hit -i db -o db90 -c 0.9 -n 5 -M 16000 –d 0 -T 8,
	109	cd-hit -i nr -o nr100 -c 1.00 -n 5 -M 16000 -d 0 -T 8
	110	cd-hit -i db -o db90 -c 0.9 -n 5 -M 16000 -d 0 -T 8,
111	111
112	112	where\\
113	113	''db'' is the filename of input, \\

181	181	must not be more than 10 bases
182	182	-B 1 or 0, default 0, by default, sequences are stored in RAM
183	183	if set to 1, sequence are stored on hard drive
184		it is recommended to use -B 1 for huge databases
	184	!! No longer supported !!
185	185	-p 1 or 0, default 0
186	186	if set to 1, print alignment overlap in .clstr file
187	187	-g 1 or 0, default 0

190	190	will cluster it into the most similar cluster that meet the threshold
191	191	(accurate but slow mode)
192	192	but either 1 or 0 won't change the representatives of final clusters
	193	-sc sort clusters by size (number of sequences), default 0, output clusters by decreasing length
	194	if set to 1, output clusters by decreasing size
	195	-sf sort fasta/fastq by cluster size (number of sequences), default 0, no sorting
	196	if set to 1, output sequences by decreasing cluster size
193	197	-bak write backup cluster file (1 or 0, default 0)
194	198	-h print this help
195	199

199	203
200	204	See the figure below, the -aL, -AL, -aS and -AS options can be used to specify the alignment coverage on both the representative sequence and other sequences. -s and -S can control the length difference between the representative sequence and other sequences.
201	205
202		{{ :Figure2.png }}
	206	{{ :cd-hit-figure2.png }}
	207
203	208
204	209	''
205	210	aL = R<sub>a</sub> / R\\

263	268	-n 2 for thresholds 0.4 ~ 0.5
264	269	</code>
265	270
266		More options:
267
268		Options, -b, -M, -l, -d, -t, -s, -S, -B, -p, -aL, -AL, -aS, -AS, -g, -G, -T
269		are same to CD-HIT, here are few more cd-hit-2d specific options:
270		<code>
271		-i2 input filename for db2 in fasta format, required
272		-s2 length difference cutoff for db1, default 1.0
273		by default, seqs in db1 >= seqs in db2 in a same cluster
274		if set to 0.9, seqs in db1 may just >= 90% seqs in db2
275		-S2 length difference cutoff, default 0
276		by default, seqs in db1 >= seqs in db2 in a same cluster
277		if set to 60, seqs in db2 may 60aa longer than seqs in db1
	271	Options:
	272	<code>
	273	-i input filename for db1 in fasta format, required
	274	-i2 input filename for db2 in fasta format, required
	275	-o output filename, required
	276	-c sequence identity threshold, default 0.9
	277	this is the default cd-hit's "global sequence identity" calculated as:
	278	number of identical amino acids in alignment
	279	divided by the full length of the shorter sequence
	280	-G use global sequence identity, default 1
	281	if set to 0, then use local sequence identity, calculated as :
	282	number of identical amino acids in alignment
	283	divided by the length of the alignment
	284	NOTE!!! don't use -G 0 unless you use alignment coverage controls
	285	see options -aL, -AL, -aS, -AS
	286	-b band_width of alignment, default 20
	287	-M memory limit (in MB) for the program, default 800; 0 for unlimitted;
	288	-T number of threads, default 1; with 0, all CPUs will be used
	289	-n word_length, default 5, see user's guide for choosing it
	290	-l length of throw_away_sequences, default 10
	291	-t tolerance for redundance, default 2
	292	-d length of description in .clstr file, default 20
	293	if set to 0, it takes the fasta defline and stops at first space
	294	-s length difference cutoff, default 0.0
	295	if set to 0.9, the shorter sequences need to be
	296	at least 90% length of the representative of the cluster
	297	-S length difference cutoff in amino acid, default 999999
	298	if set to 60, the length difference between the shorter sequences
	299	and the representative of the cluster can not be bigger than 60
	300	-s2 length difference cutoff for db1, default 1.0
	301	by default, seqs in db1 >= seqs in db2 in a same cluster
	302	if set to 0.9, seqs in db1 may just >= 90% seqs in db2
	303	-S2 length difference cutoff, default 0
	304	by default, seqs in db1 >= seqs in db2 in a same cluster
	305	if set to 60, seqs in db2 may 60aa longer than seqs in db1
	306	-aL alignment coverage for the longer sequence, default 0.0
	307	if set to 0.9, the alignment must covers 90% of the sequence
	308	-AL alignment coverage control for the longer sequence, default 99999999
	309	if set to 60, and the length of the sequence is 400,
	310	then the alignment must be >= 340 (400-60) residues
	311	-aS alignment coverage for the shorter sequence, default 0.0
	312	if set to 0.9, the alignment must covers 90% of the sequence
	313	-AS alignment coverage control for the shorter sequence, default 99999999
	314	if set to 60, and the length of the sequence is 400,
	315	then the alignment must be >= 340 (400-60) residues
	316	-A minimal alignment coverage control for the both sequences, default 0
	317	alignment must cover >= this value for both sequences
	318	-uL maximum unmatched percentage for the longer sequence, default 1.0
	319	if set to 0.1, the unmatched region (excluding leading and tailing gaps)
	320	must not be more than 10% of the sequence
	321	-uS maximum unmatched percentage for the shorter sequence, default 1.0
	322	if set to 0.1, the unmatched region (excluding leading and tailing gaps)
	323	must not be more than 10% of the sequence
	324	-U maximum unmatched length, default 99999999
	325	if set to 10, the unmatched region (excluding leading and tailing gaps)
	326	must not be more than 10 bases
	327	-B 1 or 0, default 0, by default, sequences are stored in RAM
	328	if set to 1, sequence are stored on hard drive
	329	!! No longer supported !!
	330	-p 1 or 0, default 0
	331	if set to 1, print alignment overlap in .clstr file
	332	-g 1 or 0, default 0
	333	by cd-hit's default algorithm, a sequence is clustered to the first
	334	cluster that meet the threshold (fast cluster). If set to 1, the program
	335	will cluster it into the most similar cluster that meet the threshold
	336	(accurate but slow mode)
	337	but either 1 or 0 won't change the representatives of final clusters
	338	-bak write backup cluster file (1 or 0, default 0)
	339	-h print this help
	340
278	341	</code>
279	342
280	343	==== CD-HIT-EST ====

288	351	good for non-intron containing sequences like EST.
289	352
290	353	Basic command:
291		cd-hit-est -i est_human -o est_human95 -c 0.95 -n 10 -d 0 -M 16000 - T 8
	354	cd-hit-est -i est_human -o est_human95 -c 0.95 -n 10 -d 0 -M 16000 -T 8
	355	cd-hit-est -i R1.fa -j R2.fa -o R1.95.fa -op R2.95.fa -P 1 -c 0.95 -n 10 -d 0 -M 16000 -T 8
292	356
293	357	Choice of word size:
294	358	<code>

300	364	-n 4 for thresholds 0.75 ~ 0.8
301	365	</code>
302	366
303		More options:
304
305		Options, -b, -M, -l, -d, -t, -s, -S, -B, -p, -aL, -AL, -aS, -AS, -g, -G, -T
306		are same to CD-HIT, here are few more cd-hit-est specific options:
307		<code>
	367	Options:
	368	<code>
	369	-i input filename in fasta format, required
	370	-j input filename in fasta/fastq format for R2 reads if input are paired end (PE) files
	371	-i R1.fq -j R2.fq -o output_R1 -op output_R2 or
	372	-i R1.fa -j R2.fa -o output_R1 -op output_R2
	373	-o output filename, required
	374	-op output filename for R2 reads if input are paired end (PE) files
	375	-c sequence identity threshold, default 0.9
	376	this is the default cd-hit's "global sequence identity" calculated as:
	377	number of identical amino acids in alignment
	378	divided by the full length of the shorter sequence
	379	-G use global sequence identity, default 1
	380	if set to 0, then use local sequence identity, calculated as :
	381	number of identical amino acids in alignment
	382	divided by the length of the alignment
	383	NOTE!!! don't use -G 0 unless you use alignment coverage controls
	384	see options -aL, -AL, -aS, -AS
	385	-b band_width of alignment, default 20
	386	-M memory limit (in MB) for the program, default 800; 0 for unlimitted;
	387	-T number of threads, default 1; with 0, all CPUs will be used
	388	-n word_length, default 10, see user's guide for choosing it
	389	-l length of throw_away_sequences, default 10
	390	-d length of description in .clstr file, default 20
	391	if set to 0, it takes the fasta defline and stops at first space
	392	-s length difference cutoff, default 0.0
	393	if set to 0.9, the shorter sequences need to be
	394	at least 90% length of the representative of the cluster
	395	-S length difference cutoff in amino acid, default 999999
	396	if set to 60, the length difference between the shorter sequences
	397	and the representative of the cluster can not be bigger than 60
	398	-aL alignment coverage for the longer sequence, default 0.0
	399	if set to 0.9, the alignment must covers 90% of the sequence
	400	-AL alignment coverage control for the longer sequence, default 99999999
	401	if set to 60, and the length of the sequence is 400,
	402	then the alignment must be >= 340 (400-60) residues
	403	-aS alignment coverage for the shorter sequence, default 0.0
	404	if set to 0.9, the alignment must covers 90% of the sequence
	405	-AS alignment coverage control for the shorter sequence, default 99999999
	406	if set to 60, and the length of the sequence is 400,
	407	then the alignment must be >= 340 (400-60) residues
	408	-A minimal alignment coverage control for the both sequences, default 0
	409	alignment must cover >= this value for both sequences
	410	-uL maximum unmatched percentage for the longer sequence, default 1.0
	411	if set to 0.1, the unmatched region (excluding leading and tailing gaps)
	412	must not be more than 10% of the sequence
	413	-uS maximum unmatched percentage for the shorter sequence, default 1.0
	414	if set to 0.1, the unmatched region (excluding leading and tailing gaps)
	415	must not be more than 10% of the sequence
	416	-U maximum unmatched length, default 99999999
	417	if set to 10, the unmatched region (excluding leading and tailing gaps)
	418	must not be more than 10 bases
	419	-B 1 or 0, default 0, by default, sequences are stored in RAM
	420	if set to 1, sequence are stored on hard drive
	421	!! No longer supported !!
	422	-P input paired end (PE) reads, default 0, single file
	423	if set to 1, please use -i R1 -j R2 to input both PE files
	424	-cx length to keep after trimming the tail of sequence, default 0, not trimming
	425	if set to 50, the program only uses the first 50 letters of input sequence
	426	-cy length to keep after trimming the tail of R2 sequence, default 0, not trimming
	427	if set to 50, the program only uses the first 50 letters of input R2 sequence
	428	e.g. -cx 100 -cy 80 for paired end reads
	429	-ap alignment position constrains, default 0, no constrain
	430	if set to 1, the program will force sequences to align at beginings
	431	when set to 1, the program only does +/+ alignment
	432	-p 1 or 0, default 0
	433	if set to 1, print alignment overlap in .clstr file
	434	-g 1 or 0, default 0
	435	by cd-hit's default algorithm, a sequence is clustered to the first
	436	cluster that meet the threshold (fast cluster). If set to 1, the program
	437	will cluster it into the most similar cluster that meet the threshold
	438	(accurate but slow mode)
	439	but either 1 or 0 won't change the representatives of final clusters
308	440	-r 1 or 0, default 1, by default do both +/+ & +/- alignments
309	441	if set to 0, only +/+ strand alignment
310	442	-mask masking letters (e.g. -mask NX, to mask out both 'N' and 'X')

312	444	-mismatch mismatching score, default -2
313	445	-gap gap opening score, default -6
314	446	-gap-ext gap extension score, default -1
	447	-bak write backup cluster file (1 or 0, default 0)
	448	-sc sort clusters by size (number of sequences), default 0, output clusters by decreasing length
	449	if set to 1, output clusters by decreasing size
	450	-sf sort fasta/fastq by cluster size (number of sequences), default 0, no sorting
	451	if set to 1, output sequences by decreasing cluster size
	452	-h print this help
	453
	454
315	455	</code>
316	456
317	457	==== CD-HIT-EST-2D ====

325	465	sequences like EST.
326	466
327	467	Basic command:
328		cd-hit-est-2d -i mrna_human -i2 est_human -o est_human_novel -c 0.95 -n 10 -d 0 -M 16000 - T 8
329
	468	cd-hit-est-2d -i mrna_human -i2 est_human -o est_human_novel -c 0.95 -n 10 -d 0 -M 16000 -T 8
	469	cd-hit-est-2d -i db1.R1.fa -j db1.R2.fa -i2 db2.R1.fa -j2 db2.R2.fa -o db2_novel.R1.fa -op db2_novel.R2.fa -P 1 -c 0.95 -n 10 -d 0 -M 16000 -T 8
	470
330	471	The choice of word size and the options are the same as for CD-HIT-EST:
331	472
332		cd-hit-est-2d specificnoptions:
333		<code>
	473	Options:
	474	<code>
	475	-i input filename for db1 in fasta format, required
	476	-i2 input filename for db2 in fasta format, required
	477	-j, -j2 input filename in fasta/fastq format for R2 reads if input are paired end (PE) files
	478	-i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or
	479	-i db1-R1.fa -j db1-R2.fa -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2
	480	-o output filename, required
	481	-op output filename for R2 reads if input are paired end (PE) files
	482	-c sequence identity threshold, default 0.9
	483	this is the default cd-hit's "global sequence identity" calculated as:
	484	number of identical amino acids in alignment
	485	divided by the full length of the shorter sequence
	486	-G use global sequence identity, default 1
	487	if set to 0, then use local sequence identity, calculated as :
	488	number of identical amino acids in alignment
	489	divided by the length of the alignment
	490	NOTE!!! don't use -G 0 unless you use alignment coverage controls
	491	see options -aL, -AL, -aS, -AS
	492	-b band_width of alignment, default 20
	493	-M memory limit (in MB) for the program, default 800; 0 for unlimited;
	494	-T number of threads, default 1; with 0, all CPUs will be used
	495	-n word_length, default 10, see user's guide for choosing it
	496	-l length of throw_away_sequences, default 10
	497	-d length of description in .clstr file, default 20
	498	if set to 0, it takes the fasta defline and stops at first space
	499	-s length difference cutoff, default 0.0
	500	if set to 0.9, the shorter sequences need to be
	501	at least 90% length of the representative of the cluster
	502	-S length difference cutoff in amino acid, default 999999
	503	if set to 60, the length difference between the shorter sequences
	504	and the representative of the cluster can not be bigger than 60
334	505	-s2 length difference cutoff for db1, default 1.0
335	506	by default, seqs in db1 >= seqs in db2 in a same cluster
336	507	if set to 0.9, seqs in db1 may just >= 90% seqs in db2
337	508	-S2 length difference cutoff, default 0
338	509	by default, seqs in db1 >= seqs in db2 in a same cluster
339	510	if set to 60, seqs in db2 may 60aa longer than seqs in db1
	511	-aL alignment coverage for the longer sequence, default 0.0
	512	if set to 0.9, the alignment must covers 90% of the sequence
	513	-AL alignment coverage control for the longer sequence, default 99999999
	514	if set to 60, and the length of the sequence is 400,
	515	then the alignment must be >= 340 (400-60) residues
	516	-aS alignment coverage for the shorter sequence, default 0.0
	517	if set to 0.9, the alignment must covers 90% of the sequence
	518	-AS alignment coverage control for the shorter sequence, default 99999999
	519	if set to 60, and the length of the sequence is 400,
	520	then the alignment must be >= 340 (400-60) residues
	521	-A minimal alignment coverage control for the both sequences, default 0
	522	alignment must cover >= this value for both sequences
	523	-uL maximum unmatched percentage for the longer sequence, default 1.0
	524	if set to 0.1, the unmatched region (excluding leading and tailing gaps)
	525	must not be more than 10% of the sequence
	526	-uS maximum unmatched percentage for the shorter sequence, default 1.0
	527	if set to 0.1, the unmatched region (excluding leading and tailing gaps)
	528	must not be more than 10% of the sequence
	529	-U maximum unmatched length, default 99999999
	530	if set to 10, the unmatched region (excluding leading and tailing gaps)
	531	must not be more than 10 bases
	532	-B 1 or 0, default 0, by default, sequences are stored in RAM
	533	if set to 1, sequence are stored on hard drive
	534	!! No longer supported !!
	535	-P input paired end (PE) reads, default 0, single file
	536	if set to 1, please use -i R1 -j R2 to input both PE files
	537	-cx length to keep after trimming the tail of sequence, default 0, not trimming
	538	if set to 50, the program only uses the first 50 letters of input sequence
	539	-cy length to keep after trimming the tail of R2 sequence, default 0, not trimming
	540	if set to 50, the program only uses the first 50 letters of input R2 sequence
	541	e.g. -cx 100 -cy 80 for paired end reads
	542	-p 1 or 0, default 0
	543	if set to 1, print alignment overlap in .clstr file
	544	-g 1 or 0, default 0
	545	by cd-hit's default algorithm, a sequence is clustered to the first
	546	cluster that meet the threshold (fast cluster). If set to 1, the program
	547	will cluster it into the most similar cluster that meet the threshold
	548	(accurate but slow mode)
	549	but either 1 or 0 won't change the representatives of final clusters
	550	-r 1 or 0, default 1, by default do both +/+ & +/- alignments
	551	if set to 0, only +/+ strand alignment
	552	-mask masking letters (e.g. -mask NX, to mask out both 'N' and 'X')
	553	-match matching score, default 2 (1 for T-U and N-N)
	554	-mismatch mismatching score, default -2
	555	-gap gap opening score, default -6
	556	-gap-ext gap extension score, default -1
	557	-bak write backup cluster file (1 or 0, default 0)
	558	-h print this help
	559
340	560	</code>
341	561
342	562

347	567	Basic command:
348	568	cd-hit-454 -i 454_reads -o 454_reads_95 -c 0.95 -n 10 -d 0 -M 16000 -T 8
349	569
350		Full list of options:
	570	Options:
351	571	<code>
352	572	-i input filename in fasta format, required
353	573	-o output filename, required

422	642	- repeat cd-hit and cd-hit-2d runs till done
423	643	- Combine the results
424	644
425		{{ :Figure3.png }}
	645	{{ :cd-hit-figure3.png }}
426	646
427	647	Basic command:
428	648	cd-hit-para.pl -i nr90 -o nr60 -c 0.6 -n 4 --B hosts --S 64

505	725
506	726	With multiple-step, iterated runs of CD-HIT, you perform a clustering in a
507	727	neighbor-joining method, which generates a hierarchical structure. The third step uses psi-cd-hit; please see the psi-cd-hit section for details.
	728
	729	This way is faster than one-step clustering. It can also be more accurate.
	730
	731	There is a problem with one-step clustering. Two very similar sequences A and B may be clustered into different clusters. For example, let the clustering threshold be 60%, IAB (identity of AB) = 95%, IAC ≥ 60%, but IBC < 60%. If C was first selected as a cluster representative, then A will be in cluster "C", but B will not, resulting in the nearly identical A and B being placed in different clusters. Hierarchical clustering will reduce this problem.
508	732
509		{{ :Figure4.png }}
	733	{{ :cd-hit-figure4.png }}
510	734
511	735	Commands:
512	736	cd-hit -i nr -o nr80 -c 0.8 -n 5 -d 0 -M 16000 -T 16

524	748	clstr_rev.pl nr80-60.clstr nr30.clstr > nr80-60-30.clstr
525	749	nr30.clstr only lists sequences from nr60, script clstr_rev.pl add the original sequences into file nr80-60-30.clstr
526	750
527		This way is faster than one-step run from nr directly to nr30. It can also
528		more accurate.
529
530
531	751
532	752
533	753	===== CD-HIT AuxTools =====

540	760
541	761
542	762	cd-hit-dup is a simple tool for removing duplicates from sequencing reads,
543		with optional step to detect and remove chimeric reads.
	763	with optional step to detect and remove chimeric reads. When two files of paired end reads are used as inputs, each pair of reads will be concatenated into a single one.
544	764	A number of options are provided to tune how the duplicates are removed.
545	765	Running the program without arguments should print out the list of available options,
546	766	as the following:

570	790	</code>
571	791
572	792	=== Option details ===
573
574		== Common options ==
575		Here are the more detailed description of the options.
576		<code>
577		-i Input file;
578		</code>
579		Input file that must be in fasta or fastq format.
580
581		<code>
582		-i2 Second input file;
583		</code>
584		cd-hit-dup can take 2 files of paired end reads.
585		"-i" can be used to specify the file for the R1;
586		and "-i2" can be used to specify the file for R2.
587
588		When two files of paired end reads are used as inputs, each pair of reads will
589		be concatenated into a single one. And the following steps of duplicate and chimeric
590		detection and removing.
591
592		<code>
593		-o Output file;
594		</code>
595		Output file which contains a list of reads without duplicates.
596
597		<code>
598		-o2 Output file for R2, with paired end reads;
599		</code>
600
601		<code>
602		-d Description length (default 0, truncate at the first whitespace character)
603		</code>
604		The length of description line that should be written to the output.
605
606	793	<code>
607	794	-u Length of prefix to be used in the analysis (default 0, for full/maximum length);
608	795	</code>
	796
609	797	For pair-end inputs, the program will take part (whole or prefix) of the first end
610	798	and part (whole or prefix) of the second read,
611	799	and join them together to form a single read to do the analysis.

620	808	to do the analysis. In case that a read is shorter than this length, no 'N' is appended to
621	809	the read since it is not necessary.
622	810
623
624		== Options for duplicate detection ==
625	811	<code>
626	812	-m Match length (true/false, default true);
627	813	</code>

636	822	no greater than the specified value are considered to be duplicates. For chimeric detection,
637	823	this option control how similar a read should be to either of its parents.
638	824
639
640		== Options for chimeric filtering ==
641	825	<code>
642	826	-f Filter out chimeric clusters (true/false, default false);
643	827	</code>

882	1066	- Repeat until done
883	1067
884	1068	==== Installation ====
885		please download legacy BLAST (not BLAST+) and install the executables in your $PATH. The programs
886		required by psi-cd-hit.pl are blastall, megablast, blastpgp and formatdb.
	1069	please download either legacy BLAST or BLAST+ and install the executables in your $PATH. The programs
	1070	required by psi-cd-hit.pl are blastall, megablast, blastpgp and formatdb for legacy blast, and blastp, blastn, psiblast and makeblastdb for blast+.
887	1071
888	1072	==== Usage ====
889	1073

940	1124	-------------circle-----------
941	1125	\| \|
942	1126	seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1
943		\\\\ /////////////
944		\\\\ /////////////
	1127	\\\\\\\\ /////////////
	1128	\\\\\\\\ /////////////
945	1129	HSP 2 -> ////HSP 1 /// <-HSP 2
946		///////////// \\\\
947		///////////// \\\\
	1130	///////////// \\\\\\\\
	1131	///////////// \\\\\\\\
948	1132	seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
949	1133	\| \|
950	1134	-----------circle--------------

1163	1347
1164	1348	The CD-HIT-454 web server is also available from [[http://cd-hit.org]].
1165	1349
1166
	1350	===== Use cases =====
	1351	Here, a use case is defined as a sequence clustering related problem or application that cannot be easily solved with existing clustering approaches, such as CD-HIT. However, it is feasible to solve such a use case by customizing current clustering algorithms or utilizing current approach in a very intelligent way or non-standard manner. In the last years, we have developed many use cases in addressing various problems. We will release these use cases after additional testing. These use cases will be described in the following chapters.
	1352
	1353	===== CD-HIT-OTU-MiSeq =====
	1354	This use case is developed for clustering 16S rRNA genes into OTUs for microbiome studies. In recent years, Illumina MiSeq sequencers became dominant in 16S rRNA sequencing. The Paired End (PE) reads need to be assembled first. However, many reads cannot be accurately assembled because of the poor quality at the 3' ends of both PE reads in the overlapping region. This causes many sequences to be discarded in the analysis. CD-HIT-OTU-MiSeq has unique features to cluster MiSeq 16S sequences.
	1355	- The package can cluster PE reads without joining them into contigs.
	1356	- Users can choose a high quality portion of the PE reads for analysis (e.g. first 200 / 150 bases from forward / reverse reads), according to base quality profile.
	1357	- We implemented a tool that can splice out the target region (e.g. V3-V4) from a full-length 16S reference database into the PE sequences. CD-HIT-OTU-MiSeq can cluster the spliced PE reference database together with samples, so we can derive Operational Taxonomic Units (OTUs) and annotate these OTUs concurrently.
	1358	- Chimeric sequences are effectively identified through both de novo and reference-based approaches.
	1359
	1360	The most important unique feature of CD-HIT-OTU-MiSeq is that it only uses the high quality region at the 5' ends of R1 and R2 reads. For example, the effective read length can be 200 bases for R1 and 150 bases for R2. The effective portions of PE reads are clustered together with spliced PE sequences from the reference database to derive OTUs (Figure).
	1361
	1362	{{:cd-hit-otu-miseq-figure-1.png\|}}
	1363
	1364	==== Installation ====
	1365	First download and install full cd-hit package
	1366	* download current CD-HIT at [[https://github.com/weizhongli/cdhit/releases]], for example cd-hit-v4.6.2-2015-0511.tar.gz
	1367	* unpack the file with " tar xvf cd-hit-v4.6.2-2015-0511.tar.gz --gunzip"
	1368	* change dir by "cd cd-hit-v4.6.2-2015-0511"
	1369	* compile the programs by "make" with multi-threading (default), or by "make openmp=no" without multi-threading (on old systems without OpenMP)
	1370	* cd cd-hit-auxtools
	1371	* compile cd-hit-auxtools by "make"
	1372	* CD-HIT-OTU-MiSeq scripts are inside a folder like cd-hit-v4.6.2-2015-0511/usecases/Miseq-16S
	1373
	1374	CD-HIT-OTU-MiSeq uses Trimmomatic for sequence quality control. It can be downloaded from [[http://www.usadellab.org/cms/?page=trimmomatic]] or [[https://github.com/timflutre/trimmomatic]]. We also have a copy at [[http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/]].
	1375
	1376	* modify NG-Omics-Miseq-16S.pl
	1377	Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
	1378	$CD_HIT_dir = "PATH_to_cd-hit";
	1379	$NGS_prog_trimmomatic = "PATH/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
	1380
	1381	==== Download reference and sample datasets ====
	1382	Reference database and sample datasets can be downloaded from [[http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/]].
	1383
	1384	The reference database Greengene-13-5-99.fasta.gz was processed from original Greengene database, so that sequences with more specific annotations are at the beginning of the file. You need to download and gunzip it.
	1385
	1386	You can also download Greengene and generate it. You should download Greengene from [[http://greengenes.secondgenome.com/downloads]], or [[ftp://greengenes.microbio.me/]]. Please download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file. You may find gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta. There is a script: usecases/Miseq-16S/greengene-ann1.pl.
	1387
	1388	Commands:
	1389	/greengene-ann1.pl -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o Greengene-13-5-99.fasta
	1390
	1391	The Miseq-otu-example.tar.gz contains two Miseq 16S samples. You can download and unpack to test.
	1392
	1393	==== Usage ====
	1394
	1395	Step 1. prepare fastq files and sample file: Most projects have multiple samples sequenced at the same region. You should already have paired ended fastq files for these samples, put them in a working directory in similar way as the testing datasets, where the R1.fq and R2.fq are placed in separate folder for each sample. So in the working directory, you should have files:
	1396	sample_name_1/R1.fq
	1397	sample_name_1/R2.fq
	1398	sample_name_2/R1.fq
	1399	sample_name_2/R2.fq
	1400	...
	1401	sample_name_N/R1.fq
	1402	sample_name_N/R2.fq
	1403
	1404	Then, please prepare a sample file in the working directory. The file should look like:
	1405	sample_name_1 R1.fq R2.fq
	1406	sample_name_2 R1.fq R2.fq
	1407	sample_name_N R1.fq R2.fq
	1408
	1409	Step 2. Reference database preparation: We implemented a tool that can splice out the target amplicon region (e.g. V3-V4) from a full-length 16S rRNA reference sequence database, such as Greengene, RDP and Silva, into PE sequences. If there are multiple samples in a project sequenced with the same amplicon of same variable region, only one spliced reference database is needed. To run:
	1410
	1411	path_to_cd-hit_dir/usecases/Miseq-16S/16S-ref-db-PE-splice.pl -i sample_name_1/R1.fq -j sample_name_1/R2.fq -d Greengene-13-5-99.fasta -o gg_13_5-PE99.150-100 -p 150 -q 100 -c 0.99
	1412	Where Greengene-13-5-99.fasta is our re-formatted Greengene sequence file. This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE99.150-100-R2.
	1413
	1414	Step 3. Run sequence QC and OTU clustering for each sample: In the working directory, run
	1415	PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s sample_file -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
	1416	where: 150 and 100 are the effective lengths, 0.97 is the OTU clustering cutoff, 0.0001 is the abundance cutoff, 75 is the length for chimeric checking at each R1 and R2 read
	1417
	1418	This command will generate shell scripts for QC and for OTU for each sample. The scripts will be in WF-sh folder. You can first run the qc.sample_name.sh and then run otu.sample_name.sh
	1419
	1420	NG-Omics-WF.pl [[https://github.com/weizhongli/ngomicswf]] is a very powerful workflow and pipeline tool developed in our group. It is not fully released yet, since we need more time to document this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples. First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36, then
	1421	nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s sample_file -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
	1422
	1423	After the job finished, the OTU results will be in sample_name/otu folder, important files include
	1424	* OTU.clstr: file lists all clusters and sequences
	1425	* removed_chimeric*: chimeric sequences removed
	1426	* small_clusters.list: low abundance small clusters removed
	1427
	1428	Step 4. pool all the samples together: Please run
	1429	PATH_to_cd-hit-dir/usecases/pool_samples.pl -s sample_file -o pooled_sample.
	1430	This will pool sequences from all samples and re-run OTU clustering. We can pool hundreds of samples without problem. After the job has finished, additional files will be available in the pooled_sample directory
	1431	* OTU.clstr: file lists all clusters and sequences from all samples
	1432	* removed_chimeric*: chimeric sequences removed
	1433	* small_clusters.list: low abundance small clusters removed
	1434	* OTU.txt: spreadsheet listing the number of sequences in each OTU for each sample; it also shows the annotation for each OTU.
	1435	* OTU.biome: OTU.txt in biome format
	1436
	1437
	1438
1167	1439	===== References =====
1168	1440
1169	1441	If you find cd-hit helpful to your research and study, please kindly cite the

+1285

-0

psi-cd-hit/psi-cd-hit-local-old.pl less more

	0	#!/usr/bin/perl -w
	1	################################################################################
	2	######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
	3	################################################################################
		# Package-level configuration and state shared by the subroutines below.
		# The values assigned here are defaults; parse_para_etc() overwrites many
		# of them from the command line and fills in the derived file/directory
		# names ($db_clstr, $db_log, $db_out1, $seq_dir, $bl_dir, $restart_file,
		# $tmp_db, $remote_perl_script, $remote_sh_script).
	4	our $pid = $$;
	5	our $db_in = ""; ###################
	6	our $db_out = ""; # input / output
	7	our $len_t = 10; ###################
	8	our $NR_clstr = 0.3; #
	9	our $NR_clstre = -1; #thresholds
	10	our $g_iden = 1; #
	11	our $opt_aS = 0.0; #
	12	our $opt_aL = 0.0; #
	13	our $circle = 0; #
	14	our $opt_g = 1; ####################
	15	our $blast_exe = "blastall -p blastp -m 8"; #########################
	16	our $prof_exe = "blastpgp -m 8"; #
	17	our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; #
	18	our $prof_db = ""; #
	19	our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program
	20	our $bl_STDIN = 1; #
	21	our $keep_bl = 0; #
	22	our $blast_prog= "blastp"; #
	23	our $formatdb = "formatdb"; #########################
	24	our $exec_mode = "local"; #######################
	25	our $num_qsub = 1; #
	26	our $para_no = 1; # compute
	27	our $sh_file = ""; #
	28	our $batch_no_per_node = 50; #######################
	29	our $reformat_seg = 50000;
	30	our $restart_seg = 20000;
	31	our $job = "";
	32	our $job_file = "";
	33	our $date = `date`;
	34	our $restart_in = "";
	35	our $pwd = `pwd`; chop($pwd);
	36	our $db_clstr;
	37	our $db_log;
	38	our $db_out1;
	39	our $seq_dir;
	40	our $bl_dir;
	41	our $restart_file;
	42	our $tmp_db;
	43	our $remote_perl_script;
	44	our $remote_sh_script;
	45	our $bl_path;
		# NOTE(review): no command-line option visible in parse_para_etc() changes
		# $bl_plus, so the BLAST+ command lines are always the ones used.
	46	our $bl_plus = 1; #### use blast+
	47	our $bl_threads = 1;
	48	our $skip_long = 0;
	49	our %qsub_ids = (); #### a list of qsub ids
	50	our %qstat_xml_data = ();
	51
	52
	# Parse the command-line tokens passed in @_, build the BLAST / database
	# formatting command lines, derive the working file names, create the
	# working directories and write the helper scripts.
	#
	# Arguments: the list of command-line tokens (options and their values).
	# Returns:   nothing.
	# Side effects:
	#   - sets the option globals declared at the top of the file;
	#   - prints the usage and exits on an unrecognized option;
	#   - for "-J parse_blout <file>" runs job_parse_blout() and exits;
	#   - dies if the input file does not exist or no output name was given;
	#   - creates $bl_dir and $seq_dir and calls write_remote_perl_script()
	#     and write_remote_sh_script().
	sub parse_para_etc {
	    my ($arg, $cmd);
	    while ($arg = shift) {
	        ## input/output:
	        if    ($arg eq "-i")       { $db_in = shift; }
	        elsif ($arg eq "-o")       { $db_out = shift; }
	        elsif ($arg eq "-l")       { $len_t = shift; }
	        ## thresholds
	        elsif ($arg eq "-c")       { $NR_clstr = shift; }
	        elsif ($arg eq "-ce")      { $NR_clstre = shift; }
	        elsif ($arg eq "-G")       { $g_iden = shift; }
	        elsif ($arg eq "-aL")      { $opt_aL = shift; }
	        elsif ($arg eq "-aS")      { $opt_aS = shift; }
	        elsif ($arg eq "-g")       { $opt_g = shift; }
	        elsif ($arg eq "-circle")  { $circle = shift; }
	        elsif ($arg eq "-sl")      { $skip_long = shift; }
	        ## program
	        elsif ($arg eq "-prog")    { $blast_prog = shift; }
	        elsif ($arg eq "-p")       { $prof_para = shift; }
	        elsif ($arg eq "-dprof")   { $prof_db = shift; die "option -dprof no longer supported!"; }
	        elsif ($arg eq "-s")       { $bl_para = shift; }
	        elsif ($arg eq "-k")       { $keep_bl = shift; }
	        elsif ($arg eq "-bs")      { $bl_STDIN = shift; }
	        ## compute
	        elsif ($arg eq "-exec")    { $exec_mode = shift; }
	        elsif ($arg eq "-host")    { $num_qsub = shift; }
	        elsif ($arg eq "-para")    { $para_no = shift; }
	        elsif ($arg eq "-shf")     { $sh_file = shift; }
	        elsif ($arg eq "-blp")     { $bl_threads = shift; }
	        elsif ($arg eq "-bat")     { $batch_no_per_node = shift; }
	        ## job:
	        elsif ($arg eq "-rs")      { $restart_seg = shift; }
	        elsif ($arg eq "-rf")      { $reformat_seg = shift; }
	        elsif ($arg eq "-restart") { $restart_in = shift; }
	        elsif ($arg eq "-J")       { $job = shift; $job_file = shift; }
	        ## blast path
	        elsif ($arg eq "-P")       { $bl_path = shift; }
	        else                       { print_usage(); exit(); }
	    }

	    # special jobs
	    if ($job eq "parse_blout") { job_parse_blout(); exit(); }

	    # The BLAST+ and legacy BLAST settings are mutually exclusive. Keeping
	    # them in separate branches matters: the legacy branch renames
	    # "megablast" to "blastn", which previously made the BLAST+ megablast
	    # case unreachable.
	    if ($bl_plus) {
	        #### for blast+
	        # NOTE(review): these assignments replace any value given with -s.
	        $formatdb  = "makeblastdb -dbtype prot -max_file_sz 8GB";
	        $blast_exe = "blastp -outfmt 6";
	        $bl_para   = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program

	        if ($blast_prog eq "blastn") {
	            $formatdb  = "makeblastdb -dbtype nucl -max_file_sz 8GB";
	            # nucleotide searches must use the blastn executable
	            $blast_exe = "blastn -task blastn -outfmt 6";
	            $bl_para   = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
	        }
	        elsif ($blast_prog eq "megablast") {
	            $blast_prog = "blastn"; #### back to blastn for blast parser type
	            $formatdb   = "makeblastdb -dbtype nucl -max_file_sz 8GB";
	            $blast_exe  = "blastn -task megablast -outfmt 6";
	            $bl_para    = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
	        }
	        elsif ($blast_prog eq "blastpgp") {
	            $blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
	        }
	    }
	    else {
	        #### legacy blast
	        if ($blast_prog eq "blastn") {
	            $formatdb  = "formatdb -p F";
	            $blast_exe = "blastall -p blastn -m 8";
	        }
	        elsif ($blast_prog eq "megablast") {
	            $blast_prog = "blastn"; #### back to blastn for blast parser type
	            $formatdb   = "formatdb -p F";
	            $blast_exe  = "megablast -H 100 -D 2 -m 8";
	        }
	        elsif ($blast_prog eq "blastpgp") {
	            $blast_exe = "blastpgp -m 8 -j 3";
	        }
	    }

	    if ($bl_path) {
	        $blast_exe = "$bl_path/$blast_exe";
	        $formatdb  = "$bl_path/$formatdb";
	    }

	    (-e $db_in) || die "No input";
	    ($db_out)   || die "No output";

	    $db_clstr     = "$db_out.clstr";
	    $db_log       = "$db_out.log";
	    $db_out1      = "$db_out.out";
	    $seq_dir      = "$db_in-seq";
	    $bl_dir       = "$db_in-bl";
	    # no leading blank in the file name (the old value was " $db_out.restart")
	    $restart_file = "$db_out.restart";

	    $tmp_db             = "$db_in.$pid";
	    $remote_perl_script = "$tmp_db-bl.pl";
	    $remote_sh_script   = "$tmp_db-bl.sh";

	    $cmd = `mkdir $bl_dir $seq_dir`;

	    write_remote_perl_script();
	    write_remote_sh_script();
	    return;
	}
	156	########## END parse_para_etc
	157
	158
	# Read the FASTA file $db_in and register every record whose sequence
	# (whitespace removed) is longer than $len_t via add_seq().
	# Dies if the file cannot be opened or if no sequence was registered;
	# writes the total count to the OUTT handle.
	sub read_db {
	    my ($header, $residues) = ("", "");

	    # store the record collected so far, if it is long enough
	    my $store_record = sub {
	        $residues =~ s/\s//g;
	        add_seq($header, $residues) if (length($residues) > $len_t);
	    };

	    open(DBIN, $db_in) || die "Can not open $db_in";
	    while (my $line = <DBIN>) {
	        chop($line);
	        unless ($line =~ /^>/) {
	            $residues .= $line;
	            next;
	        }
	        # a new defline closes the previous record
	        $store_record->();
	        $header   = $line;
	        $residues = "";
	    }
	    $store_record->();    # last record of the file
	    close(DBIN);

	    die "No sequence readin" unless ($NR_no >= 1);

	    print OUTT "Total seqs $NR_no in $db_in\n";
	    return;
	}
	184	########## END read_db
	185
	186
	# Append one sequence record to the global per-sequence arrays and
	# increase the sequence counter $NR_no.
	#   $_[0]  description line; everything from the first whitespace on is cut
	#   $_[1]  sequence string
	sub add_seq {
	    my ($header, $residues) = @_;
	    $header =~ s/\s.+$//;

	    push(@dess, $header);
	    push(@seqs, $residues);
	    push(@lens, length($residues));

	    # per-sequence state arrays start at 0
	    push(@idens,        0);
	    push(@passeds,      0);
	    push(@NR_clstr_nos, 0);
	    push(@in_bg,        0);

	    ++$NR_no;
	    return;
	}
	200	########## END add_seq
	201
	202
	# Open the OUTT ($db_out1) and LOG ($db_log) handles in append mode,
	# switch both to autoflush, write the start time to OUTT and leave
	# STDOUT as the selected handle.
	sub open_LOG {
	    open(OUTT, ">> $db_out1") or die "can not open $db_out1";
	    select(OUTT);
	    $| = 1;    ### file handle flush
	    print OUTT "Started $date";

	    open(LOG, ">> $db_log") or die "Can not open $db_log";
	    select(LOG);
	    $| = 1;    ### file handle flush

	    select(STDOUT);
	    return;
	}
	213	########## END open_LOG
	214
	# Write one message, followed by a newline, to the LOG handle.
	sub write_LOG {
	    my ($message) = @_;
	    print LOG $message . "\n";
	}
	219
	{   ## counters remembered between calls
	    my ($prev_clusters, $prev_passed) = (0, 0);

	    # Progress reporting on the OUTT handle.
	    #   $i0         zero-based index of the item just finished
	    #   $NR90_no    number of clusters so far
	    #   $NR_passed  number of sequences passed so far
	    #   $NR_no      total number of sequences
	    #   $flag       true to force a status line
	    # Prints a "." after every 10th item and a status line after every
	    # 100th item or whenever $flag is true.
	    sub watch_progress {
	        my ($i0, $NR90_no, $NR_passed, $NR_no, $flag) = @_;
	        my $finished = $i0 + 1;

	        print OUTT "." if ($finished % 10 == 0);
	        $flag = 1      if ($finished % 100 == 0);
	        return unless ($flag);

	        my $percent      = int($NR_passed / $NR_no * 10000) / 100;
	        my $new_clusters = $NR90_no - $prev_clusters;
	        my $new_passed   = $NR_passed - $prev_passed;
	        my ($user, $system, $child_user, $child_system) = times();
	        my $cpu = $user + $system + $child_user + $child_system;

	        print OUTT "$finished finished $NR90_no clusters $NR_passed passed "
	                 . "$new_clusters/$new_passed clstr/passed $percent% done $cpu cpu\n";

	        ($prev_clusters, $prev_passed) = ($NR90_no, $NR_passed);
	        return;
	    }
	}
	246
	247
	# Write the completion time and the CPU total reported by
	# total_remote_cpu() to OUTT, then close the OUTT and LOG handles.
	sub close_LOG {
	    my $finished_at = `date`;
	    print OUTT "Completed $finished_at\n";

	    my $remote_cpu = total_remote_cpu();
	    print OUTT "Total CPUs on remote hosts: $remote_cpu\n";

	    close(OUTT);
	    close(LOG);
	    return;
	}
	256	########## END close_LOG
	257
	258	###### need to change to read dir because
	# Sum the numbers found, one per line, in $seq_dir/host.<n>.cpu for
	# n = 0 .. $num_qsub-1. Files that cannot be opened are skipped.
	# Returns the sum.
	sub total_remote_cpu {
	    my $cpu_sum = 0;
	    for (my $host = 0; $host < $num_qsub; $host++) {
	        unless (open(TCPU, "$seq_dir/host.$host.cpu")) {
	            next;
	        }
	        while (my $entry = <TCPU>) {
	            chop($entry);
	            $cpu_sum += $entry;
	        }
	        close(TCPU);
	    }
	    return $cpu_sum;
	}
	272	########## END total_remote_cpu
	273
	274
	# Parse the BLAST output file $job_file with process_blout_blastp_blastn()
	# and write the hits, one tab-separated row per hit, to "$job_file.out",
	# terminated by a line containing "#". Returns silently if the output
	# file cannot be opened.
	sub job_parse_blout {
	    my @hit_rows = process_blout_blastp_blastn($job_file);

	    open(BLOUT2, "> $job_file.out") || return;
	    print BLOUT2 join("\t", @{$_}), "\n" foreach (@hit_rows);
	    print BLOUT2 "#\n";    # end-of-output marker
	    close(BLOUT2);
	    return;
	}
	287	########## END job_parse_blout
	288
	289
	# Dump the clustering state to $restart_file: one line per sequence, in
	# @NR_idx order, holding index, cluster number, identity string and
	# passed flag separated by tabs.
	sub write_restart {
	    open(RES, "> $restart_file") || die;

	    for (my $rank = 0; $rank < $NR_no; $rank++) {
	        my $idx = $NR_idx[$rank];
	        print RES join("\t", $idx, $NR_clstr_nos[$idx], $idens[$idx], $passeds[$idx]), "\n";
	    }

	    close(RES);
	    return;
	}
	302	########## END write_restart
	303
	304
	# Reload the clustering state written by write_restart() from $restart_in.
	# Refills @NR_clstr_nos, @idens, @passeds and @NR_idx and recounts
	# $NR_passed and $NR90_no.
	# Returns the sorted position following the last representative ("*")
	# that is marked as passed, or 0 if there is none.
	sub read_restart {
	    open(RESIN, $restart_in) || die;

	    $NR_passed = 0;
	    $NR90_no   = 0;
	    my $last_rep_rank = -1;
	    my $rank          = 0;    # idx of sorted , see write_restart
	    while (my $line = <RESIN>) {
	        chop($line);
	        my @fields = split(/\t/, $line);
	        my ($idx, $clstr_no, $iden, $passed) = @fields[0 .. 3];

	        $NR_clstr_nos[$idx] = $clstr_no;
	        $idens[$idx]        = $iden;
	        $passeds[$idx]      = $passed;
	        $NR_passed++ if ($passed);

	        if ($iden eq "*") {    #rep
	            $NR90_no++;
	            $last_rep_rank = $rank if ($passed);
	        }
	        $NR_idx[$rank] = $idx;
	        $rank++;
	    }
	    close(RESIN);

	    return $last_rep_rank + 1;
	}
	335	########## END read_restart
	336
	337
	# Write the cluster file $db_clstr: for each of the $NR90_no clusters a
	# ">Cluster N" line followed by one line per passed member sequence, in
	# @NR_idx order. Dies if the file cannot be written.
	sub write_db_clstr {
	    # member indices per cluster
	    my @members = map { [] } (0 .. $NR90_no - 1);
	    for (my $rank = 0; $rank < $NR_no; $rank++) {
	        my $idx = $NR_idx[$rank];
	        next unless ($passeds[$idx]);
	        my $clstr = $NR_clstr_nos[$idx];
	        push(@{ $members[$clstr] }, $idx) if ($clstr < $NR90_no);
	    }

	    open(DBCLS, "> $db_clstr") || die "Can not write $db_clstr";
	    foreach my $clstr (0 .. $NR90_no - 1) {
	        print DBCLS ">Cluster $clstr\n";
	        my $pos = 0;
	        foreach my $idx (@{ $members[$clstr] }) {
	            my ($name) = split(/\s+/, $dess[$idx]);
	            my $tail = ($idens[$idx] eq "") ? "\n" : "at $idens[$idx]\n";
	            print DBCLS "$pos\t$lens[$idx]" . "aa, $name... " . $tail;
	            $pos++;
	        }
	    }
	    close(DBCLS);
	    return;
	}
	368	########## END write_db_clstr
	369
	370
	# Delete the raw BLAST files of processed representatives, walking
	# backwards from sorted position $NR_sofar. Does nothing when $keep_bl is
	# set. Stops at the first representative whose ".out" file is already
	# gone.
	sub remove_raw_blout {
	    my ($NR_sofar) = @_;
	    return if ($keep_bl);

	    my $cmd;
	    for (my $rank = $NR_sofar; $rank >= 0; $rank--) {
	        my $idx = $NR_idx[$rank];
	        #only reps have blout
	        next unless ($passeds[$idx] and $idens[$idx] eq "*");

	        my $raw = "$bl_dir/$idx";
	        last unless (-e "$raw.out");    #removed from last call
	        $cmd = `rm -f $raw` unless ($bl_STDIN);
	        $cmd = `rm -f $raw.out`;
	    }
	    return;
	}
	387	########## END remove_raw_blout
	388
	389
	# Background variant of remove_raw_blout(): instead of deleting the raw
	# BLAST files directly, write the "rm" commands to a temporary shell
	# script ("$tmp_db-rm-$NR_sofar.sh") and start it in the background.
	#   $_[0]  sorted position to start from (walks backwards to 0)
	# Does nothing when $keep_bl is set; dies if the script cannot be written.
	# The script removes itself as its last command.
	sub remove_raw_blout_bg {
	    my $NR_sofar = shift;
	    my ($i0, $i, $cmd);
	    return if ($keep_bl);

	    my $tmp_sh_script = "$tmp_db-rm-$NR_sofar.sh";
	    open(OUTRM, ">$tmp_sh_script") || die "can not write to $tmp_sh_script";

	    for ($i0 = $NR_sofar; $i0 >= 0; $i0--) {
	        $i = $NR_idx[$i0];
	        next unless $passeds[$i];
	        next unless ($idens[$i] eq "*");    #only reps have blout
	        my $fout = "$bl_dir/$i";
	        last unless (-e "$fout.out");       #removed from last call
	        if (not $bl_STDIN) { print OUTRM "rm -f $fout\n"; }
	        # Each command needs its own line; without the newline the next
	        # "rm" was glued onto this file name and the .out file survived.
	        print OUTRM "rm -f $bl_dir/$i.out\n";
	    }
	    print OUTRM "rm -f $tmp_sh_script\n";   ## remove self
	    close(OUTRM);
	    sleep(3);    # NOTE(review): reason for this pause is not visible here

	    $cmd = `sh $tmp_sh_script >/dev/null 2>&1 &`;
	    return;
	}
	414	########## END remove_raw_blout_bg
	415
	416
	# Read the parsed hit table "$bl_dir/<id>.out" of representative <id>
	# (after wait_blast_out() returns) and assign the listed sequences to the
	# current cluster $NR90_no.
	#   $_[0]  real (unsorted) index of the representative
	# A hit is skipped when its index is not below $NR_no, when it is itself
	# a representative ("*"), or when it is longer than the representative.
	# A sequence that is already passed is only re-assigned when column 3 of
	# the hit is smaller than the first "/"-separated field of its current
	# identity string. Returns silently if the hit table cannot be opened.
	sub fish_other_homolog {
	    $id = shift;    # real idx, not sorted idx (package variable, as before)

	    my $hit_file = "$bl_dir/$id.out";
	    wait_blast_out($hit_file);
	    open(BLPOUT, $hit_file) || return;
	    my @hits = ();
	    while (my $row = <BLPOUT>) {
	        last if ($row =~ /^#/);
	        chop($row);
	        push(@hits, [split(/\t/, $row)]);
	    }
	    close(BLPOUT);

	    my $rep_len = $lens[$id];
	    foreach my $hit (@hits) {
	        my $member = $hit->[0];
	        next unless ($member < $NR_no);
	        next if ($idens[$member] eq "*");          #existing reps
	        next if ($lens[$member] > $rep_len);       # in opt_g=1 mode, preventing it from being clustered into short rep

	        if ($passeds[$member]) {    #### if this hit is better -g 1 mode
	            my ($old_e) = split(/\//, $idens[$member]);
	            next unless ($hit->[3] < $old_e);
	        }
	        else {
	            $NR_passed++;
	        }

	        $idens[$member]        = "$hit->[3]/$hit->[2]aa/$hit->[1]%";
	        $passeds[$member]      = 1;
	        $NR_clstr_nos[$member] = $NR90_no;
	    }
	    return;
	}
	455	########## END fish_other_homolog
	456
	457
	458	########### if a hit has multiple HSPs on both + - strands
	459	########### keep only the HSPs, whose strand is same as the top HSP
	460	sub keep_strand_with_top_hsp {
	461	my $self = shift;
	462	my ($i,$j,$k);
	463
	464	my %id_2_strand = ();
	465	my @new_sbj = ();
	466	my $new_no = 0;
	467	for ($i=0; $i<$self->{no}; $i++) {
	468	my $p = $self->{sbj}->[$i];
	469	my ($id1, $len_sub) = split(/\./, $p->{id});
	470	if (not defined($id_2_strand{$id1})) {
	471	$id_2_strand{$id1} = $p->{frame};
	472	}
	473	if ($p->{frame} eq $id_2_strand{$id1}) { #### this stand is same as the top strand
	474	push(@new_sbj, $self->{sbj}->[$i]);
	475	$new_no++;
	476	}
	477	}
	478	$self->{no} = $new_no;
	479	$self->{sbj} = [@new_sbj];
	480	}
	481	########## END keep_strand_with_top_hsp
	482
	483	########## for blastpgp -j no (no>1)
	484	########## keep hits from the last round
	485	sub keep_hsp_of_last_round {
	486	my $self = shift;
	487	my ($i,$j,$k);
	488
	489	my @new_sbj = ();
	490	my $new_no = 0;
	491	my $last_score = 999999999999999999999; # a big one
	492	for ($i=0; $i<$self->{no}; $i++) {
	493	my $p = $self->{sbj}->[$i];
	494	my $score = $p->{score};
	495
	496	if ($score > $last_score) { ## this is new round of hits
	497	@new_sbj = ();
	498	$new_no = 0;
	499	}
	500	$last_score = $score;
	501	push(@new_sbj, $self->{sbj}->[$i]);
	502	$new_no++;
	503	}
	504	$self->{no} = $new_no;
	505	$self->{sbj} = [@new_sbj];
	506	}
	507	########## END keep_hsp_of_last_round
	508
	509	########## if a query hit a subject with multiple HSPs
	510	########## only the top HSP is kept
	511	sub keep_top_hsp {
	512	my $self = shift;
	513	my ($i,$j,$k);
	514
	515	my %id_exist = ();
	516	my @new_sbj = ();
	517	my $new_no = 0;
	518	for ($i=0; $i<$self->{no}; $i++) {
	519	my $p = $self->{sbj}->[$i];
	520	my ($id1, $len_sub) = split(/\./, $p->{id});
	521	next unless ($len_sub >0) ;
	522
	523	if (not defined($id_exist{$id1})) {
	524	$id_exist{$id1} = 1;
	525	push(@new_sbj, $self->{sbj}->[$i]);
	526	$new_no++;
	527	}
	528	}
	529	$self->{no} = $new_no;
	530	$self->{sbj} = [@new_sbj];
	531	}
	532	########## keep_top_hsp
	533
	534	########## let the top hsp to start at 0 for both query and subject
	535	########## i.e. the begining of HSP to be new original - coordinate 0
	536	########## then reset all other HSPs' alignment coordinates
	537	sub reset_alignment_coor_for_circle_seq {
	538	my $self = shift;
	539	my ($i,$j,$k);
	540
	541	my $last_id = "";
	542	$j = 0;
	543	my $hsp_count = 0; # number of HSPs for a subject
	544	for ($i=0; $i<$self->{no}; $i++) {
	545	my $p = $self->{sbj}->[$i];
	546	my ($id1, $len_sub) = split(/\./, $p->{id});
	547
	548	if ($id1 ne $last_id) {
	549	if ($hsp_count > 1) { # it is necessary to reset coordinate when at least 2 HSP
	550	my $p_top_hsp = $self->{sbj}->[$j];
	551	my $len_q = (split(/\./, $p_top_hsp->{qid}))[1];
	552	my $len_s = (split(/\./, $p_top_hsp->{id}))[1];
	553	my $ref_q = ($p_top_hsp->{qfrom} < $p_top_hsp->{qend}) ? $p_top_hsp->{qfrom} : $p_top_hsp->{qend};
	554	my $ref_s = ($p_top_hsp->{sfrom} < $p_top_hsp->{send}) ? $p_top_hsp->{sfrom} : $p_top_hsp->{send};
	555	for ($k = $j; $k<$j+$hsp_count; $k++) {
	556	$self->{sbj}->[$k]->{qfrom} -= $ref_q; if ($self->{sbj}->[$k]->{qfrom} < 0) {$self->{sbj}->[$k]->{qfrom} += $len_q;}
	557	$self->{sbj}->[$k]->{qend} -= $ref_q; if ($self->{sbj}->[$k]->{qend} < 0) {$self->{sbj}->[$k]->{qend} += $len_q;}
	558	$self->{sbj}->[$k]->{sfrom} -= $ref_s; if ($self->{sbj}->[$k]->{sfrom} < 0) {$self->{sbj}->[$k]->{sfrom} += $len_s;}
	559	$self->{sbj}->[$k]->{send} -= $ref_s; if ($self->{sbj}->[$k]->{send} < 0) {$self->{sbj}->[$k]->{send} += $len_s;}
	560	}
	561	}
	562	$j = $i;
	563	$hsp_count = 0;
	564	}
	565	$last_id = $id1;
	566	$hsp_count++;
	567	}
	568
	569	#last subject
	570	if ($hsp_count > 1) { # it is necessary to reset coordinate when at least 2 HSP
	571	my $p_top_hsp = $self->{sbj}->[$j];
	572	my $len_q = (split(/\./, $p_top_hsp->{qid}))[1];
	573	my $len_s = (split(/\./, $p_top_hsp->{id}))[1];
	574	my $ref_q = ($p_top_hsp->{qfrom} < $p_top_hsp->{qend}) ? $p_top_hsp->{qfrom} : $p_top_hsp->{qend};
	575	my $ref_s = ($p_top_hsp->{sfrom} < $p_top_hsp->{send}) ? $p_top_hsp->{sfrom} : $p_top_hsp->{send};
	576	for ($k = $j; $k<$j+$hsp_count; $k++) {
	577	$self->{sbj}->[$k]->{qfrom} -= $ref_q; if ($self->{sbj}->[$k]->{qfrom} < 0) {$self->{sbj}->[$k]->{qfrom} += $len_q;}
	578	$self->{sbj}->[$k]->{qend} -= $ref_q; if ($self->{sbj}->[$k]->{qend} < 0) {$self->{sbj}->[$k]->{qend} += $len_q;}
	579	$self->{sbj}->[$k]->{sfrom} -= $ref_s; if ($self->{sbj}->[$k]->{sfrom} < 0) {$self->{sbj}->[$k]->{sfrom} += $len_s;}
	580	$self->{sbj}->[$k]->{send} -= $ref_s; if ($self->{sbj}->[$k]->{send} < 0) {$self->{sbj}->[$k]->{send} += $len_s;}
	581	}
	582	}
	583
	584	return;
	585	}
	586	########## reset_alignment_coor_for_circle_seq
	587
	588
	589	sub process_blout_blastp_blastn {
	590	my ($i, $j, $k, $i0, $j0, $k0);
	591	my $blout = shift;
	592	my @blhits = ();
	593
	594	#### need $len_rep
	595	my $len_rep = 0;
	596	my $bl = readblast_m8("", $blout);
	597	if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
	598	if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
	599
	600	if ($g_iden == 0 ) { #### Local identity
	601	keep_top_hsp($bl); #### local alignment, only the top HSP
	602
	603	for ($i=0; $i<$bl->{no}; $i++) {
	604	my $p = $bl->{sbj}->[$i];
	605	my ($id1, $len_sub) = split(/\./, $p->{id});
	606	my $frame = $p->{frame};
	607	if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
	608	my $iden = $p->{iden};
	609	next unless (($len_sub >0) and ($len_rep>0));
	610	my $cov_aS = $p->{alnln} / $len_sub;
	611	my $cov_aL = $p->{alnln} / $len_rep;
	612	my $exp1 = $p->{expect};
	613
	614	if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
	615	push(@blhits, [$id1, $iden, $p->{alnln}, $exp1, $frame]);
	616	}
	617	}
	618	return @blhits;
	619	} #### END if ($g_iden == 0 )
	620	else { #### Global idnetity
	621	if (($blast_prog eq "blastn") and $circle) { reset_alignment_coor_for_circle_seq($bl); }
	622	#### get colinear non-overlapping HSPs
	623	my @hsp = (); #### [id, len, qfrom, qend, sbegin, send, expect]
	624	my $iden_letters = 0;
	625	my $aln_letters = 0;
	626	my @aln_lens = ();
	627	my $hsp_no = 0;
	628	for ($i=0; $i<$bl->{no}; $i++) {
	629	my $p = $bl->{sbj}->[$i];
	630	my ($id1, $len_sub) = split(/\./, $p->{id});
	631	my $frame = $p->{frame};
	632	if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
	633	next unless (($len_sub >0) and ($len_rep>0));
	634
	635	if ($hsp_no) {
	636	if ($id1 ne $hsp[0]->[0]) {
	637	#### 1. parse previous subject's HSPs
	638	my $iden = int($iden_letters / $hsp[0]->[1] * 10000)/100;
	639	my $cov_aS = $aln_letters / $hsp[0]->[1];
	640	my $cov_aL = $aln_letters / $len_rep;
	641	my $exp1 = $hsp[0]->[6];
	642	my $frame = $hsp[0]->[7];
	643
	644	if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
	645	#push(@blhits, [$hsp[0]->[0], $iden, $aln_letters, $exp1, $frame]);
	646	push(@blhits, [$hsp[0]->[0], $iden, join(":", @aln_lens), $exp1, $frame]);
	647	}
	648	#### 2. init some values
	649	@hsp = ();
	650	$iden_letters = 0;
	651	$aln_letters = 0;
	652	@aln_lens = ();
	653	$hsp_no = 0;
	654	}
	655	}
	656
	657	#check whether overlap with previous high score HSPs
	658	my $overlap_flag = 0;
	659	for ($j=0; $j<$hsp_no; $j++) {
	660	if (overlap1($p->{qfrom}, $p->{qend}, $hsp[$j]->[2], $hsp[$j]->[3])) { $overlap_flag = 1; last; }
	661	if (overlap1($p->{sfrom}, $p->{send}, $hsp[$j]->[4], $hsp[$j]->[5])) { $overlap_flag = 1; last; }
	662	}
	663	next if ($overlap_flag);
	664
	665	#check whether this HSP cross with previous high score HSPs
	666	my $cross_flag = 0;
	667	for ($j=0; $j<$hsp_no; $j++) {
	668	if (cross1($p->{qfrom}, $p->{qend}, $hsp[$j]->[2], $hsp[$j]->[3],
	669	$p->{sfrom}, $p->{send}, $hsp[$j]->[4], $hsp[$j]->[5])) {
	670	$cross_flag = 1; last;
	671	}
	672	}
	673	next if ($cross_flag);
	674
	675	push(@hsp, [$id1, $len_sub, $p->{qfrom}, $p->{qend}, $p->{sfrom}, $p->{send}, $p->{expect}, $p->{frame}]);
	676	$iden_letters += int($p->{iden} * $p->{alnln} / 100);
	677	$aln_letters += $p->{alnln};
	678	push(@aln_lens, $p->{alnln});
	679	$hsp_no++;
	680	}
	681
	682	if ($hsp_no) { #last record
	683	#### 1. parse previous subject's HSPs
	684	my $iden = int($iden_letters / $hsp[0]->[1] * 10000)/100;
	685	my $cov_aS = $aln_letters / $hsp[0]->[1];
	686	my $cov_aL = $aln_letters / $len_rep;
	687	my $exp1 = $hsp[0]->[6];
	688	my $frame = $hsp[0]->[7];
	689
	690	if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
	691	#push(@blhits, [$hsp[0]->[0], $iden, $aln_letters, $exp1, $frame]);
	692	push(@blhits, [$hsp[0]->[0], $iden, join(":", @aln_lens), $exp1, $frame]);
	693	}
	694	}
	695
	696	return @blhits;
	697	}
	698	}
	699	########## END process_blout_blastp_blastn
	700
	701
	702	sub overlap1 {
	703	my ($b1, $e1, $b2, $e2) = @_;
	704
	705	my $t; ###
	706	if ($e1 < $b1) { $t = $e1; $e1 = $b1; $b1 = $t; }
	707	if ($e2 < $b2) { $t = $e2; $e2 = $b2; $b2 = $t; }
	708
	709	return 0 if ($e2 < $b1);
	710	return 0 if ($b2 > $e1);
	711	return ( ($e1<$e2)? $e1:$e2 )-( ($b1>$b2)? $b1:$b2);
	712	}
	713	########## END overlap1
	714
	715	## modified on 2013_0818 to handle +- frames
	716	sub cross1 {
	717	my ($q_b1, $q_e1, $q_b2, $q_e2,
	718	$s_b1, $s_e1, $s_b2, $s_e2) = @_;
	719
	720	my $fr_q1 = ($q_b1 < $q_e1) ? 1 : -1;
	721	my $fr_q2 = ($q_b2 < $q_e2) ? 1 : -1;
	722	my $fr_s1 = ($s_b1 < $s_e1) ? 1 : -1;
	723	my $fr_s2 = ($s_b2 < $s_e2) ? 1 : -1;
	724
	725	my $fr1 = $fr_q1 * $fr_s1;
	726	my $fr2 = $fr_q2 * $fr_s2;
	727	return 1 if (($fr1 * $fr2) < 0); # one ++ and one +-
	728
	729	my $t;
	730	if ($q_e1 < $q_b1) { $t = $q_e1; $q_e1 = $q_b1; $q_b1 = $t; }
	731	if ($q_e2 < $q_b2) { $t = $q_e2; $q_e2 = $q_b2; $q_b2 = $t; }
	732	if ($s_e1 < $s_b1) { $t = $s_e1; $s_e1 = $s_b1; $s_b1 = $t; }
	733	if ($s_e2 < $s_b2) { $t = $s_e2; $s_e2 = $s_b2; $s_b2 = $t; }
	734
	735	# after above transformation
	736	# 0 q_b1 q_e1 q_b2 q_e2 qlen
	737	# query 5' ====================================================================
	738	# match \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|
	739	# subject 5' ========================================================================>>>>>> frame +
	740	# 0 s_b1 s_e1 s_b2 s_e2 slen
	741
	742	# match \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|
	743	# subject 3' ========================================================================>>>>>> frame -
	744	# slen s_e1 s_b1 s_e2 s_b2 0
	745
	746	if (($fr1 > 0) and ($fr2>0)) { # both ++
	747	return ( (($q_b2-$q_b1)*($s_b2-$s_b1) <0) ? 1 : 0);
	748	}
	749	else { # both --
	750	return ( (($q_b2-$q_b1)*($s_e1-$s_e2) <0) ? 1 : 0);
	751	}
	752
	753	}
	754	########## END cross1
	755
	756
	757	## version of cross1 from before 2013_0818, without +- frame handling
	758	sub cross1_before_2013_0818 {
	759	my ($q_b1, $q_e1, $q_b2, $q_e2,
	760	$s_b1, $s_e1, $s_b2, $s_e2) = @_;
	761
	762	my $t;
	763	if ($q_e1 < $q_b1) { $t = $q_e1; $q_e1 = $q_b1; $q_b1 = $t; }
	764	if ($q_e2 < $q_b2) { $t = $q_e2; $q_e2 = $q_b2; $q_b2 = $t; }
	765	if ($s_e1 < $s_b1) { $t = $s_e1; $s_e1 = $s_b1; $s_b1 = $t; }
	766	if ($s_e2 < $s_b2) { $t = $s_e2; $s_e2 = $s_b2; $s_b2 = $t; }
	767
	768	return ( (($q_b2-$q_b1)*($s_b2-$s_b1) <0) ? 1 : 0);
	769	}
	770	########## END cross1
	771
	772	sub readblast_m8 {
	773	my ($i, $j, $k, $ll, $no);
	774	my ($q_seq, $filename) = @_;
	775
	776
	777	my $fh = "BL" ;
	778	if ($bl_STDIN) { $fh = "STDIN"; }
	779	else { open($fh, $filename) \|\| return; }
	780
	781	my @this_sbj = ();
	782	$no = 0;
	783	while($ll = <$fh>) {
	784	chop($ll);
	785	my @lls = split(/\t/,$ll);
	786	my $frame = "";
	787	$frame .= ($lls[6] < $lls[7]) ? "+" : "-";
	788	$frame .= ($lls[8] < $lls[9]) ? "+" : "-";
	789	next unless ($lls[0] and $lls[1]);
	790	$this_sbj[$no] = {
	791	'qid' => $lls[0],
	792	'id' => $lls[1],
	793	'iden' => $lls[2],
	794	'alnln' => $lls[3],
	795	'ms' => $lls[4],
	796	'gap' => $lls[5],
	797	'qfrom' => $lls[6],
	798	'qend' => $lls[7],
	799	'sfrom' => $lls[8],
	800	'send' => $lls[9],
	801	'expect' => $lls[10],
	802	'score' => $lls[11],
	803	'frame' => $frame,
	804	};
	805
	806	$no++;
	807	# BLASTP 2.2.24 [Aug-08-2010]
	808	# Query: gi\|388328107\|pdb\|4DDG\|A Chain A, Crystal Structure Of Human Otub1UBCH5B~UBUB
	809	# Database: pdbaa.fa
	810	# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
	811	#gi\|388328107\|pdb\|4DDG\|A gi\|388328107\|pdb\|4DDG\|A 91.81 171 9 3 6 171 1 171 6e-89 323
	812	#gi\|388328107\|pdb\|4DDG\|A gi\|388328107\|pdb\|4DDG\|A 96.51 86 3 0 235 320 155 240 2e-41 166
	813	}
	814	close($fh) if (not $bl_STDIN);
	815
	816	my $self = {
	817	'no' => $no,
	818	'sbj' => [@this_sbj],
	819	};
	820	return $self;
	821	}
	822	########## END readblast_m8
	823
	824
	825	sub blast_formatdb {
	826	my ($i0, $i, $j, $k, $len1);
	827
	828	open(FDB, "> $tmp_db") \|\| die;
	829	$j = 0;
	830	$len1 = 0;
	831	for ($i0=$NR_no-1; $i0>=0; $i0--) { ### from shortest to longest
	832	$i = $NR_idx[$i0];
	833	last if ($idens[$i] eq "*"); ### last if reach rep
	834	next if ($lens[$i] < $opt_aL_lower_band);
	835	next if ($passeds[$i] and ($opt_g==0));
	836	my $seq = $seqs[$i];
	837	$seq =~ s/(.{70})/$1\n/g;
	838	$seq =~ s/\n$//;
	839	#print FDB ">$i $dess[$i]\n$seq\n";
	840	print FDB ">$i.$lens[$i]\n$seq\n";
	841	$j++;
	842	$len1 += $lens[$i];
	843	}
	844	close(FDB);
	845
	846	while(1) {
	847	opendir(SEQDB, $seq_dir) \|\| next;
	848	my @leftseqs = grep {/lock/} readdir(SEQDB);
	849	closedir(SEQDB);
	850
	851	last unless @leftseqs;
	852	sleep(3);
	853	}
	854
	855	return(0, 0) unless ($j > 0);
	856
	857	my $cmd_line = "$formatdb -i $tmp_db";
	858	$cmd_line = "$formatdb -in $tmp_db" if ($bl_plus);
	859	my $cmd = `$cmd_line`;
	860
	861	((-e "$tmp_db.phr") and (-e "$tmp_db.pin") and (-e "$tmp_db.psq")) \|\|
	862	((-e "$tmp_db.nhr") and (-e "$tmp_db.nin") and (-e "$tmp_db.nsq")) \|\|
	863	((-e "$tmp_db.00.phr") and (-e "$tmp_db.00.pin") and (-e "$tmp_db.00.psq")) \|\|
	864	((-e "$tmp_db.00.nhr") and (-e "$tmp_db.00.nin") and (-e "$tmp_db.00.nsq"))
	865	\|\| die "Can not formatdb";
	866
	867	return($j, $len1);
	868	}
	869	########## END blast_formatdb
	870
	871
	872	sub remove_blast_db {
	873	my ($i, $j, $k);
	874	$cmd = `rm -f $tmp_db`;
	875	$cmd = `rm -f $tmp_db.p*`;
	876	$cmd = `rm -f $tmp_db.n*`;
	877
	878	return;
	879	}
	880	########## END remove_blast_db
	881
	882
	883	my $common_usage = <<EOD;
	884
	885	Options
	886	input/output:
	887	-i in_dbname, required
	888	-o out_dbname, required
	889	-l length_of_throw_away_sequences, default 10
	890
	891	thresholds:
	892	-c clustering threshold (sequence identity), default 0.3
	893	-ce clustering threshold (blast expect), default -1,
	894	it means by default it doesn't use expect threshold,
	895	but with positive value, the program cluster seqs if similarities
	896	meet either identity threshold or expect threshold
	897	-G (1/0) use global identity? default 1
	898	two sequences Long (i.e. representative) and Short (redundant) may have multiple
	899	alignment fragments (i.e. HSPs), see:
	900	seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Long sequence
	901	\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| ///////////// i.e. representative
	902	\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| ///////////// sequence
	903	\|\|\|\|\|\|\|\|HSP 1 \|\|\|\| ////HSP 2 ///
	904	\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| /////////////
	905	\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| /////////////
	906	seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Short sequence
	907	<< length 1 >> << len 2 >> i.e. redundant
	908	<<<<<<<<<<<< length of short sequence >>>>>>>>>>>>>> sequence
	909
	910	total identical letters from all co-linear and non-overlapping HSPs
	911	Global identity = -------------------------------------------------------------------
	912	length of short sequence
	913	Local identity = identity of the top high score HSP
	914	if you prefer to use -G 0, it is suggested that you also
	915	use -aS, -aL, such as -aS 0.8, to prevent very short matches.
	916	-aL alignment coverage for the longer sequence, default 0.0
	917	if set to 0.9, the alignment must cover 90% of the sequence
	918	-aS alignment coverage for the shorter sequence, default 0.0
	919	if set to 0.9, the alignment must cover 90% of the sequence
	920	-g (1/0), default 0
	921	by cd-hit's default algorithm, a sequence is clustered to the first
	922	cluster that meet the threshold (fast cluster). If set to 1, the program
	923	will cluster it into the most similar cluster that meet the threshold
	924	(accurate but slow mode)
	925	but either 1 or 0 won't change the representatives of final clusters
	926	-circle (1/0), default 0
	927	when set to 1, treat sequences as circular sequence.
	928	bacterial genomes and plasmids are circular, but their genome coordinates may be arbitrary,
	929	the 2 HSPs below will be treated as non co-linear with -circle 0
	930	the 2 HSPs below will be treated as co-linear with -circle 1
	931	-------------circle-----------
	932	\| \|
	933	seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1
	934	\\\\\\\\ /////////////
	935	\\\\\\\\ /////////////
	936	HSP 2 -> ////HSP 1 /// <-HSP 2
	937	///////////// \\\\\\\\
	938	///////////// \\\\\\\\
	939	seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
	940	\| \|
	941	-----------circle--------------
	942	-sl, length of very long sequences to be skipped, default 0, no skipping
	943	e.g. -sl 5000 means sequences longer than 5000 aa will be treated as singleton clusters
	944	without clustering, to save time, especially when there is -aL option in place, very
	945	long sequences will not be clustered anyway.
	946	program:
	947	-prog (blastp, blastn, megablast, blastpgp), default blastp
	948	-p profile search para, default
	949	"-j 3 -F F -e 0.001 -b 500 -v 500"
	950	-dprof database for building PSSM, default using input
	951	you can also use another database that is more comprehensive like NR80
	952	-s blast search para, default
	953	"-F F -e 0.000001 -b 100000 -v 100000"
	954	-bs (1/0) default 1
	955	pipe blast results into the parser instead of saving them to the hard drive (saves time)
	956
	957	compute:
	958	-exec (qsub, local) default local
	959	this program writes a shell script to run blast, this script is
	960	either performed locally by sh or remotely by qsub
	961	with qsub, you can use PBS, SGE etc
	962	-host number of hosts, ie number of qsub jobs
	963	-para number of parallel blast job per qsub job (each blast can use multi cores), default 1
	964	-blp number of threads per blast job, default 1
	965	number of threads per blast job X number of parallel blast job per qsub job
	966	should <= the number of cores in your computer
	967	if your computer grid has 32 cores / node, do either of the followings
	968	-para 4 -blp 8
	969	-para 8 -blp 4
	970	-para 16 -blp 2
	971	-para 32 -blp 1
	972	-bat number of sequences a blast job to process
	973	-shf a filename for add local settings into the job shell script
	974	for example, when you run PBS jobs, you can add queue name etc in this
	975	file and this script will add them into the job shell script
	976	e.g. template file for PBS
	977	#!/bin/sh
	978	#PBS -v PATH
	979	#PBS -l walltime=8:00:00
	980	#PBS -q job_queue.q
	981
	982	e.g. template file for SGE or OGE
	983	#!/bin/sh
	984	#\$ -v PATH
	985	#\$ -q job_queue.q
	986	#\$ -V
	987	#\$ -pe orte 8
	988
	989	job:
	990	-rs steps of save restart file and clustering output, default 5000
	991	everytime after process 5000 sequences, program write a
	992	restart file and current clustering information
	993	-restart restart file, readin a restart file
	994	if the program crashed, stopped or was terminated, you can restart it by
	995	adding the option "-restart sth.restart"
	996	-rf steps of re format blast database, default 200,000
	997	if program clustered 200,000 seqs, it remove them from seq
	998	pool, and re format blast db to save time
	999	-J job, job_file, execute specific jobs like parsing blast output only
	1000	DO NOT use it, it is only used by this program itself
	1001	-k (1/0) keep blast raw output file, default $keep_bl
	1002
	1003	-P path to executables
	1004	EOD
	1005
	1006
	1007	sub print_usage {
	1008	print <<EOD;
	1009	Usage psi-cd-hit [Options]
	1010	$common_usage
	1011
	1012	==============================
	1013	by Weizhong Li, liwz\@sdsc.edu
	1014	==============================
	1015	If you find cd-hit useful, please kindly cite:
	1016
	1017	"Clustering of highly homologous sequences to reduce thesize of large protein database", Weizhong Li, Lukasz Jaroszewski & Adam GodzikBioinformatics, (2001) 17:282-283
	1018	"Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences", Weizhong Li & Adam Godzik Bioinformatics, (2006) 22:1658-1659
	1019
	1020	EOD
	1021	return;
	1022	}
	1023	########## END print_usage
	1024
	1025
	1026	## like above, but don't assign seqs to specific node
	1027	## while let nodes run them autoly
	1028	sub run_batch_blast3 {
	1029	my $i0 = shift;
	1030	my ($id, $i, $j, $k, $cmd);
	1031
	1032	#### wait before qsubs
	1033	if ($exec_mode eq "qsub") {
	1034	while(1) {
	1035	SGE_qstat_xml_query();
	1036	last unless (%qsub_ids);
	1037
	1038	my $wait_flag = 0;
	1039	foreach my $qsub_id (keys %qsub_ids) {
	1040	if (defined($qstat_xml_data{$qsub_id})) { #### still running
	1041	$wait_flag = 1;
	1042	$cmd = `qdel -f $qsub_id`; #### at this point, all running jobs are not necessary,
	1043	print LOG "force delete un necessary job $qsub_id\n";
	1044	}
	1045	else {
	1046	delete $qsub_ids{$qsub_id};
	1047	}
	1048	}
	1049
	1050	if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
	1051	}
	1052
	1053	#### delete seq files from last batch
	1054	opendir(DIR1, $seq_dir);
	1055	my @files = grep { /^\d/ } readdir(DIR1);
	1056	closedir(DIR1);
	1057	foreach $i (@files) {
	1058	$cmd = `rm -f $seq_dir/$i`;
	1059	print LOG "remove un necessary seq file $i\n"
	1060	}
	1061	}
	1062
	1063	my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
	1064
	1065	for ($k=0; $i0<$NR_no; $i0++) {
	1066	$id = $NR_idx[$i0];
	1067	next if ($passeds[$id]);
	1068	next if ($in_bg[$id]);
	1069	next if ($lens[$id] < $opt_aL_upper_band);
	1070	$in_bg[$id] = 1;
	1071
	1072	my $seq = $seqs[$id];
	1073	open(SEQ, "> $seq_dir/$id") \|\| die "Can not write";
	1074	#print SEQ "$dess[$id]\n$seq\n";
	1075	print SEQ ">$id.$lens[$id]\n$seq\n";
	1076	close(SEQ);
	1077	$k++;
	1078	last if ($k >= $total_jobs);
	1079	}
	1080
	1081	if ($exec_mode eq "qsub") {
	1082	for ($j=0; $j<$num_qsub; $j++) {
	1083	my $t = "psi-cd-hit-$j";
	1084	my $cmd = `qsub -N $t $remote_sh_script`;
	1085	my $qsub_id = 0;
	1086	if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
	1087	print LOG "qsub querying $j, PID $qsub_id\n";
	1088	$qsub_ids{$qsub_id} = 1;
	1089	}
	1090	}
	1091	elsif ($exec_mode eq "local") {
	1092	#my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
	1093	my $cmd = `sh $remote_sh_script`;
	1094	}
	1095
	1096	return;
	1097	}
	1098	########## END run_batch_blast3
	1099
	1100
	1101	sub write_remote_sh_script {
	1102	my ($i, $j, $k);
	1103	my $local_sh = <<EOD;
	1104	#!/bin/sh
	1105	#PBS -v PATH
	1106	#\$ -v PATH
	1107	EOD
	1108
	1109	if ($sh_file) {
	1110	$local_sh = `cat $sh_file`;
	1111	}
	1112
	1113	open(RESH, "> $remote_sh_script") \|\| die;
	1114	print RESH <<EOD;
	1115	$local_sh
	1116
	1117	cd $pwd
	1118	EOD
	1119
	1120	for ($k=0; $k<$para_no; $k++){
	1121	print RESH "./$remote_perl_script $k&\n"
	1122	}
	1123	print RESH "wait\n\n";
	1124
	1125	close(RESH);
	1126	return;
	1127	}
	1128	########## END write_remote_sh_script
	1129
	1130	sub write_remote_perl_script {
	1131	my $dir1 = ".";
	1132	my $bl2 = "$blast_exe -d $dir1/$tmp_db $bl_para";
	1133	$bl2 = "$blast_exe -db $dir1/$tmp_db $bl_para" if ($bl_plus);
	1134
	1135	my $opti = "-i"; $opti = "-query" if ($bl_plus);
	1136	my $opto = "-o"; $opto = "-out" if ($bl_plus);
	1137
	1138	open(REPERL, "> $remote_perl_script") \|\| die;
	1139	print REPERL <<EOD;
	1140	#!/usr/bin/perl
	1141	\$host = shift;
	1142	\$arg = shift;
	1143
	1144	#### random sleep, rand() can be a fraction of second
	1145	select(undef,undef,undef,rand());
	1146
	1147	if (\$arg) {
	1148	\@ids = split(/,/, \$arg);
	1149	}
	1150	else {
	1151	while(1) {
	1152	if (opendir(DDIR, "$seq_dir")) {
	1153	\@ids = grep {/^\\d+\$/} readdir(DDIR);
	1154	last;
	1155	}
	1156	else {
	1157	sleep(1);
	1158	}
	1159	}
	1160	}
	1161
	1162	foreach \$id (\@ids) {
	1163
	1164	next unless (-e "$seq_dir/\$id");
	1165	next if (-e "$seq_dir/\$id.lock");
	1166	\$cmd = `touch $seq_dir/\$id.lock`;
	1167
	1168	if ($bl_STDIN) {
	1169	\$cmd = `$bl2 $opti $seq_dir/\$id \| $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
	1170	}
	1171	else {
	1172	\$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
	1173	\$cmd = `$script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0`;
	1174	}
	1175	\$cmd = `rm -f $seq_dir/\$id`;
	1176	\$cmd = `rm -f $seq_dir/\$id.lock`;
	1177	}
	1178
	1179	(\$tu, \$ts, \$cu, \$cs) = times();
	1180	\$tt = \$tu + \$ts + \$cu + \$cs;
	1181	\$cmd = `echo \$tt >> $seq_dir/host.\$host.cpu`;
	1182
	1183	EOD
	1184	close(REPERL);
	1185	my $cmd = `chmod 755 $remote_perl_script`;
	1186
	1187	return;
	1188	}
	1189	########## END write_remote_perl_script
	1190
	1191
	1192	sub wait_blast_out {
	1193	my $out = shift;
	1194	print LOG "waiting for $out";
	1195	while(1) {
	1196	if (-e $out) {
	1197	my $last = `tail -1 $out`;
	1198	chop($last);
	1199	last if ($last =~ /^#$/);
	1200	}
	1201	sleep(1);
	1202	print LOG ".";
	1203	}
	1204	print LOG "\n";
	1205
	1206	return;
	1207	}
	1208	########## END wait_blast_out
	1209
	1210
	1211	sub SGE_qstat_xml_query {
	1212	my ($i, $j, $k, $cmd, $ll);
	1213	%qstat_xml_data = (); #### global
	1214	$cmd = `qstat -f -xml`;
	1215	if ($cmd =~ /<queue_info/) { #### dummy
	1216	$qstat_xml_data{"NULL"}= ["NULL","NULL"];
	1217	}
	1218	my $tmp = <<EOD;
	1219	<?xml version='1.0'?>
	1220	<job_info xmlns:xsd="http://gridscheduler.svn.sourceforge.net/viewvc/gridscheduler/trunk/source/dist/util/resources/schemas/qstat/qstat.xsd?revision=11">
	1221	<queue_info>
	1222	<Queue-List>
	1223	<name>all.q\@master</name>
	1224	<qtype>BIP</qtype>
	1225	<slots_used>0</slots_used>
	1226	<slots_resv>0</slots_resv>
	1227	<slots_total>0</slots_total>
	1228	<load_avg>0.08000</load_avg>
	1229	<arch>linux-x64</arch>
	1230	</Queue-List>
	1231	...
	1232	<Queue-List>
	1233	<name>all.q\@node016</name>
	1234	<qtype>BIP</qtype>
	1235	<slots_used>32</slots_used>
	1236	<slots_resv>0</slots_resv>
	1237	<slots_total>32</slots_total>
	1238	<load_avg>42.59000</load_avg>
	1239	<arch>linux-x64</arch>
	1240	<job_list state="running"> ####### running jobs in this section
	1241	<JB_job_number>3535</JB_job_number>
	1242	<JAT_prio>0.51468</JAT_prio>
	1243	<JB_name>cd-hit</JB_name>
	1244	<JB_owner>ubuntu</JB_owner>
	1245	<state>r</state>
	1246	<slots>4</slots>
	1247	</job_list>
	1248	...
	1249	</queue_info>
	1250	<job_info>
	1251	<job_list state="pending"> ######## pending jobs in this section
	1252	<JB_job_number>3784</JB_job_number>
	1253	<JAT_prio>0.60500</JAT_prio>
	1254	<JB_name>cd-hit</JB_name>
	1255	<JB_owner>ubuntu</JB_owner>
	1256	<state>qw</state>
	1257	<slots>32</slots>
	1258	</job_list>
	1259	...
	1260	</job_info>
	1261	</job_info>
	1262
	1263	EOD
	1264	my @lls = split(/\n/, $cmd);
	1265	$i = 2; #### skip first 2 lines
	1266	for (; $i<$#lls+1; $i++) {
	1267	if ($lls[$i] =~ /<job_list/) {
	1268	my ($id, $name, $state);
	1269	for (; $i<$#lls+1; $i++) {
	1270	last if ($lls[$i] =~ /<\/job_list/);
	1271	if ($lls[$i] =~ /<JB_job_number>(\d+)/) { $id = $1;}
	1272	if ($lls[$i] =~ /<JB_name>([^<]+)/) { $name = $1;}
	1273	if ($lls[$i] =~ /<state>([^<]+)/) {$state = $1;}
	1274	}
	1275	if (defined($id) and defined($name) and defined($state)) {
	1276	$qstat_xml_data{$id} = [$name, $state];
	1277	}
	1278	}
	1279	}
	1280	}
	1281
	1282
	1283	1;
	1284

+501

-92

psi-cd-hit/psi-cd-hit-local.pl less more

2	2	######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
3	3	################################################################################
4	4	our $pid = $$;
5		our $db_in = ""; ###################
6		our $db_out = ""; # input / output
	5	our $db_in; ###################
	6	our $db_out; # input / output
7	7	our $len_t = 10; ###################
8	8	our $NR_clstr = 0.3; #
9	9	our $NR_clstre = -1; #thresholds

14	14	our $opt_g = 1; ####################
15	15	our $blast_exe = "blastall -p blastp -m 8"; #########################
16	16	our $prof_exe = "blastpgp -m 8"; #
17		our $prof_para = "-j 3 -F F -e 0.001 -b 500 -v 500"; #
	17	our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; #
18	18	our $prof_db = ""; #
19		our $bl_para = "-F F -e 0.000001 -b 100000 -v 100000"; # program
	19	our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program
20	20	our $bl_STDIN = 1; #
21	21	our $keep_bl = 0; #
22	22	our $blast_prog= "blastp"; #
23	23	our $formatdb = "formatdb"; #########################
24	24	our $exec_mode = "local"; #######################
25		our $host_no = 1; #
26		our $core_no = 1; # compute
	25	our $num_qsub = 1; #
	26	our $para_no = 1; # compute
27	27	our $sh_file = ""; #
28		our $batch_no_per_node = 50; #######################
	28	our $num_multi_seq = 50; #
	29	our $batch_no_per_node = 100; #######################
29	30	our $reformat_seg = 50000;
30	31	our $restart_seg = 20000;
31	32	our $job = "";

38	39	our $db_out1;
39	40	our $seq_dir;
40	41	our $bl_dir;
	42	our $blm_dir;
41	43	our $restart_file;
42	44	our $tmp_db;
43	45	our $remote_perl_script;
44	46	our $remote_sh_script;
45	47	our $bl_path;
	48	our $bl_plus = 1; #### use blast+
	49	our $bl_threads = 1;
	50	our $skip_long = 0;
	51	our %qsub_ids = (); #### a list of qsub ids
	52	our %qstat_xml_data = ();
	53	our @blm8_buffer = ();
	54	our %blm8_data = ();
	55
46	56
47	57	sub parse_para_etc {
48	58	my ($arg, $cmd);

59	69	elsif ($arg eq "-aS") { $opt_aS = shift; }
60	70	elsif ($arg eq "-g") { $opt_g = shift; }
61	71	elsif ($arg eq "-circle") { $circle = shift; }
	72	elsif ($arg eq "-sl") { $skip_long = shift; }
62	73	## program
63	74	elsif ($arg eq "-prog") { $blast_prog= shift; }
64	75	elsif ($arg eq "-p") { $prof_para = shift; }
65		elsif ($arg eq "-dprof") { $prof_db = shift; }
	76	elsif ($arg eq "-dprof") { $prof_db = shift; die "option -dprof no longer supported!";}
66	77	elsif ($arg eq "-s") { $bl_para = shift; }
67	78	elsif ($arg eq "-k") { $keep_bl = shift; }
68	79	elsif ($arg eq "-bs") { $bl_STDIN = shift; }
69	80	## compute
70	81	elsif ($arg eq "-exec") { $exec_mode = shift; }
71		elsif ($arg eq "-host") { $host_no = shift; }
72		elsif ($arg eq "-core") { $core_no = shift; }
	82	elsif ($arg eq "-host") { $num_qsub = shift; }
	83	elsif ($arg eq "-para") { $para_no = shift; }
73	84	elsif ($arg eq "-shf") { $sh_file = shift; }
	85	elsif ($arg eq "-blp") { $bl_threads = shift; }
	86	elsif ($arg eq "-bat") { $batch_no_per_node = shift; }
74	87	## job:
75	88	elsif ($arg eq "-rs") { $restart_seg = shift; }
76	89	elsif ($arg eq "-rf") { $reformat_seg= shift; }

82	95	}
83	96
84	97	# speical jobs
85		if ($job eq "parse_blout") { job_parse_blout(); exit();}
	98	if ($job eq "parse_blout") { job_parse_blout(); exit();}
	99	elsif ($job eq "parse_blout_multi") { job_parse_blout_multi(); exit();}
	100
	101	if (not (defined($db_in) and defined($db_out))) {
	102	print_usage(); exit();
	103	}
86	104
87	105	if ($blast_prog eq "blastn") {
88	106	$formatdb = "formatdb -p F";

94	112	$blast_exe = "megablast -H 100 -D 2 -m 8";
95	113	}
96	114	elsif ($blast_prog eq "blastpgp") {
97		$blast_exe = ($prof_db) ? "blastpgp -m 8" : "blastpgp -m 8 -j 3";
	115	$blast_exe = "blastpgp -m 8 -j 3";
	116	}
	117
	118	#### for blast+
	119	if ($bl_plus) {
	120	$formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
	121	$blast_exe = "blastp -outfmt 6";
	122	$bl_para = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
	123
	124	if ($blast_prog eq "blastn") {
	125	$formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
	126	$blast_exe = "blastn -task blastn -outfmt 6";
	127	$bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
	128	}
	129	elsif ($blast_prog eq "megablast") {
	130	$blast_prog = "blastn"; #### back to blastn for blast parser type
	131	$formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
	132	$blast_exe = "blastn -task megablast -outfmt 6";
	133	$bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
	134	}
	135	elsif ($blast_prog eq "blastpgp") {
	136	$blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
	137	}
98	138	}
99	139
100	140	if ($bl_path) {

110	150	$db_out1 = "$db_out.out";
111	151	$seq_dir = "$db_in-seq";
112	152	$bl_dir = "$db_in-bl";
	153	$blm_dir = "$db_in-blm";
113	154	$restart_file =" $db_out.restart";
114	155
115	156	$tmp_db = "$db_in.$pid";
116	157	$remote_perl_script = "$tmp_db-bl.pl";
117	158	$remote_sh_script = "$tmp_db-bl.sh";
118	159
119		$cmd = `mkdir $bl_dir $seq_dir`;
	160	$cmd = `mkdir $bl_dir $blm_dir $seq_dir`;
120	161
121	162	write_remote_perl_script();
122	163	write_remote_sh_script();

137	178	$seq =~ s/\s//g;
138	179	if (length($seq) > $len_t) { add_seq($des, $seq); }
139	180	$des = $ll; $seq = "";
	181
140	182	}
141	183	else { $seq .= $ll; }
142	184	}

154	196
155	197	sub add_seq {
156	198	my ($des, $seq) = @_;
	199	$des =~ s/\s.+$//;
157	200	push(@seqs, $seq);
158	201	push(@dess, $des);
159	202	push(@lens, length($seq));

179	222	}
180	223	########## END open_LOG
181	224
	225	sub write_LOG {
	226	my $txt=shift;
	227	print LOG "$txt\n";
	228	}
182	229
183	230	{## use static variables
184	231	my $last_NR90_no=0;

222	269	sub total_remote_cpu {
223	270	my ($i, $j, $k, $ll);
224	271	my $tt = 0;
225		for ($j=0; $j<$host_no; $j++) {
	272	for ($j=0; $j<$num_qsub; $j++) {
226	273	open(TCPU, "$seq_dir/host.$j.cpu") \|\| next;
227	274	while($ll = <TCPU>) {
228	275	chop($ll);

233	280	return $tt;
234	281	}
235	282	########## END total_remote_cpu
	283
	284	#### process m8 format output from multi-query search
	285	sub job_parse_blout_multi{
	286	my ($i, $j, $k, $tfh, $ll, $t1, $t2);
	287
	288	$tfh="BLM8";
	289	open($tfh, $job_file) \|\| die "can not open $job_file";
	290
	291	@blm8_buffer = ();
	292	my $last_id = "";
	293	my $this_id = "";
	294	my $tquery;
	295	while($ll = <$tfh>) {
	296	next if ($ll =~ /^#/);
	297	($this_id, $t1) = split(/\s+/, $ll, 2);
	298
	299	if (@blm8_buffer and ($this_id ne $last_id)) { #### blast results of last query
	300	my @hits = process_blout_blastp_blastn();
	301	$tquery = (split(/\./, $last_id))[0];
	302	my $no1 = $#hits+1;
	303	print ">$tquery\t$no1\n";
	304	foreach $i (@hits) {
	305	print join("\t", @{$i}), "\n";
	306	}
	307	print "#\n";
	308	@blm8_buffer = ();
	309	}
	310	push(@blm8_buffer, $ll);
	311	$last_id = $this_id;
	312	}
	313
	314	if (@blm8_buffer and ($this_id ne $last_id)) { #### blast results of last query
	315	my @hits = process_blout_blastp_blastn();
	316	$tquery = (split(/\./, $last_id))[0];
	317	my $no1 = $#hits+1;
	318	print ">$tquery\t$no1\n";
	319	foreach $i (@hits) {
	320	print join("\t", @{$i}), "\n";
	321	}
	322	print "#\n";
	323	@blm8_buffer = ();
	324	}
	325	close($tfh);
	326	return;
	327	}
	328	########## END job_parse_blout_multi
236	329
237	330
238	331	sub job_parse_blout {

376	469	}
377	470	########## END remove_raw_blout_bg
378	471
379
380		sub fish_other_homolog {
	472	sub fish_other_homolog_multi {
381	473	my ($i, $j, $k, $i0, $j0, $k0);
382	474	$id = shift; # real idx, not sorted idx
383	475	my @hits = ();
384	476
385		wait_blast_out("$bl_dir/$id.out");
386		open(BLPOUT, "$bl_dir/$id.out") \|\| return;
387		while($i=<BLPOUT>) {
388		last if ($i =~ /^#/);
389		chop($i);
390		push(@hits, [split(/\t/,$i)]);
391		}
392		close(BLPOUT);
	477	if (defined($blm8_data{$id})) {
	478	@hits = @{$blm8_data{$id}};
	479	}
	480
393	481	my $rep_len = $lens[$id];
394	482
395	483	foreach $i (@hits) {
396	484	my $id1 = $i->[0];
397		next unless ($id1 < $NR_no);
	485	next unless ($id1 < $NR_no);
398	486	next if ($idens[$id1] eq "*"); #existing reps
399	487	next if ($lens[$id1] > $rep_len); # in opt_g=1 mode, preventing it from being clustered into short rep
400	488

413	501	$NR_clstr_nos[$id1] = $NR90_no;
414	502	$NR_passed++;
415	503	}
	504	if (defined($blm8_data{$id})) {
	505	delete $blm8_data{$id};
	506	}
	507	return;
	508	}
	509	########## END fish_other_homolog_multi
	510
	511
	512	sub fish_other_homolog {
	513	my ($i, $j, $k, $i0, $j0, $k0);
	514	$id = shift; # real idx, not sorted idx
	515	my @hits = ();
	516
	517	wait_blast_out("$bl_dir/$id.out");
	518	open(BLPOUT, "$bl_dir/$id.out") \|\| return;
	519	while($i=<BLPOUT>) {
	520	last if ($i =~ /^#/);
	521	chop($i);
	522	push(@hits, [split(/\t/,$i)]);
	523	}
	524	close(BLPOUT);
	525	my $rep_len = $lens[$id];
	526
	527	foreach $i (@hits) {
	528	my $id1 = $i->[0];
	529	next unless ($id1 < $NR_no);
	530	next if ($idens[$id1] eq "*"); #existing reps
	531	next if ($lens[$id1] > $rep_len); # in opt_g=1 mode, preventing it from being clustered into short rep
	532
	533	if ( $passeds[$id1] ) { #### if this hit is better -g 1 mode
	534	my $old_e = (split(/\//,$idens[$id1]))[0];
	535	if ($i->[3] < $old_e) {
	536	$idens[$id1] = "$i->[3]/$i->[2]aa/$i->[1]%";
	537	$passeds[$id1] = 1;
	538	$NR_clstr_nos[$id1] = $NR90_no;
	539	}
	540	next;
	541	}
	542
	543	$idens[$id1] = "$i->[3]/$i->[2]aa/$i->[1]%";
	544	$passeds[$id1] = 1;
	545	$NR_clstr_nos[$id1] = $NR90_no;
	546	$NR_passed++;
	547	}
416	548	return;
417	549	}
418	550	########## END fish_other_homolog

481	613	for ($i=0; $i<$self->{no}; $i++) {
482	614	my $p = $self->{sbj}->[$i];
483	615	my ($id1, $len_sub) = split(/\./, $p->{id});
	616	next unless ($len_sub >0) ;
484	617
485	618	if (not defined($id_exist{$id1})) {
486	619	$id_exist{$id1} = 1;

555	688
556	689	#### need $len_rep
557	690	my $len_rep = 0;
558		my $bl = readblast_m8("", $blout);
	691	my $bl = defined($blout) ? readblast_m8("", $blout) : readblast_m8_buffer();
559	692	if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
560	693	if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
561	694

568	701	my $frame = $p->{frame};
569	702	if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
570	703	my $iden = $p->{iden};
	704	next unless (($len_sub >0) and ($len_rep>0));
571	705	my $cov_aS = $p->{alnln} / $len_sub;
572	706	my $cov_aL = $p->{alnln} / $len_rep;
573	707	my $exp1 = $p->{expect};

591	725	my ($id1, $len_sub) = split(/\./, $p->{id});
592	726	my $frame = $p->{frame};
593	727	if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
	728	next unless (($len_sub >0) and ($len_rep>0));
594	729
595	730	if ($hsp_no) {
596	731	if ($id1 ne $hsp[0]->[0]) {

729	864	}
730	865	########## END cross1
731	866
732		sub readblast_m8 {
	867	sub readblast_m8_buffer {
733	868	my ($i, $j, $k, $ll, $no);
734		my ($q_seq, $filename) = @_;
735
736
737		my $fh = "BL" ;
738		if ($bl_STDIN) { $fh = "STDIN"; }
739		else { open($fh, $filename) \|\| return; }
740
741	869	my @this_sbj = ();
742	870	$no = 0;
743		while($ll = <$fh>) {
	871	while($ll = shift @blm8_buffer) {
744	872	chop($ll);
745	873	my @lls = split(/\t/,$ll);
746	874	my $frame = "";
747	875	$frame .= ($lls[6] < $lls[7]) ? "+" : "-";
748	876	$frame .= ($lls[8] < $lls[9]) ? "+" : "-";
	877	next unless ($lls[0] and $lls[1]);
749	878	$this_sbj[$no] = {
750	879	'qid' => $lls[0],
751	880	'id' => $lls[1],

770	899	#gi\|388328107\|pdb\|4DDG\|A gi\|388328107\|pdb\|4DDG\|A 91.81 171 9 3 6 171 1 171 6e-89 323
771	900	#gi\|388328107\|pdb\|4DDG\|A gi\|388328107\|pdb\|4DDG\|A 96.51 86 3 0 235 320 155 240 2e-41 166
772	901	}
	902	my $self = {
	903	'no' => $no,
	904	'sbj' => [@this_sbj],
	905	};
	906	return $self;
	907	}
	908	########## END readblast_m8_buffer
	909
	910	sub readblast_m8 {
	911	my ($i, $j, $k, $ll, $no);
	912	my ($q_seq, $filename) = @_;
	913
	914
	915	my $fh = "BL" ;
	916	if ($bl_STDIN) { $fh = "STDIN"; }
	917	else { open($fh, $filename) \|\| return; }
	918
	919	my @this_sbj = ();
	920	$no = 0;
	921	while($ll = <$fh>) {
	922	chop($ll);
	923	my @lls = split(/\t/,$ll);
	924	my $frame = "";
	925	$frame .= ($lls[6] < $lls[7]) ? "+" : "-";
	926	$frame .= ($lls[8] < $lls[9]) ? "+" : "-";
	927	next unless ($lls[0] and $lls[1]);
	928	$this_sbj[$no] = {
	929	'qid' => $lls[0],
	930	'id' => $lls[1],
	931	'iden' => $lls[2],
	932	'alnln' => $lls[3],
	933	'ms' => $lls[4],
	934	'gap' => $lls[5],
	935	'qfrom' => $lls[6],
	936	'qend' => $lls[7],
	937	'sfrom' => $lls[8],
	938	'send' => $lls[9],
	939	'expect' => $lls[10],
	940	'score' => $lls[11],
	941	'frame' => $frame,
	942	};
	943
	944	$no++;
	945	# BLASTP 2.2.24 [Aug-08-2010]
	946	# Query: gi\|388328107\|pdb\|4DDG\|A Chain A, Crystal Structure Of Human Otub1UBCH5B~UBUB
	947	# Database: pdbaa.fa
	948	# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
	949	#gi\|388328107\|pdb\|4DDG\|A gi\|388328107\|pdb\|4DDG\|A 91.81 171 9 3 6 171 1 171 6e-89 323
	950	#gi\|388328107\|pdb\|4DDG\|A gi\|388328107\|pdb\|4DDG\|A 96.51 86 3 0 235 320 155 240 2e-41 166
	951	}
773	952	close($fh) if (not $bl_STDIN);
774	953
775	954	my $self = {

790	969	for ($i0=$NR_no-1; $i0>=0; $i0--) { ### from shortest to longest
791	970	$i = $NR_idx[$i0];
792	971	last if ($idens[$i] eq "*"); ### last if reach rep
	972	next if ($lens[$i] < $opt_aL_lower_band);
793	973	next if ($passeds[$i] and ($opt_g==0));
794	974	my $seq = $seqs[$i];
795	975	$seq =~ s/(.{70})/$1\n/g;

812	992
813	993	return(0, 0) unless ($j > 0);
814	994
815		my $cmd = `$formatdb -i $tmp_db`;
	995	my $cmd_line = "$formatdb -i $tmp_db";
	996	$cmd_line = "$formatdb -in $tmp_db" if ($bl_plus);
	997	my $cmd = `$cmd_line`;
	998
816	999	((-e "$tmp_db.phr") and (-e "$tmp_db.pin") and (-e "$tmp_db.psq")) \|\|
817	1000	((-e "$tmp_db.nhr") and (-e "$tmp_db.nin") and (-e "$tmp_db.nsq")) \|\|
818	1001	((-e "$tmp_db.00.phr") and (-e "$tmp_db.00.pin") and (-e "$tmp_db.00.psq")) \|\|

841	1024	input/output:
842	1025	-i in_dbname, required
843	1026	-o out_dbname, required
844		-l length_of_throw_away_sequences, default 10
	1027	-l length_of_throw_away_sequences, default $len_t
845	1028
846	1029	thresholds:
847		-c clustering threshold (sequence identity), default 0.3
848		-ce clustering threshold (blast expect), default -1,
	1030	-c clustering threshold (sequence identity), default $NR_clstr
	1031	-ce clustering threshold (blast expect), default $NR_clstre,
849	1032	it means by default it doesn't use expect threshold,
850	1033	but with positive value, the program cluster seqs if similarities
851	1034	meet either identity threshold or expect threshold
852		-G (1/0) use global identity? default 1
	1035	-G (1/0) use global identity? default $g_iden
853	1036	two sequences Long (i.e. representative) and Short (redunant) may have multiple
854	1037	alignment fragments (i.e. HSPs), see:
855	1038	seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Long sequence

868	1051	Local identity = identity of the top high score HSP
869	1052	if you prefer to use -G 0, it is suggested that you also
870	1053	use -aS, -aL, such as -aS 0.8, to prevent very short matches.
871		-aL alignment coverage for the longer sequence, default 0.0
	1054	-aL alignment coverage for the longer sequence, default $opt_aL
872	1055	if set to 0.9, the alignment must covers 90% of the sequence
873		-aS alignment coverage for the shorter sequence, default 0.0
	1056	-aS alignment coverage for the shorter sequence, default $opt_aS
874	1057	if set to 0.9, the alignment must covers 90% of the sequence
875		-g (1/0), default 0
	1058	-g (1/0), default $opt_g
876	1059	by cd-hit's default algorithm, a sequence is clustered to the first
877	1060	cluster that meet the threshold (fast cluster). If set to 1, the program
878	1061	will cluster it into the most similar cluster that meet the threshold
879	1062	(accurate but slow mode)
880	1063	but either 1 or 0 won't change the representatives of final clusters
881		-circle (1/0), default 0
	1064	-circle (1/0), default $circle
882	1065	when set to 1, treat sequences as circular sequence.
883	1066	bacterial genomes, plasmids are circular, but their genome coordinate maybe arbitary,
884	1067	the 2 HSPs below will be treated as non co-linear with -circle 0

886	1069	-------------circle-----------
887	1070	\| \|
888	1071	seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1
889		\\\\\\\\ /////////////
890		\\\\\\\\ /////////////
	1072	\\\\\\\\\\\\\\\\ /////////////
	1073	\\\\\\\\\\\\\\\\ /////////////
891	1074	HSP 2 -> ////HSP 1 /// <-HSP 2
892		///////////// \\\\\\\\
893		///////////// \\\\\\\\
	1075	///////////// \\\\\\\\\\\\\\\\
	1076	///////////// \\\\\\\\\\\\\\\\
894	1077	seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2
895	1078	\| \|
896	1079	-----------circle--------------
	1080	-sl, length of very long sequences to be skipped, default $skip_long,
	1081	e.g. -sl 5000 means sequences longer than 5000 aa will be treated as singleton clusters
	1082	without clustering, to save time, especially when there is -aL option in place, very
	1083	long sequences will not be clustered anyway.
	1084	-sl 0 means no skipping
897	1085	program:
898		-prog (blastp, blastn, megablast, blastpgp), default blastp
899		-p profile search para, default
900		"-j 3 -F F -e 0.001 -b 500 -v 500"
	1086	-prog (blastp, blastn, megablast, blastpgp), default $blast_prog
	1087	-p profile search para, default
	1088	"$prof_para"
901	1089	-dprof database for building PSSM, default using input
902	1090	you can also use another database that is more comprehensive like NR80
903		-s blast search para, default
904		"-F F -e 0.000001 -b 100000 -v 100000"
905		-bs (1/0) default 1
	1091	-s blast search para, default
	1092	"$bl_para"
	1093	-bs (1/0) default $bl_STDIN
906	1094	pipe blast results from into parser instead of save in hard drive (save time)
907	1095
908	1096	compute:
909		-exec (qsub, local) default local
	1097	-exec (qsub, local) default $exec_mode
910	1098	this program writes a shell script to run blast, this script is
911	1099	either performed locally by sh or remotely by qsub
912	1100	with qsub, you can use PBS, SGE etc
913		-host number of hosts for qsub
914		-core number of cpu cores per computer, default 1
	1101	-host number of qsub jobs, default $num_qsub
	1102	-para number of parallel blast job per qsub job (each blast can use multi cores), default $para_no
	1103	one qsub script can run multiple blast jobs
	1104	-blp number of threads per blast job, default $bl_threads
	1105	number of threads per blast job (option -blp) X number of parallel blast job per qsub job (option -para)
	1106	should <= the number of cores in your computer
	1107	if your computer grid has 32 cores / node, do either of the followings
	1108	-para 4 -blp 8
	1109	-para 8 -blp 4 preferred
	1110	-para 16 -blp 2
	1111	-para 32 -blp 1
	1112	-bat number of sequences a blast job to process, $batch_no_per_node
915	1113	-shf a filename for add local settings into the job shell script
916	1114	for example, when you run PBS jobs, you can add quene name etc in this
917	1115	file and this script will add them into the job shell script
918		e.g. your file may have followings
	1116	e.g. template file for PBS
	1117	#!/bin/sh
919	1118	#PBS -v PATH
920	1119	#PBS -l walltime=8:00:00
921		#PBS -q jobqueue
	1120	#PBS -q job_queue.q
	1121
	1122	e.g. template file for SGE or OGE
	1123	#!/bin/sh
	1124	#\$ -v PATH
	1125	#\$ -q job_queue.q
	1126	#\$ -V
	1127	#\$ -pe orte 8
922	1128
923	1129	job:
924		-rs steps of save restart file and clustering output, default 5000
	1130	-rs steps of save restart file and clustering output, default $restart_seg
925	1131	everytime after process 5000 sequences, program write a
926	1132	restart file and current clustering information
927	1133	-restart restart file, readin a restart file
928	1134	if program crash, stoped, termitated, you can restart it by
929	1135	add a option "-restart sth.restart"
930		-rf steps of re format blast database, default 200,000
	1136	-rf steps of re format blast database, default $reformat_seg
931	1137	if program clustered 200,000 seqs, it remove them from seq
932	1138	pool, and re format blast db to save time
933	1139	-J job, job_file, exe specific jobs like parse blast outonly
934		DON'T use it, it is only used by this program itself
	1140	DO NOT use it, it is only used by this program itself
935	1141	-k (1/0) keep blast raw output file, default $keep_bl
936	1142
937		-P path to executables
	1143	-P path to blast executables
938	1144	EOD
939	1145
940	1146

957	1163	########## END print_usage
958	1164
959	1165
960		## like above, but don't assign seqs to specific node
961		## while let nodes run them autoly
962		sub run_batch_blast3 {
	1166	## copied from run_batch_blast3
	1167	## run multi seq per sample
	1168	## wait for all jobs to finish
	1169	sub run_batch_blast3_multi {
963	1170	my $i0 = shift;
964		my ($id, $i, $j, $k);
	1171	my ($id, $i, $j, $k, $cmd, $ll);
965	1172
966		my $total_jobs = $batch_no_per_node * $host_no * $core_no;
	1173	my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
967	1174
968	1175	for ($k=0; $i0<$NR_no; $i0++) {
969	1176	$id = $NR_idx[$i0];
970	1177	next if ($passeds[$id]);
971	1178	next if ($in_bg[$id]);
	1179	next if ($lens[$id] < $opt_aL_upper_band);
	1180	$in_bg[$id] = 1;
	1181
	1182	my $seq = $seqs[$id];
	1183
	1184	if (($k % $num_multi_seq) ==0) { #### reopen
	1185	close(SEQ) if ($k > 0);
	1186	open(SEQ, "> $seq_dir/$id") \|\| die "Can not write";
	1187	}
	1188	#print SEQ "$dess[$id]\n$seq\n";
	1189	print SEQ ">$id.$lens[$id]\n$seq\n";
	1190	$k++;
	1191	last if ($k >= $total_jobs);
	1192	}
	1193	close(SEQ);
	1194
	1195	if ($exec_mode eq "qsub") {
	1196	for ($j=0; $j<$num_qsub; $j++) {
	1197	my $t = "psi-cd-hit-$j";
	1198	my $cmd = `qsub -N $t $remote_sh_script $j`; #### pass $j to qsub command
	1199	my $qsub_id = 0;
	1200	if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
	1201	print LOG "qsub querying $j, PID $qsub_id\n";
	1202	$qsub_ids{$qsub_id} = 1;
	1203	}
	1204	}
	1205	elsif ($exec_mode eq "local") {
	1206	#my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
	1207	my $cmd = `sh $remote_sh_script`;
	1208	}
	1209
	1210	#### wait finish all submitted
	1211	if ($exec_mode eq "qsub") {
	1212	while(1) {
	1213	SGE_qstat_xml_query();
	1214	last unless (%qsub_ids);
	1215
	1216	my $wait_flag = 0;
	1217	foreach my $qsub_id (keys %qsub_ids) {
	1218	if (defined($qstat_xml_data{$qsub_id})) { #### still running
	1219	$wait_flag = 1;
	1220	}
	1221	else {
	1222	delete $qsub_ids{$qsub_id};
	1223	}
	1224	}
	1225
	1226	if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
	1227	}
	1228	}
	1229
	1230	#### read in all parsed blast output
	1231	%blm8_data =();
	1232	opendir(BLMDIR, $blm_dir) \|\| die "can not open $blm_dir";
	1233	my @bl_files = grep { /^\d/ } readdir(BLMDIR);
	1234	closedir(BLMDIR);
	1235
	1236	foreach my $blf (@bl_files) {
	1237	open(BLMTMP, "$blm_dir/$blf") \|\| next;
	1238	while($ll = <BLMTMP>) {
	1239	next if ($ll =~ /^#/);
	1240	chop($ll);
	1241	if ($ll =~ /^>/) {
	1242
	1243	my ($id, $no1) = split(/\s+/, substr($ll,1));
	1244	my @hits = ();
	1245	for ($j=0; $j<$no1; $j++) {
	1246	$ll=<BLMTMP>; chop($ll);
	1247	push(@hits, [split(/\t/,$ll)]);
	1248	}
	1249	if ($no1>=1) {
	1250	$blm8_data{$id} = [@hits];
	1251	}
	1252	}
	1253	}
	1254	close(BLMTMP);
	1255
	1256	$cmd = `rm -f $blm_dir/$blf`;
	1257	print LOG "parse and then rm $blm_dir/$blf\n";
	1258	}
	1259	return;
	1260	}
	1261
	1262	sub run_batch_blast3 {
	1263	my $i0 = shift;
	1264	my ($id, $i, $j, $k, $cmd);
	1265
	1266	#### wait before qsubs
	1267	if ($exec_mode eq "qsub") {
	1268	while(1) {
	1269	SGE_qstat_xml_query();
	1270	last unless (%qsub_ids);
	1271
	1272	my $wait_flag = 0;
	1273	foreach my $qsub_id (keys %qsub_ids) {
	1274	if (defined($qstat_xml_data{$qsub_id})) { #### still running
	1275	$wait_flag = 1;
	1276	$cmd = `qdel -f $qsub_id`; #### at this point, all running jobs are not necessary,
	1277	print LOG "force delete un necessary job $qsub_id\n";
	1278	}
	1279	else {
	1280	delete $qsub_ids{$qsub_id};
	1281	}
	1282	}
	1283
	1284	if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
	1285	}
	1286
	1287	#### delete seq files from last batch
	1288	opendir(DIR1, $seq_dir);
	1289	my @files = grep { /^\d/ } readdir(DIR1);
	1290	closedir(DIR1);
	1291	foreach $i (@files) {
	1292	$cmd = `rm -f $seq_dir/$i`;
	1293	print LOG "remove un necessary seq file $i\n"
	1294	}
	1295	}
	1296
	1297	my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
	1298
	1299	for ($k=0; $i0<$NR_no; $i0++) {
	1300	$id = $NR_idx[$i0];
	1301	next if ($passeds[$id]);
	1302	next if ($in_bg[$id]);
	1303	next if ($lens[$id] < $opt_aL_upper_band);
972	1304	$in_bg[$id] = 1;
973	1305
974	1306	my $seq = $seqs[$id];

981	1313	}
982	1314
983	1315	if ($exec_mode eq "qsub") {
984		for ($j=0; $j<$host_no; $j++) {
	1316	for ($j=0; $j<$num_qsub; $j++) {
985	1317	my $t = "psi-cd-hit-$j";
986		print LOG "PBS querying $j\n";
987	1318	my $cmd = `qsub -N $t $remote_sh_script`;
	1319	my $qsub_id = 0;
	1320	if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
	1321	print LOG "qsub querying $j, PID $qsub_id\n";
	1322	$qsub_ids{$qsub_id} = 1;
988	1323	}
989	1324	}
990	1325	elsif ($exec_mode eq "local") {
991		my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
	1326	#my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
	1327	my $cmd = `sh $remote_sh_script`;
992	1328	}
993	1329
994	1330	return;

998	1334
999	1335	sub write_remote_sh_script {
1000	1336	my ($i, $j, $k);
1001		my $local_sh = "";
	1337	my $local_sh = <<EOD;
	1338	#!/bin/sh
	1339	#PBS -v PATH
	1340	#\$ -v PATH
	1341	EOD
	1342
1002	1343	if ($sh_file) {
1003	1344	$local_sh = `cat $sh_file`;
1004	1345	}
1005	1346
1006	1347	open(RESH, "> $remote_sh_script") \|\| die;
1007	1348	print RESH <<EOD;
1008		#!/bin/bash
1009		#\$ -S /bin/bash
1010		#\$ -v PATH
1011		#PBS -v PATH
1012	1349	$local_sh
1013	1350
	1351	para=\$1
1014	1352	cd $pwd
1015	1353	EOD
1016	1354
1017		for ($k=0; $k<$core_no; $k++){
1018		print RESH "./$remote_perl_script $k&\n"
	1355	for ($k=0; $k<$para_no; $k++){
	1356	print RESH "./$remote_perl_script $k \$para &\n"
1019	1357	}
1020	1358	print RESH "wait\n\n";
1021	1359

1026	1364
1027	1365	sub write_remote_perl_script {
1028	1366	my $dir1 = ".";
1029		my $bl2 = ($prof_db) ?
1030		"$blast_exe -d $dir1/$tmp_db $bl_para -R $bl_dir/\$id.prof":
1031		"$blast_exe -d $dir1/$tmp_db $bl_para";
1032		my $cc = ($prof_db) ? 1 : 0;
1033		if ($prof_db) { my $cmd=`formatdb -i $prof_db`; }
	1367	my $bl2 = "$blast_exe -d $dir1/$tmp_db $bl_para";
	1368	$bl2 = "$blast_exe -db $dir1/$tmp_db $bl_para" if ($bl_plus);
	1369
	1370	my $opti = "-i"; $opti = "-query" if ($bl_plus);
	1371	my $opto = "-o"; $opto = "-out" if ($bl_plus);
1034	1372
1035	1373	open(REPERL, "> $remote_perl_script") \|\| die;
1036	1374	print REPERL <<EOD;
1037	1375	#!/usr/bin/perl
1038	1376	\$host = shift;
	1377	\$instance = shift;
1039	1378	\$arg = shift;
1040	1379
1041	1380	#### random sleep, rand() can be a fraction of second

1062	1401	next if (-e "$seq_dir/\$id.lock");
1063	1402	\$cmd = `touch $seq_dir/\$id.lock`;
1064	1403
1065		if ($cc) {
1066		\$cmd = `$prof_exe -d $prof_db $prof_para -i $seq_dir/\$id -C $bl_dir/\$id.prof`;
1067		}
1068
1069		if ($bl_STDIN) {
1070		\$cmd = `$bl2 -i $seq_dir/\$id \| $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
	1404	if ($num_multi_seq) {
	1405	\$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
	1406	\$cmd = `$script_name -J parse_blout_multi $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0 >> $blm_dir/\$host.\$instance`;
	1407	}
	1408	elsif ($bl_STDIN) {
	1409	\$cmd = `$bl2 $opti $seq_dir/\$id \| $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
1071	1410	}
1072	1411	else {
1073		\$cmd = `$bl2 -i $seq_dir/\$id -o $bl_dir/\$id`;
	1412	\$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
1074	1413	\$cmd = `$script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0`;
1075	1414	}
1076	1415	\$cmd = `rm -f $seq_dir/\$id`;
1077	1416	\$cmd = `rm -f $seq_dir/\$id.lock`;
1078		if ($cc) { \$cmd = `rm -f $bl_dir/\$id.prof`; }
1079	1417	}
1080	1418
1081	1419	(\$tu, \$ts, \$cu, \$cs) = times();
1082	1420	\$tt = \$tu + \$ts + \$cu + \$cs;
1083		\$cmd = `echo \$tt >> $seq_dir/host.\$host.cpu`;
	1421	\$cmd = `echo \$tt >> $seq_dir/host.\$host.\$instance.cpu`;
1084	1422
1085	1423	EOD
1086	1424	close(REPERL);

1110	1448	########## END wait_blast_out
1111	1449
1112	1450
	1451	sub SGE_qstat_xml_query {
	1452	my ($i, $j, $k, $cmd, $ll);
	1453	%qstat_xml_data = (); #### global
	1454	$cmd = `qstat -f -xml`;
	1455	if ($cmd =~ /<queue_info/) { #### dummy
	1456	$qstat_xml_data{"NULL"}= ["NULL","NULL"];
	1457	}
	1458	my $tmp = <<EOD;
	1459	<?xml version='1.0'?>
	1460	<job_info xmlns:xsd="http://gridscheduler.svn.sourceforge.net/viewvc/gridscheduler/trunk/source/dist/util/resources/schemas/qstat/qstat.xsd?revision=11">
	1461	<queue_info>
	1462	<Queue-List>
	1463	<name>all.q\@master</name>
	1464	<qtype>BIP</qtype>
	1465	<slots_used>0</slots_used>
	1466	<slots_resv>0</slots_resv>
	1467	<slots_total>0</slots_total>
	1468	<load_avg>0.08000</load_avg>
	1469	<arch>linux-x64</arch>
	1470	</Queue-List>
	1471	...
	1472	<Queue-List>
	1473	<name>all.q\@node016</name>
	1474	<qtype>BIP</qtype>
	1475	<slots_used>32</slots_used>
	1476	<slots_resv>0</slots_resv>
	1477	<slots_total>32</slots_total>
	1478	<load_avg>42.59000</load_avg>
	1479	<arch>linux-x64</arch>
	1480	<job_list state="running"> ####### running jobs in this section
	1481	<JB_job_number>3535</JB_job_number>
	1482	<JAT_prio>0.51468</JAT_prio>
	1483	<JB_name>cd-hit</JB_name>
	1484	<JB_owner>ubuntu</JB_owner>
	1485	<state>r</state>
	1486	<slots>4</slots>
	1487	</job_list>
	1488	...
	1489	</queue_info>
	1490	<job_info>
	1491	<job_list state="pending"> ######## pending jobs in this section
	1492	<JB_job_number>3784</JB_job_number>
	1493	<JAT_prio>0.60500</JAT_prio>
	1494	<JB_name>cd-hit</JB_name>
	1495	<JB_owner>ubuntu</JB_owner>
	1496	<state>qw</state>
	1497	<slots>32</slots>
	1498	</job_list>
	1499	...
	1500	</job_info>
	1501	</job_info>
	1502
	1503	EOD
	1504	my @lls = split(/\n/, $cmd);
	1505	$i = 2; #### skip first 2 lines
	1506	for (; $i<$#lls+1; $i++) {
	1507	if ($lls[$i] =~ /<job_list/) {
	1508	my ($id, $name, $state);
	1509	for (; $i<$#lls+1; $i++) {
	1510	last if ($lls[$i] =~ /<\/job_list/);
	1511	if ($lls[$i] =~ /<JB_job_number>(\d+)/) { $id = $1;}
	1512	if ($lls[$i] =~ /<JB_name>([^<]+)/) { $name = $1;}
	1513	if ($lls[$i] =~ /<state>([^<]+)/) {$state = $1;}
	1514	}
	1515	if (defined($id) and defined($name) and defined($state)) {
	1516	$qstat_xml_data{$id} = [$name, $state];
	1517	}
	1518	}
	1519	}
	1520	}
	1521
1113	1522
1114	1523	1;
1115	1524

+165

-0

psi-cd-hit/psi-cd-hit-old.pl less more

	0	#!/usr/bin/perl -w
	1	################################################################################
	2	######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
	3	################################################################################
	4
	5	our $script_name = $0;
	6	our $script_dir = $0;
	7	$script_dir =~ s/[^\/]+$//;
	8	$script_dir = "./" unless ($script_dir);
	9	require "$script_dir/psi-cd-hit-local-old.pl";
	10
	11	parse_para_etc(@ARGV);
	12	open_LOG();
	13
	14	our @seqs = ();
	15	our @dess = ();
	16	our @idens = ();
	17	our @lens = ();
	18	our @passeds = ();
	19	our @NR_clstr_nos = ();
	20	our @in_bg = ();
	21	our @NR_idx = ();
	22	our $NR_no = 0;
	23	our $DB_no = 0;
	24	our $DB_len = 0;
	25	our $DB_len0 = 0;
	26	our $DB_len_reduced = 0;
	27	our $DB_len_reduced2 = 0; #### for write_restart etc purpose
	28
	29	our $opt_aL_upper_band = 0; #### sequences < this length will not be submitted unless reformatdb
	30	our $opt_al_upper_bandi= 0;
	31	our $opt_aL_lower_band = 0; #### sequences < this length don't need to be searched
	32	my ($i, $j, $k, $i0, $j0, $k0, $ll);
	33
	34	read_db();
	35
	36	our $NR_passed = 0;
	37	our $formatdb_no = $NR_no;;
	38
	39	@NR_idx = (0..($NR_no-1));
	40	@NR_idx = sort { $lens[$b] <=> $lens[$a] } @NR_idx unless (-e $restart_in);
	41
	42	our $NR90_no = 0;
	43	our @NR90_seq = ();
	44
	45	$i0 = 0;
	46	if ( -e $restart_in) { $i0 = read_restart(); } ## restart after crash
	47	elsif ($skip_long > 0) { #### skip very long seqs
	48	for (; $i0<$NR_no; $i0++) {
	49	$i = $NR_idx[$i0];
	50	last if ($lens[$i] < $skip_long);
	51
	52	$NR_passed++;
	53	$NR_clstr_nos[$i] = $NR90_no;
	54	$idens[$i] = "*";
	55	$passeds[$i] = 1;
	56	$NR90_seq[$NR90_no] = [$i];
	57	$NR90_no++;
	58	$DB_len_reduced += $lens[$i];
	59	}
	60	}
	61
	62	#### set init opt_aL_uppper/lower_bands
	63	if ( ($opt_aL > 0.3) ) {
	64	die ("option -aL > 1.0") if ($opt_aL > 1.0);
	65
	66	####################
	67	###################
	68	##################
	69	#################
	70	################
	71	############### <-upper band
	72	############## <- seq below not submit, unless band change
	73	#############
	74	############
	75	###########
	76	########## <- lower band
	77	######### <- seq below not in format db
	78	########
	79	#######
	80	#####
	81	####
	82	###
	83	##
	84	#
	85	my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
	86	my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
	87	my $d1 = $i0+$space;
	88	$d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
	89	$opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
	90	$opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
	91	$opt_aL_upper_bandi= $d1;
	92	write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
	93	}
	94
	95
	96	($DB_no, $DB_len) = blast_formatdb();
	97	$DB_len0 = $DB_len;
	98	$DB_len_reduced = 0;
	99	$DB_len_reduced2 = 0;
	100	for (; $i0<$NR_no; $i0++) {
	101	$i = $NR_idx[$i0];
	102	run_batch_blast3($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);
	103
	104	if ( not $passeds[$i] ) { # this is a new representative
	105	$NR_passed++;
	106	$NR_clstr_nos[$i] = $NR90_no;
	107	$idens[$i] = "*";
	108	$passeds[$i] = 1;
	109	$NR90_seq[$NR90_no] = [$i];
	110	fish_other_homolog($i);
	111	$NR90_no++;
	112	$DB_len_reduced += $lens[$i];
	113	$DB_len_reduced2 += $lens[$i];
	114	}
	115
	116	watch_progress($i0, $NR90_no, $NR_passed, $NR_no, 0);
	117
	118	if ((($i0+1) % $restart_seg == 0) or ($DB_len_reduced2 > $DB_len0/10) ) {
	119	write_restart(); write_db_clstr(); remove_raw_blout_bg($i0);
	120	$DB_len_reduced2 = 0;
	121	}
	122
	123	my $opt_aL_format_flag = 0;
	124	if ( ($opt_aL > 0.3) ) { #### formatdb maybe needed if current length of seq.i0 close to opt_aL_upper_band
	125	my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
	126	if ( ($opt_aL_upper_bandi - $i0) < $total_jobs ) { #### seqs left for possible submission < total_jobs
	127
	128	my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
	129	my $d1 = $i0+$space;
	130	$d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
	131	$opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
	132	$opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
	133	$opt_aL_upper_bandi= $d1;
	134	$opt_aL_format_flag = 1;
	135	write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
	136	}
	137	}
	138	if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10) or $opt_aL_format_flag ) {
	139	($DB_no, $DB_len) = blast_formatdb();
	140	$DB_len_reduced = 0;
	141	}
	142	#if ($formatdb_no - ($NR_no-$NR_passed) >= $reformat_seg) {blast_formatdb(); }
	143	}
	144	## END for ($i=0; $i<$NR_no; $i++)
	145	watch_progress($NR_no-1, $NR90_no, $NR_passed, $NR_no, 1);
	146
	147	if (1) { ### print NR db
	148	open(DBOUT, "> $db_out") \|\| die "Can not write $db_out";
	149	for ($i=0; $i<$NR_no; $i++) {
	150	next unless ($idens[$i] eq "*");
	151	my $seq = $seqs[$i];
	152	$seq =~ s/(.{70})/$1\n/g;
	153	$seq =~ s/\n$//;
	154	print DBOUT "$dess[$i]\n$seq\n";
	155	}
	156	close(DBOUT);
	157	}
	158
	159	write_restart();
	160	write_db_clstr();
	161	remove_blast_db();
	162	close_LOG();
	163
	164

+76

-6

psi-cd-hit/psi-cd-hit.pl less more

25	25	our $DB_len0 = 0;
26	26	our $DB_len_reduced = 0;
27	27	our $DB_len_reduced2 = 0; #### for write_restart etc purpose
	28
	29	our $opt_aL_upper_band = 0; #### sequences < this length will not be submitted unless reformatdb
	30	our $opt_al_upper_bandi= 0;
	31	our $opt_aL_lower_band = 0; #### sequences < this length don't need to be searched
28	32	my ($i, $j, $k, $i0, $j0, $k0, $ll);
29	33
30	34	read_db();

40	44
41	45	$i0 = 0;
42	46	if ( -e $restart_in) { $i0 = read_restart(); } ## restart after crash
	47	elsif ($skip_long > 0) { #### skip very long seqs
	48	for (; $i0<$NR_no; $i0++) {
	49	$i = $NR_idx[$i0];
	50	last if ($lens[$i] < $skip_long);
	51
	52	$NR_passed++;
	53	$NR_clstr_nos[$i] = $NR90_no;
	54	$idens[$i] = "*";
	55	$passeds[$i] = 1;
	56	$NR90_seq[$NR90_no] = [$i];
	57	$NR90_no++;
	58	$DB_len_reduced += $lens[$i];
	59	}
	60	}
	61
	62	#### set initial opt_aL upper/lower bands
	63	if ( ($opt_aL > 0.3) ) {
	64	die ("option -aL > 1.0") if ($opt_aL > 1.0);
	65
	66	####################
	67	###################
	68	##################
	69	#################
	70	################
	71	############### <-upper band
	72	############## <- seq below not submit, unless band change
	73	#############
	74	############
	75	###########
	76	########## <- lower band
	77	######### <- seq below not in format db
	78	########
	79	#######
	80	#####
	81	####
	82	###
	83	##
	84	#
	85	my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
	86	my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
	87	my $d1 = $i0+$space;
	88	$d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
	89	$opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
	90	$opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
	91	$opt_aL_upper_bandi= $d1;
	92	write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
	93	}
	94
43	95
44	96	($DB_no, $DB_len) = blast_formatdb();
45	97	$DB_len0 = $DB_len;

47	99	$DB_len_reduced2 = 0;
48	100	for (; $i0<$NR_no; $i0++) {
49	101	$i = $NR_idx[$i0];
50		run_batch_blast3($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);
	102	run_batch_blast3_multi($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);
51	103
52	104	if ( not $passeds[$i] ) { # this is a new representative
53	105	$NR_passed++;

55	107	$idens[$i] = "*";
56	108	$passeds[$i] = 1;
57	109	$NR90_seq[$NR90_no] = [$i];
58		fish_other_homolog($i);
	110	fish_other_homolog_multi($i);
59	111	$NR90_no++;
60	112	$DB_len_reduced += $lens[$i];
61	113	$DB_len_reduced2 += $lens[$i];

63	115
64	116	watch_progress($i0, $NR90_no, $NR_passed, $NR_no, 0);
65	117
66		if ((($i0+1) % $restart_seg == 0) or ($DB_len_reduced2 > $DB_len0/10) ) {
67		write_restart(); write_db_clstr(); remove_raw_blout_bg($i0);
68		$DB_len_reduced2 = 0;
	118	if (($i0+1) % $restart_seg == 0 ) {
	119	write_restart(); write_db_clstr();
69	120	}
70		if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10)) {
	121
	122	my $opt_aL_format_flag = 0;
	123	if ( ($opt_aL > 0.3) ) { #### formatdb maybe needed if current length of seq.i0 close to opt_aL_upper_band
	124	my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
	125	my $opt_aL_upper_band_old = $opt_aL_upper_band;
	126	if ( ($opt_aL_upper_bandi - $i0) < $total_jobs ) { #### seqs left for possible submission < total_jobs
	127
	128	my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
	129	my $d1 = $i0+$space;
	130	$d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
	131	$opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
	132	if ($opt_aL_upper_band < $opt_aL_upper_band_old) {
	133	$opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
	134	$opt_aL_upper_bandi= $d1;
	135	$opt_aL_format_flag = 1;
	136	write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
	137	}
	138	}
	139	}
	140	if ((($i0+1) % (int($NR_no/10)) == 0) or ($DB_len_reduced > $DB_len/10) or $opt_aL_format_flag ) {
71	141	($DB_no, $DB_len) = blast_formatdb();
72	142	$DB_len_reduced = 0;
73	143	}

+458

-0

usecases/Miseq-16S/16S-ref-db-PE-splice.pl less more

	0	#!/usr/bin/perl
	1	## =========================== NGS tools ==========================================
	2	## NGS tools for metagenomic sequence analysis
	3	## May also be used for other type NGS data analysis
	4	##
	5	## Weizhong Li, UCSD
	6	## liwz@sdsc.edu
	7	## http://weizhongli-lab.org/
	8	## ================================================================================
	9
	# Work out the directory this script lives in: strip the file-name part of
	# $0, chop() the trailing "/", and fall back to "./" when nothing is left.
	10	my $script_name = $0;
	11	my $script_dir = $0;
	12	$script_dir =~ s/[^\/]+$//;
	13	chop($script_dir);
	14	$script_dir = "./" unless ($script_dir);
	15
	# Parse the command line into %opts. -i, -j, -o and -d are mandatory; when
	# one is missing the script dies with the usage() text.
	# NOTE(review): "d:" appears twice in the getopts spec, and "S:" makes -S
	# expect a value although usage() describes it as a plain flag -- confirm.
	16	use Getopt::Std;
	17	getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
	18	die usage() unless ($opts{i} and $opts{j} and $opts{o} and $opts{d});
	# File-level scratch variables shared by the top-level code below.
	19	my ($i, $j, $k, $cmd);
	20	my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
	21	my ($len, $lena, $lenb);
	22
	# Inputs: R1 reads (-i), R2 reads (-j), reference FASTA (-d), output prefix (-o).
	23	my $fastq = $opts{i};
	24	my $fastq2 = $opts{j};
	25	my $ref = $opts{d};
	26	my $output = $opts{o};
	# Lengths of the forward/reverse segments cut from each reference; a missing
	# or zero value falls back to 100.
	27	my $trim_R1 = $opts{p}; $trim_R1 = 100 unless ($trim_R1);
	28	my $trim_R2 = $opts{q}; $trim_R2 = 100 unless ($trim_R2);
	29	my $clstr_cutoff = $opts{c}; #### post clustering
	30	my $full_frag = $opts{S};
	# Number of leading read positions used to build the per-read-set consensus.
	31	my $prime_len = 45;
	# Output and temporary file names, all derived from the -o prefix or the PID.
	32	my $output_R1 = "$output-R1";
	33	my $output_R2 = "$output-R2";
	34	my $session = "OTU-session-$$";
	35	my $output_S = "$output-single";
	36	my $consensus_db = "$output-consensus";
	# The cd-hit binaries are expected two directories above this script; abort
	# early when they are not there.
	37	my $cd_hit_2d = "$script_dir/../../cd-hit-est-2d"; die "no $cd_hit_2d" unless (-e $cd_hit_2d);
	38	my $cd_hit_est = "$script_dir/../../cd-hit-est"; die "no $cd_hit_est" unless (-e $cd_hit_est);
	# The format is detected from the R1 file only and then applied to both inputs.
	39	my $format = input_test($fastq); #fasta or fastq
	# Memory limit handed to cd-hit via -M; defaults to 16000 when -M is not given.
	40	my $cdhit_opt_M = $opts{M}; $cdhit_opt_M = 16000 unless defined($cdhit_opt_M);
	41
	# The optional post-clustering cutoff must lie within [0.97, 1.0].
	42	if (defined($clstr_cutoff)) {
	43	die "Clustering cutoff $clstr_cutoff is not reasonable, should be <=1.0 and >= 0.97" unless (($clstr_cutoff <=1.0) and ($clstr_cutoff>=0.97));
	44	}
	45
	# Bookkeeping for open_files_z_safe()/read_FHZ(): handle name => file list state.
	my %FHZ = ();

	# %ref_map: reference id => { "R1"|"R2" => [query_start, query_end,
	#                                            rep_start, rep_end, strand] }
	# For each read set this loop
	#   1. tallies the bases seen at the first $prime_len positions of every read,
	#   2. writes the majority-base consensus to "$consensus_db.<R>",
	#   3. aligns that consensus against the reference db with cd-hit-est-2d,
	#   4. parses the resulting .clstr file into %ref_map.
	my %ref_map = ();
	foreach my $read_set (["R1", $fastq], ["R2", $fastq2]) {
	    # The label comes from the position in the list, not from comparing file
	    # names, so passing the same file for -i and -j still yields R1 and R2.
	    my ($R, $f) = @$read_set;

	    my %con     = ();   # position => { base => count }
	    my $num_seq = 0;

	    # Count the (upper-cased) base at each of the first $prime_len positions.
	    # Reads shorter than $prime_len contribute an empty-string "base".
	    my $tally_prefix = sub {
	        my ($read) = @_;
	        for my $pos (0 .. $prime_len - 1) {
	            my $base = uc(substr($read, $pos, 1));
	            $con{$pos}{$base}++;
	        }
	        $num_seq++;
	    };

	    open_files_z_safe("TTTa", $f);
	    if ($format eq "fastq") {
	        while (1) {
	            my ($header, $read_id, $read, $qual, $read_len) = read_next_fastq("TTTa");
	            last unless ($header);
	            $tally_prefix->($read);
	        }
	    }
	    else { #### fasta
	        my $read = "";
	        while (my $line = <TTTa>) {
	            chomp($line);
	            if ($line =~ /^>/) {
	                $tally_prefix->($read) if ($read);
	                $read = "";
	            }
	            else {
	                $read .= $line;
	            }
	        }
	        $tally_prefix->($read) if ($read);
	    } #### END fasta
	    close(TTTa);

	    # Without this guard the abundance computation below divides by zero.
	    die "no sequences found in $f\n" unless ($num_seq);

	    my @cons   = ();   # majority base per position
	    my @cons_v = ();   # fraction of reads carrying that base
	    for my $pos (0 .. $prime_len - 1) {
	        my $count_of = $con{$pos};
	        my ($top) = sort { $count_of->{$b} <=> $count_of->{$a} } keys %$count_of;
	        push(@cons, $top);
	        push(@cons_v, $count_of->{$top} / $num_seq);
	    }

	    # Truncate the consensus where three consecutive positions (scanning from
	    # position 33) all fall below 75% agreement; otherwise keep all positions.
	    my $trim_len_new = $prime_len;
	    for my $pos (33 .. $prime_len - 1) {
	        if ( ($cons_v[$pos  ] < 0.75) and
	             ($cons_v[$pos-1] < 0.75) and
	             ($cons_v[$pos-2] < 0.75) ) {
	            $trim_len_new = $pos - 2;
	            last;
	        }
	    }

	    open(my $cons_out, ">", "$consensus_db.$R") || die "can not write to $consensus_db.$R";
	    print $cons_out ">$R\n", join("", @cons[0 .. $trim_len_new - 1]), "\n";
	    close($cons_out);

	    my $cmd_line = "$cd_hit_2d -i $consensus_db.$R -i2 $ref -d 0 -c 0.8 -n 5 -r 1 -p 1 -b 5 -o $session.$R-vs-ref -G 0 -A 30 -s2 0.01 -M $cdhit_opt_M > $session.$R-vs-ref.log";
	    print "running $cmd_line\n";
	    $cmd = `$cmd_line`;

	    # Expected .clstr layout (member 0 is the consensus query itself):
	    #   >Cluster 0
	    #   0 45nt, >R1... *
	    #   1 1479nt, >1111882... at 1:42:4:45/+/95.24%
	    #   2 1500nt, >1111856... at 1:42:4:45/+/88.10%
	    # and for R2 (minus strand):
	    #   >Cluster 0
	    #   0 45nt, >R2... *
	    #   1 1428nt, >1111883... at 483:440:2:45/-/84.09%
	    # The last whitespace-separated field is "coords/strand/identity", with
	    # coords = query_start:query_end:rep_start:rep_end.
	    my $clstr_file = "$session.$R-vs-ref.clstr";
	    open(my $clstr_in, "<", $clstr_file) || die "can not open $clstr_file";
	    while (my $line = <$clstr_in>) {
	        next if ($line =~ /^>/);
	        next if ($line =~ /^0/);
	        chomp($line);
	        if ($line =~ /^\d+\s+\d+(?:aa|nt), >(.+)\.\.\./) {
	            my $ref_id = $1;
	            my @fields = split(/\s+/, $line);
	            my ($coords, $strand) = split(/\//, $fields[-1]); ##516:473:2:45/-/84.09%
	            my ($query_start, $query_end, $rep_start, $rep_end) = split(/:/, $coords);
	            $ref_map{$ref_id}{$R} = [$query_start, $query_end, $rep_start, $rep_end, $strand];
	        }
	    }
	    close($clstr_in);
	}
	170
	# %ref_cut: reference id => [$p1, $p2], the 1-based reference coordinates at
	# which the forward (R1) and reverse (R2) segments are anchored.
	171	my %ref_cut;
	172	foreach $id (keys %ref_map) {
	# Only references hit by both the R1 and the R2 consensus are usable.
	173	next unless (defined $ref_map{$id}{"R1"});
	174	next unless (defined $ref_map{$id}{"R2"});
	175
	# Each entry is [query_start, query_end, rep_start, rep_end, strand].
	176	my @R1_info = @{$ref_map{$id}{"R1"}};
	177	my @R2_info = @{$ref_map{$id}{"R2"}};
	178
	# R1 must align on the plus strand and R2 on the minus strand.
	179	next unless ($R1_info[4] eq "+");
	180	next unless ($R2_info[4] eq "-");
	181
	# Shift the alignment start by the offset into the consensus (rep_start - 1)
	# so that $p1/$p2 refer to consensus position 1.
	182	my $p1 = $R1_info[0] - ($R1_info[2]-1); #### 1-based, can be -1 value for V1
	183	my $p2 = $R2_info[0] + ($R2_info[2]-1); #### 1-based, can be longer than len($seq)
	184	$ref_cut{$id} = [$p1, $p2];
	185	}
	186
	# Open the reference FASTA for reading and the R1/R2 outputs (plus the
	# full-fragment output when $full_frag is set) for writing.
	187	open(TMP, $ref) \|\| die "can not open $ref";
	188	open(OUT1, "> $output_R1") \|\| die "can not write to $output_R1";
	189	open(OUT2, "> $output_R2") \|\| die "can not write to $output_R2";
	190	if ($full_frag) {
	191	open(OUT3, "> $output_S") \|\| die "can not write to $output_S";
	192	}
	# State for the FASTA record currently being accumulated.
	# NOTE(review): $seq is already declared in the file-level "my (...)" list
	# near the top; this second "my" masks that earlier declaration.
	193	my $seq;
	194	my $des;
	195	$id = "";
	196
	# Cut the forward/reverse segments for one reference record and write them
	# to OUT1/OUT2 (and the full fragment to OUT3 when $full_frag is set).
	#   $ref_id  - reference id (first word of the FASTA header, without ">")
	#   $header  - full FASTA header line, including ">"
	#   $ref_seq - raw sequence text as read from the file (may contain newlines)
	# Records without an entry in %ref_cut are skipped silently.
	my $write_segments = sub {
	    my ($ref_id, $header, $ref_seq) = @_;
	    return unless ($ref_cut{$ref_id});

	    $ref_seq =~ s/\s//g;
	    my ($p1, $p2) = @{ $ref_cut{$ref_id} };
	    my $len = length($ref_seq);

	    # Forward segment: $trim_R1 bases starting at $p1; positions before the
	    # start of the reference are filled with N.
	    my $seq1;
	    if ($p1 >= 1) {
	        $seq1 = substr($ref_seq, $p1-1, $trim_R1);
	    }
	    else {
	        my $pad = 1 - $p1; #### add NNN at 5'
	        $seq1 = ("N" x $pad) . substr($ref_seq, 0, $trim_R1-$pad);
	    }

	    # Reverse segment: $trim_R2 bases ending at $p2 (forward coordinates),
	    # reverse-complemented afterwards.
	    my $seq2;
	    if ($p2 <= $len) {
	        my $p2a = $p2 - $trim_R2; #### 0 - based substr idx
	        $seq2 = ($p2a < 0) ? substr($ref_seq, 0, $p2)            #### not long enough
	                           : substr($ref_seq, $p2a, $trim_R2);
	    }
	    else { #### add NNN at 5'
	        my $pad       = $p2 - $len;
	        my $trim_R2_t = $trim_R2 - $pad;   # real bases still needed after padding
	        # The original declared $trim_t2_t but read the undefined $trim_R2_t,
	        # so no reference bases were ever appended here.
	        # NOTE(review): the Ns are prepended before reverse_complement(), so
	        # they end up at the 3' end of the emitted segment -- confirm that
	        # this matches the intended "5'" padding.
	        $seq2 = "N" x $pad;
	        if ($trim_R2_t > 0) {
	            my $p2a = $len - $trim_R2_t; #### 0 - based substr idx
	            $seq2 .= ($p2a < 0) ? $ref_seq                       #### not long enough
	                                : substr($ref_seq, $p2a, $trim_R2_t);
	        }
	    }
	    $seq2 = reverse_complement($seq2);

	    ### now have $seq1 $seq2
	    print OUT1 "$header loc=$p1 len=", length($seq1), "\n$seq1\n";
	    print OUT2 "$header loc=$p2 len=", length($seq2), "\n$seq2\n";
	    if ($full_frag) {
	        # Clamp both ends to the reference before cutting the whole fragment.
	        if ($p1 < 1 )   {$p1 = 1; }
	        if ($p2 > $len) {$p2 = $len;}
	        my $eff_len  = $p2-$p1+1;
	        my $fragment = substr($ref_seq, $p1-1, $eff_len);
	        print OUT3 "$header loc=$p1:$p2 len=$eff_len\n$fragment\n";
	    }
	    return;
	};

	# Stream the reference FASTA: every header line flushes the record collected
	# so far and starts a new one; the last record is flushed after the loop.
	while ($ll = <TMP>) {
	    if ($ll =~ /^>/) {
	        $write_segments->($id, $des, $seq) if ($seq);
	        chop($ll);
	        $des = $ll;
	        $id = substr($ll,1);
	        $id =~ s/\s.+$//;
	        $seq = "";
	    }
	    else {
	        $seq .= $ll;
	    }
	}
	$write_segments->($id, $des, $seq) if ($seq);
	312
	# All reference records have been written; release the file handles.
	313	close(OUT1);
	314	close(OUT2);
	315	if ($full_frag) { close(OUT3); }
	316	close(TMP);
	317
	# Optional redundancy removal: cluster the paired R1/R2 outputs with
	# cd-hit-est at the requested cutoff and replace them with the clustered
	# versions. Temporary names carry the process id.
	318	if (defined($clstr_cutoff)) {
	319	my $output_R1_tmp = "$output_R1.$$";
	320	my $output_R2_tmp = "$output_R2.$$";
	321
	322	my $cmd_line = "$cd_hit_est -i $output_R1 -j $output_R2 -d 0 -c $clstr_cutoff -n 10 -p 1 -b 5" .
	323	" -o $output_R1_tmp -op $output_R2_tmp -G 1 -g 1 -M $cdhit_opt_M -P 1 -l 11 -sc 1 > $output_R1_tmp.log";
	324	print "running $cmd_line\n";
	325	$cmd = `$cmd_line`;
	326
	# The presence of the .clstr file is the only success check for the run.
	327	die "Can not run $cd_hit_est" unless (-e "$output_R1_tmp.clstr");
	# NOTE(review): the exit status of these mv commands is not checked.
	328	$cmd = `mv $output_R1_tmp $output_R1`;
	329	$cmd = `mv $output_R2_tmp $output_R2`;
	330	$cmd = `mv $output_R1_tmp.clstr $output.clstr`;
	331
	# Same treatment for the single (full-fragment) output, clustered on its own.
	332	if ($full_frag) {
	333	my $output_S_tmp = "$output_S.$$";
	334	my $cmd_line = "$cd_hit_est -i $output_S -d 0 -c $clstr_cutoff -n 10 -p 1 -b 5" .
	335	" -o $output_S_tmp -G 1 -g 1 -M $cdhit_opt_M -l 11 -sc 1 > $output_S_tmp.log";
	336	print "running $cmd_line\n";
	337	$cmd = `$cmd_line`;
	338	die "Can not run $cd_hit_est" unless (-e "$output_S_tmp.clstr");
	339	$cmd = `mv $output_S_tmp $output_S`;
	340	$cmd = `mv $output_S_tmp.clstr $output_S.clstr`;
	341	}
	342	}
	343
	# Remove every file whose name starts with the session prefix (the
	# cd-hit-est-2d outputs and logs created while mapping the consensus).
	344	$cmd = `rm -f $session*`;
	345
	# open_files_z_safe($fh, @files)
	# Register a list of input files under the handle name $fh in the global
	# %FHZ and open the first of them; read_FHZ() moves on to the remaining
	# files one at a time.
	#   $fh    - handle name/handle used as the key into %FHZ and passed to open()
	#   @files - one or more paths; names ending in .gz or .bz2 are read through
	#            "gunzip -c" / "bzcat"
	# Dies when the first file cannot be opened; returns 0 otherwise.
	346	# need %FHZ
	347	# open one or more files including zipped files
	348	# above open_files_z may have broken pipe problem
	349	# so this safe sub, open each file individually
	350	sub open_files_z_safe {
	351	my ($fh, @files) = @_;
	352	my ($i, $j, $k);
	353
	354	my $no = $#files+1;
	355
	# Remember the whole list, how many files it has, and which one is open.
	356	$FHZ{$fh} = {
	357	'files' => [@files],
	358	'no' => $no,
	359	'open_idx' => 0,
	360	};
	361
	# Only the first file is opened here.
	362	my $f0 = $files[0];
	363	if ($f0 =~ /\.gz$/ ) { open($fh, "gunzip -c $f0 \|") \|\| die "can not gunzip -c $f0\n"; }
	364	elsif ($f0 =~ /\.bz2$/) { open($fh, "bzcat $f0 \|") \|\| die "can not bzcat $f0\n"; }
	365	else { open($fh, $f0 ) \|\| die "can not open $f0\n"; }
	366	return 0;
	367	}
	368	########## END open_files_z_safe
	369
	370
	# read_FHZ($fh)
	# Return the next line from the file set registered under $fh by
	# open_files_z_safe(). When the current file is exhausted, the next file in
	# the list is opened (through gunzip/bzcat for .gz/.bz2) and its first line
	# is returned. Returns undef once the last file has been read to the end.
	# NOTE(review): the end-of-file test is a plain truth test on the line, and
	# only one following file is tried per call, so an empty next file also
	# produces undef -- confirm this is acceptable for the callers.
	371	sub read_FHZ {
	372	my $fh = shift;
	373	my $ll;
	374
	375	$ll = <$fh>;
	376	if ($ll) { return $ll;} ##### read from existing opened file
	377
	378	#otherwise, last opened file reaches EOF
	379	if ($FHZ{$fh}->{open_idx} < $FHZ{$fh}->{no} -1 ) { ### still file not opened yet
	380	close($fh); #### close last open file
	381
	# Advance to the next registered file and open it the same way as the first.
	382	$FHZ{$fh}->{open_idx}++;
	383	my $f0 = $FHZ{$fh}->{files}->[ $FHZ{$fh}->{open_idx} ];
	384
	385	if ($f0 =~ /\.gz$/ ) { open($fh, "gunzip -c $f0 \|") \|\| die "can not gunzip -c $f0\n"; }
	386	elsif ($f0 =~ /\.bz2$/) { open($fh, "bzcat $f0 \|") \|\| die "can not bzcat $f0\n"; }
	387	else { open($fh, $f0 ) \|\| die "can not open $f0\n"; }
	388
	389	$ll = <$fh>;
	390	return $ll;
	391	}
	392	else { #### no more file to open, return undef
	393	return undef;
	394	}
	395	}
	396	########### END read_FHZ
	397
	398
	399	########## read_next_fastq
	# read_next_fastq($fh)
	# Read one four-line FASTQ record through read_FHZ().
	# Returns ($header, $id, $seq, $qual, $len) where
	#   $header - first line with its last character chopped and everything from
	#             the first whitespace on removed
	#   $id     - $header without its first character
	#   $seq    - sequence line with trailing whitespace removed
	#   $qual   - quality line with trailing whitespace removed
	#   $len    - length of $seq
	# Returns an empty list (undef in scalar context) when no header line is left.
	400	sub read_next_fastq {
	401	my $fh = shift;
	402	my ($lla, $seqa, $lla2, $quaa, $ida, $lena);
	403	$lla = read_FHZ($fh); return unless ($lla);
	404	chop($lla); $lla =~ s/\s.+$//;
	405	$ida = substr($lla,1);
	406	$seqa = read_FHZ($fh); $seqa =~ s/\s+$//g; $lena = length($seqa);
	# The third line (the "+" separator) is read and discarded.
	407	$lla2 = read_FHZ($fh); #read ID
	408	$quaa = read_FHZ($fh); $quaa =~ s/\s+$//g;
	409	return ($lla, $ida, $seqa, $quaa, $lena);
	410	}
	411	########## END read_next_fastq
	412
	413
	# reverse_complement($in_seq)
	# Return the reverse complement of a nucleotide string.
	#   $in_seq - DNA sequence. A/C/G/T are complemented in either case (the
	#             original handled upper case only, so lower-case bases came back
	#             reversed but uncomplemented); every other character, such as
	#             N, is kept unchanged.
	# Returns a new string; the argument is not modified.
	sub reverse_complement {
	    my ($in_seq) = @_;
	    my $opposite = reverse $in_seq;
	    $opposite =~ tr/ACGTacgt/TGCAtgca/;
	    return("$opposite");
	}
	420
	421
	# input_test($f)
	# Guess whether a sequence file is FASTA or FASTQ from the first character
	# of its first line.
	#   $f - path to the file; names ending in .gz / .bz2 are read through
	#        "gunzip -c" / "bzcat", matching open_files_z_safe(), so compressed
	#        FASTA is no longer misreported as fastq.
	# Returns "fasta" when the first line starts with ">", otherwise "fastq"
	# (this includes an empty file). Dies when the file cannot be opened.
	sub input_test {
	    my $f = shift;

	    my $fh;
	    if    ($f =~ /\.gz$/ ) { open($fh, "-|", "gunzip", "-c", $f) || die "can not gunzip -c $f\n"; }
	    elsif ($f =~ /\.bz2$/) { open($fh, "-|", "bzcat", $f)        || die "can not bzcat $f\n"; }
	    else                   { open($fh, "<", $f)                  || die "can not open $f\n"; }
	    my $ll = <$fh>;
	    # Only the first line is needed; for a pipe, close() may report that the
	    # decompressor was cut short, which is harmless here.
	    close($fh);

	    return "fasta" if (defined($ll) and substr($ll, 0, 1) eq ">");
	    return "fastq";
	}
	432	########## END input_test
	433
	434
	# usage()
	# Return the help text printed (via die) when a mandatory option is missing.
	# Spelling and grammar of the user-facing text were corrected; the set of
	# documented options is unchanged.
	sub usage {
	    return <<EOD;
This script takes paired-end (PE) read files (Fastq or Fasta) for a 16S dataset, e.g. from V3-V4
region, it also takes a Fasta file of full-length 16S reference database, e.g. Greengene.
This script identifies the sequencing region on the reference sequences and it cuts the forward
and reverse segments and outputs them in PE fasta files. The output PE reference database can be used
to cluster together with 16S datasets

Options:
======================
  -i input fasta or fastq file for R1
  -j input fasta or fastq file for R2
  -d 16S reference sequence file in fasta format
  -o output prefix
  -p length of forward sequence in output file
  -q length of reverse sequence in output file
  -S also output full fragment
  -c cutoff for clustering the output PE files to remove redundant reference sequences.
     Suggested cutoffs: 1.00, 0.99, 0.98 and 0.97
     The script will not cluster the output unless user specifies this cutoff.
  -M available memory to use, default 16000, means 16000MB. This option will be passed to cd-hit.
EOD
	}

+117

-0

usecases/Miseq-16S/NG-Omics-Miseq-16S.pl less more

	0	#!/usr/bin/perl
	1	################################################################################
	2	# NGS workflow by Weizhong Li, http://weizhongli-lab.org
	3	################################################################################
	4
	# Site-specific paths: the cd-hit checkout (its programs are referenced as
	# $CD_HIT_dir/... in the job commands of this file) and the Trimmomatic jar.
	5	########## local variables etc. Please edit
	6	$CD_HIT_dir = "/home/oasis/data/etc/git/cdhit";
	7	$NGS_prog_trimmomatic = "/home/oasis/data/NGS-ann-project/apps/Trimmomatic/trimmomatic-0.32.jar";
	8
	9
	# %NGS_executions: named execution environments. Each job in this file
	# selects one of them through its "execution" key.
	10	########## computation resources for execution of jobs
	11	%NGS_executions = ();
	# "qsub_1": queue submission through qsub. "template" is the script header
	# text; it carries both "#PBS" and "#$" directive lines.
	12	$NGS_executions{"qsub_1"} = {
	13	"type" => "qsub-pe",
	14	"cores_per_node" => 8,
	15	"number_nodes" => 64,
	16	"user" => "weizhong", #### I will use command such as qstat -u weizhong to query submitted jobs
	17	"command" => "qsub",
	18	"command_name_opt" => "-N",
	19	"command_err_opt" => "-e",
	20	"command_out_opt" => "-o",
	21	"template" => <<EOD,
	22	#!/bin/sh
	23	#PBS -v PATH
	24	#PBS -V
	25
	26	#\$ -v PATH
	27	#\$ -V
	28
	29	EOD
	30	};
	31
	32
	# "sh_1": execution of type "sh" on a single node with 8 cores; this is the
	# environment the jobs defined in this file use.
	33	$NGS_executions{"sh_1"} = {
	34	"type" => "sh",
	35	"cores_per_node" => 8,
	36	"number_nodes" => 1,
	37	};
	38
	# "qc" job: quality-trim one paired-end sample with Trimmomatic, then convert
	# the surviving paired reads to FASTA with headers ">Sample|<sample>|<n> ...".
	# CMD_opts.0 ("100") is substituted into MINLEN; DATA.0 / DATA.1 are the two
	# input files of the sample. The intermediate .fq files are removed at the end.
	39	$NGS_batch_jobs{"qc"} = {
	40	"CMD_opts" => ["100"],
	41	"execution" => "sh_1", # where to execute
	42	"cores_per_cmd" => 4, # number of threads used by command below
	43	"no_parallel" => 1, # number of total jobs to run using command below
	44	"command" => <<EOD,
	45	java -jar $NGS_prog_trimmomatic PE -threads 4 -phred33 \\DATA.0 \\DATA.1 \\SELF/R1.fq \\SELF/R1-s.fq \\SELF/R2.fq \\SELF/R2-s.fq \\
	46	SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
	47
	48	perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample\|\\SAMPLE\|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
	49	perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample\|\\SAMPLE\|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R2.fq > \\SELF/R2.fa &
	50
	51	wait
	52	rm -f \\SELF/R1.fq \\SELF/R2.fq \\SELF/R1-s.fq \\SELF/R2-s.fq
	53	EOD
	54	};
	55
	56
	# "otu" job: per-sample OTU clustering of the FASTA files produced by "qc".
	# Steps of the command: dereplicate at 100%, cluster at 99% (paired and
	# single-end), filter chimeric/small clusters, cluster at 97%, map the 97%
	# representatives onto the spliced reference db, and merge into OTU.clstr.
	#
	# CMD_opts as referenced by the command text:
	#   .0 -> -cx, .1 -> -cy of the paired cd-hit-est / cd-hit-est-2d runs
	#   .2 -> not referenced (the 0.97 cutoff is written literally in the command)
	#   .3 -> -c of filter-chimeric-and-small.pl
	#   .4 -> -i2 and .5 -> -j2 of cd-hit-est-2d, i.e. the R1 and R2 files of the
	#         spliced reference db; the .5 placeholder previously repeated "-R1"
	#   .6 -> -cx of the single-end chimera clustering
	$NGS_batch_jobs{"otu"} = {
	  "injobs" => ["qc"],
	  "CMD_opts" => ["150", "100", "0.97", "0.0001", "path_to_spliced_ref_db-R1", "path_to_spliced_ref_db-R2", "75"],
	  "execution" => "sh_1", # where to execute
	  "cores_per_cmd" => 2, # number of threads used by command below
	  "no_parallel" => 1, # number of total jobs to run using command below
	  "command" => <<EOD,
#### cluster at 100% PE
$CD_HIT_dir/cd-hit-est -i \\INJOBS.0/R1.fa -j \\INJOBS.0/R2.fa -o \\SELF/seq.nr -op \\SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.nr.log
#### cluster at 99% PE and SE for R1,R2
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr -o \\SELF/seq.chimeric-clstr.R1 -r 0 -cx \\CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.chimeric-clstr.R1.log
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr.2 -o \\SELF/seq.chimeric-clstr.R2 -r 0 -cx \\CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.chimeric-clstr.R2.log
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr -j \\SELF/seq.nr.2 -o \\SELF/seq.99 -op \\SELF/seq.99.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.99.log
$CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c \\CMDOPTS.3 -k \\SELF/seq.nr.clstr \\
-i \\SELF/seq.chimeric-clstr.R1.clstr -j \\SELF/seq.chimeric-clstr.R2.clstr \\
-a \\SELF/seq.99.clstr -f \\SELF/seq.99 -g \\SELF/seq.99.2 -o \\SELF/seq.99f
$CD_HIT_dir/clstr_rev.pl \\SELF/seq.nr.clstr \\SELF/seq.99f.clstr > \\SELF/seq.99f-all.clstr
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.99f -j \\SELF/seq.99f.2 -o \\SELF/seq.97 -op \\SELF/seq.97.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.97.log
$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.97 -j \\SELF/seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5 -o \\SELF/seq.97.ref -op \\SELF/seq.97.ref.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > \\SELF/seq.97.ref.log
$CD_HIT_dir/clstr_rev.pl \\SELF/seq.99f-all.clstr \\SELF/seq.97.clstr > \\SELF/seq.97-all.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.97.ref.clstr > \\SELF/seq.97.reftop.clstr
$CD_HIT_dir/clstr_merge.pl \\SELF/seq.97-all.clstr \\SELF/seq.97.reftop.clstr > \\SELF/OTU.clstr

rm -f \\SELF/seq.chimeric-clstr.R1 \\SELF/seq.chimeric-clstr.R1.log \\SELF/seq.chimeric-clstr.R2 \\SELF/seq.chimeric-clstr.R2.log
rm -f \\SELF/seq.97.ref \\SELF/seq.97.ref.2 \\SELF/seq.97.ref.log
mv \\SELF/seq.99f.log \\SELF/chimeric-small-clusters-list.txt

EOD
	};
	90
	91
	# "otu-pooled" job: 97% clustering, reference mapping and OTU table for
	# pooled samples. As the command text notes, seq.99f, seq.99f.2,
	# seq.99f-all.clstr and chimeric-small-clusters-list.txt must be
	# concatenated into the working directory before this job is run.
	#
	# CMD_opts as referenced by the command text:
	#   .0 -> -cx, .1 -> -cy of cd-hit-est / cd-hit-est-2d
	#   .4 -> -i2 and .5 -> -j2 of cd-hit-est-2d, i.e. the R1 and R2 files of the
	#         spliced reference db; the .5 placeholder previously repeated "-R1"
	#   .2, .3 and .6 are not referenced by this command
	$NGS_batch_jobs{"otu-pooled"} = {
	  "CMD_opts" => ["150", "100", "0.97", "0.0001", "path_to_spliced_ref_db-R1", "path_to_spliced_ref_db-R2", "75"],
	  "execution" => "sh_1", # where to execute
	  "cores_per_cmd" => 2, # number of threads used by command below
	  "no_parallel" => 1, # number of total jobs to run using command below
	  "command" => <<EOD,
#### before running
#### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt
$CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.log
$CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
-cx \\CMDOPTS.0 -cy \\CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.ref.log
$CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
$CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
$CD_HIT_dir/usecases/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log

EOD
	};
	112
	113	##############################################################################################
	114	########## END
	115	1;
	116

+1189

-0

usecases/Miseq-16S/NG-Omics-WF.pl less more

	0	#!/usr/bin/perl
	1	# =============================== NG-Omics-WF ==================================
	2	# _ _ _____ ____ _ __ ________
	3	# \| \ \| \|/ ____\| / __ \ (_) \ \ / / ____\|
	4	# \| \\| \| \| __ ______\| \| \| \|_ __ ___ _ ___ ___ _____\ \ /\ / /\| \|__
	5	# \| . ` \| \| \|_ \|______\| \| \| \| '_ ` _ \\| \|/ __/ __\|______\ \/ \/ / \| __\|
	6	# \| \|\ \| \|__\| \| \| \|__\| \| \| \| \| \| \| \| (__\__ \ \ /\ / \| \|
	7	# \|_\| \_\|\_____\| \____/\|_\| \|_\| \|_\|_\|\___\|___/ \/ \/ \|_\|
	8	#
	9	# =========================== Next Generation Omics data workflow tools ========
	10	#
	11	# Workflow tools for next generation genomics, metagenomics, RNA-seq
	12	# and other types of omics data analysis,
	13	#
	14	# Software originally developed since 2010 by Weizhong Li at UCSD
	15	# currently at JCVI
	16	#
	17	# http://weizhongli-lab.org/ngomicswf liwz@sdsc.edu
	18	# ==============================================================================
	19
	20	use Getopt::Std;
	21	use POSIX;
	22
	23	getopts("i:R:s:J:Q:r:j:Z:t:S:T:",\%opts);
	24	die usage() unless ($opts{i} and ($opts{s} or $opts{S}));
	25
	26	my $sample_in = $opts{s};
	27	my $sample_command_in = $opts{S}; #### ';' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq;sample2:R1.fq:R2.fq or sample1;sample2;sample3
	28	my $input_conf = $opts{i};
	29	my $this_task = $opts{J};
	30	our $G_NGS_root = $opts{r};
	31	my $queue_system = $opts{Q}; $queue_system = "SGE" unless $queue_system;
	32	my $subset_wfs = $opts{R};
	33	my $subset_jobs = $opts{j};
	34	my $second_opt = $opts{Z};
	35	my $opt_file = $opts{t};
	36	my $opt_command_in = $opts{T}; #### ';' delimited jobs, ":" delimited entries, e.g. JobID_A:opt0:opt1:opt2;JobID_B:opt0:opt1
	37
	38	my $pwd = `pwd`; chop($pwd);
	39	my $sleep_time_min = 15;
	40	my $sleep_time_max = 120;
	41	my $log_dir = "$pwd/WF-LOG";
	42	my $log_file = "$log_dir/LOG";
	43	my $log_fileq = "$log_dir/LOGq";
	44	my $sh_dir = "$pwd/WF-sh";
	45	my $sh_bundle_dir = "$pwd/WF-sh-bundle";
	46	my $subset_flag = 0; #### run only one job, subset of jobs, or jobs in sub workflows
	47	my %subset_jobs = ();
	48	my %qstat_xml_data = ();
	49	my ($i, $j, $k, $ll, $cmd);
	50
	51	######## scan through WF configuration
	52	######## and generate job list
	53	require $input_conf;
	54	my %job_list = (); # as $job_list{$t_job_id}{$t_sample_id} = {};
	55	my ($t_sample_id, $t_job_id, $t_execution_id);
	56	my ($t_sample, $t_job, $t_execution);
	57	task_level_jobs();
	58	my @NGS_batch_jobs = sort {($NGS_batch_jobs{$a}->{'job_level'} <=> $NGS_batch_jobs{$b}->{'job_level'}) or ($a cmp $b)} keys %NGS_batch_jobs;
	59
	60	$cmd = `mkdir -p $log_dir` unless (-e $log_dir);
	61	$cmd = `mkdir -p $sh_dir` unless (-e $sh_dir);
	62	$cmd = `mkdir -p $sh_bundle_dir` unless (-e $sh_bundle_dir);
	63	open(LOG, ">> $log_file") \|\| die "can not write to $log_file";
	64
	65	######## parse NGS_samples
	66	my %NGS_sample_data = ();
	67	my @NGS_samples = ();
	68	if (defined($sample_in)) {
	69	open(TMP, $sample_in) \|\| die "can not open $sample_in";
	70	while($ll=<TMP>){
	71	next if ($ll =~ /^#/);
	72	next unless ($ll =~ /^\w/); chop($ll);
	73	my ($id, @data) = split(/\s+/,$ll);
	74	push(@NGS_samples, $id);
	75	$NGS_sample_data{$id} = [@data];
	76	if (not (-e $id)) { $cmd = `mkdir $id`;}
	77	}
	78	close(TMP);
	79	}
	80	elsif (defined($sample_command_in)) {
	81	my @lls = split(/,/, $sample_command_in);
	82	foreach $ll (@lls) {
	83	my ($id, @data) = split(/:/, $ll);
	84	push(@NGS_samples, $id);
	85	$NGS_sample_data{$id} = [@data];
	86	if (not (-e $id)) { $cmd = `mkdir $id`;}
	87	}
	88	}
	89	else {
	90	die "no input samples";
	91	}
	92
	93	my %CMD_opts = ();
	94	if (-e $opt_file) {
	95	##format example
	96	##CMDOPT JobID_A:opt0:opt1:opt2
	97	##CMDOPT JobID_B:opt0:opt1
	98	##CMDOPT JobID_C:opt0:opt1:opt2:opt3
	99	open(TMP, $opt_file) \|\| die "can not open $opt_file";
	100	while($ll = <TMP>){
	101	next if ($ll =~ /^#/);
	102	next unless ($ll =~ /^CMDOPT/);
	103	chop($ll);
	104	my ($i, $opt1) = split(/\s+/, $ll);
	105	my ($job_id, @opts) = split(/:/, $opt1);
	106	$CMD_opts{$job_id} = [@opts];
	107	}
	108	close(TMP);
	109	}
	110	elsif ($opt_command_in) {
	111	my @lls = split(/,/, $opt_command_in);
	112	foreach $ll (@lls) {
	113	my ($job_id, @opts) = split(/:/, $ll);
	114	$CMD_opts{$job_id} = [@opts];
	115	}
	116	}
	117
	118	########## processing subset of jobs
	119	if ($subset_wfs) {
	120	my @wfs = split(/,/, $subset_wfs);
	121	$subset_flag = 1;
	122	foreach $i (@wfs) {
	123	my @jobs = @{ $NGS_batch_sets{$i}->{"jobs"} };
	124	foreach $j (@jobs) { $subset_jobs{$j} = 1; }
	125	}
	126	}
	127	if ($subset_jobs) {
	128	$subset_flag = 1;
	129	my @jobs = split(/,/, $subset_jobs);
	130	foreach $j (@jobs) { $subset_jobs{$j} = 1; }
	131	add_subset_jobs_by_dependency();
	132	}
	133	if ($subset_flag) {
	134	my $job_str = join(" ", keys %subset_jobs);
	135	write_log("Running subset of jobs: $job_str");
	136	}
	137
	138	my $verify_flag = 0;
	139	foreach $t_job_id (keys %NGS_batch_jobs) {
	140	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	141	$t_job = $NGS_batch_jobs{$t_job_id};
	142	$t_execution = $NGS_executions{ $t_job->{"execution"} };
	143
	144	my $pe_parameter = ""; #### setup pe parameters
	145	if ($t_execution->{'type'} eq "qsub-pe") {
	146	my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
	147	$t_cores_per_cmd = 1 unless ($t_cores_per_cmd);
	148	$pe_parameter = "#\$ -pe orte $t_cores_per_cmd";
	149	}
	150
	151	if ($t_job->{"cores_per_cmd"} > $t_execution->{"cores_per_node"} ) {
	152	$verify_flag = 1;
	153	write_log("$t_job_id needs $t_job->{\"cores_per_cmd\"} cores, but $t_job->{\"execution\"} only has $t_execution->{\"cores_per_node\"} cores");
	154	}
	155
	156	my $cmds_per_node = POSIX::floor( $t_execution->{"cores_per_node"} / $t_job->{"cores_per_cmd"});
	157	my $nodes_total = POSIX::ceil($t_job->{"no_parallel"} / $cmds_per_node);
	158	$t_job->{"cmds_per_node"} = $cmds_per_node;
	159	$t_job->{"nodes_total"} = $nodes_total;
	160
	161	if ($t_job->{"nodes_total"} > $t_execution->{"number_nodes"}) {
	162	$verify_flag = 1;
	163	write_log("$t_job_id needs $t_job->{\"nodes_total\"} nodes, but $t_job->{\"execution\"} only has $t_execution->{\"number_nodes\"} nodes");
	164	}
	165
	166	my @CMD_opts = ();
	167	@CMD_opts = @{$t_job->{CMD_opts}} if (defined($t_job->{CMD_opts} ));
	168	@CMD_opts = @{$CMD_opts{$t_job_id}} if (defined($CMD_opts{$t_job_id})); #### command line take over default
	169
	170	foreach $t_sample_id (@NGS_samples) {
	171	my @t_commands = split(/\t/, $t_job->{"command"});
	172	my $t_command = "";
	173	foreach my $c0 (@t_commands) {
	174	my $c1 = $c0;
	175	$c1 =~ s/\\SAMPLE/$t_sample_id/g;
	176	$c1 =~ s/\\SELF/$t_job_id/g;
	177	# take it easy, assuming maxium 20 input files
	178	$c1 =~ s/\\INFILES\.0/$t_job->{"infiles"}->[0]/g; $c1 =~ s/\\INFILES\.10/$t_job->{"infiles"}->[10]/g;
	179	$c1 =~ s/\\INFILES\.1/$t_job->{"infiles"}->[1]/g; $c1 =~ s/\\INFILES\.11/$t_job->{"infiles"}->[11]/g;
	180	$c1 =~ s/\\INFILES\.2/$t_job->{"infiles"}->[2]/g; $c1 =~ s/\\INFILES\.12/$t_job->{"infiles"}->[12]/g;
	181	$c1 =~ s/\\INFILES\.3/$t_job->{"infiles"}->[3]/g; $c1 =~ s/\\INFILES\.13/$t_job->{"infiles"}->[13]/g;
	182	$c1 =~ s/\\INFILES\.4/$t_job->{"infiles"}->[4]/g; $c1 =~ s/\\INFILES\.14/$t_job->{"infiles"}->[14]/g;
	183	$c1 =~ s/\\INFILES\.5/$t_job->{"infiles"}->[5]/g; $c1 =~ s/\\INFILES\.15/$t_job->{"infiles"}->[15]/g;
	184	$c1 =~ s/\\INFILES\.6/$t_job->{"infiles"}->[6]/g; $c1 =~ s/\\INFILES\.16/$t_job->{"infiles"}->[16]/g;
	185	$c1 =~ s/\\INFILES\.7/$t_job->{"infiles"}->[7]/g; $c1 =~ s/\\INFILES\.17/$t_job->{"infiles"}->[17]/g;
	186	$c1 =~ s/\\INFILES\.8/$t_job->{"infiles"}->[8]/g; $c1 =~ s/\\INFILES\.18/$t_job->{"infiles"}->[18]/g;
	187	$c1 =~ s/\\INFILES\.9/$t_job->{"infiles"}->[9]/g; $c1 =~ s/\\INFILES\.19/$t_job->{"infiles"}->[19]/g;
	188
	189	$c1 =~ s/\\DATA\.0/$NGS_sample_data{$t_sample_id}->[0]/g; $c1 =~ s/\\DATA\.10/$NGS_sample_data{$t_sample_id}->[10]/g;
	190	$c1 =~ s/\\DATA\.1/$NGS_sample_data{$t_sample_id}->[1]/g; $c1 =~ s/\\DATA\.11/$NGS_sample_data{$t_sample_id}->[11]/g;
	191	$c1 =~ s/\\DATA\.2/$NGS_sample_data{$t_sample_id}->[2]/g; $c1 =~ s/\\DATA\.12/$NGS_sample_data{$t_sample_id}->[12]/g;
	192	$c1 =~ s/\\DATA\.3/$NGS_sample_data{$t_sample_id}->[3]/g; $c1 =~ s/\\DATA\.13/$NGS_sample_data{$t_sample_id}->[13]/g;
	193	$c1 =~ s/\\DATA\.4/$NGS_sample_data{$t_sample_id}->[4]/g; $c1 =~ s/\\DATA\.14/$NGS_sample_data{$t_sample_id}->[14]/g;
	194	$c1 =~ s/\\DATA\.5/$NGS_sample_data{$t_sample_id}->[5]/g; $c1 =~ s/\\DATA\.15/$NGS_sample_data{$t_sample_id}->[15]/g;
	195	$c1 =~ s/\\DATA\.6/$NGS_sample_data{$t_sample_id}->[6]/g; $c1 =~ s/\\DATA\.16/$NGS_sample_data{$t_sample_id}->[16]/g;
	196	$c1 =~ s/\\DATA\.7/$NGS_sample_data{$t_sample_id}->[7]/g; $c1 =~ s/\\DATA\.17/$NGS_sample_data{$t_sample_id}->[17]/g;
	197	$c1 =~ s/\\DATA\.8/$NGS_sample_data{$t_sample_id}->[8]/g; $c1 =~ s/\\DATA\.18/$NGS_sample_data{$t_sample_id}->[18]/g;
	198	$c1 =~ s/\\DATA\.9/$NGS_sample_data{$t_sample_id}->[9]/g; $c1 =~ s/\\DATA\.19/$NGS_sample_data{$t_sample_id}->[19]/g;
	199
	200	$c1 =~ s/\\INJOBS\.0/$t_job->{"injobs"}->[0]/g; $c1 =~ s/\\INJOBS\.10/$t_job->{"injobs"}->[10]/g;
	201	$c1 =~ s/\\INJOBS\.1/$t_job->{"injobs"}->[1]/g; $c1 =~ s/\\INJOBS\.11/$t_job->{"injobs"}->[11]/g;
	202	$c1 =~ s/\\INJOBS\.2/$t_job->{"injobs"}->[2]/g; $c1 =~ s/\\INJOBS\.12/$t_job->{"injobs"}->[12]/g;
	203	$c1 =~ s/\\INJOBS\.3/$t_job->{"injobs"}->[3]/g; $c1 =~ s/\\INJOBS\.13/$t_job->{"injobs"}->[13]/g;
	204	$c1 =~ s/\\INJOBS\.4/$t_job->{"injobs"}->[4]/g; $c1 =~ s/\\INJOBS\.14/$t_job->{"injobs"}->[14]/g;
	205	$c1 =~ s/\\INJOBS\.5/$t_job->{"injobs"}->[5]/g; $c1 =~ s/\\INJOBS\.15/$t_job->{"injobs"}->[15]/g;
	206	$c1 =~ s/\\INJOBS\.6/$t_job->{"injobs"}->[6]/g; $c1 =~ s/\\INJOBS\.16/$t_job->{"injobs"}->[16]/g;
	207	$c1 =~ s/\\INJOBS\.7/$t_job->{"injobs"}->[7]/g; $c1 =~ s/\\INJOBS\.17/$t_job->{"injobs"}->[17]/g;
	208	$c1 =~ s/\\INJOBS\.8/$t_job->{"injobs"}->[8]/g; $c1 =~ s/\\INJOBS\.18/$t_job->{"injobs"}->[18]/g;
	209	$c1 =~ s/\\INJOBS\.9/$t_job->{"injobs"}->[9]/g; $c1 =~ s/\\INJOBS\.19/$t_job->{"injobs"}->[19]/g;
	210
	211	$c1 =~ s/\\CMDOPTS\.0/$CMD_opts[0]/g; $c1 =~ s/\\CMDOPTS\.10/$CMD_opts[10]/g;
	212	$c1 =~ s/\\CMDOPTS\.1/$CMD_opts[1]/g; $c1 =~ s/\\CMDOPTS\.11/$CMD_opts[11]/g;
	213	$c1 =~ s/\\CMDOPTS\.2/$CMD_opts[2]/g; $c1 =~ s/\\CMDOPTS\.12/$CMD_opts[12]/g;
	214	$c1 =~ s/\\CMDOPTS\.3/$CMD_opts[3]/g; $c1 =~ s/\\CMDOPTS\.13/$CMD_opts[13]/g;
	215	$c1 =~ s/\\CMDOPTS\.4/$CMD_opts[4]/g; $c1 =~ s/\\CMDOPTS\.14/$CMD_opts[14]/g;
	216	$c1 =~ s/\\CMDOPTS\.5/$CMD_opts[5]/g; $c1 =~ s/\\CMDOPTS\.15/$CMD_opts[15]/g;
	217	$c1 =~ s/\\CMDOPTS\.6/$CMD_opts[6]/g; $c1 =~ s/\\CMDOPTS\.16/$CMD_opts[16]/g;
	218	$c1 =~ s/\\CMDOPTS\.7/$CMD_opts[7]/g; $c1 =~ s/\\CMDOPTS\.17/$CMD_opts[17]/g;
	219	$c1 =~ s/\\CMDOPTS\.8/$CMD_opts[8]/g; $c1 =~ s/\\CMDOPTS\.18/$CMD_opts[18]/g;
	220	$c1 =~ s/\\CMDOPTS\.9/$CMD_opts[9]/g; $c1 =~ s/\\CMDOPTS\.19/$CMD_opts[19]/g;
	221	$t_command .= "$c1\n";
	222	}
	223
	224
	225	my @t_infiles = map { "$t_sample_id/$_" } @{$t_job->{"infiles"}};
	226	my @t_injobs = @{$t_job->{"injobs"}};
	227	my $t_sh_file = "$sh_dir/$t_job_id.$t_sample_id.sh";
	228	my $f_start = "$pwd/$t_sample_id/$t_job_id/WF.start.date";
	229	my $f_complete = "$pwd/$t_sample_id/$t_job_id/WF.complete.date";
	230	my $f_cpu = "$pwd/$t_sample_id/$t_job_id/WF.cpu";
	231	$job_list{$t_job_id}{$t_sample_id} = {
	232	'sample_id' => $t_sample_id,
	233	'job_id' => $t_job_id,
	234	'status' => 'wait', #### status can be wait (input not ready), ready (input ready), submitted (submitted or running), completed
	235	'command' => $t_command,
	236	'sh_file' => $t_sh_file,
	237	'infiles' => [@t_infiles],
	238	'injobs' => [@t_injobs],
	239	'start_file' => $f_start,
	240	'complete_file'=> $f_complete,
	241	'cpu_file' => $f_cpu,
	242	};
	243
	244	my $v_command = "";
	245	foreach my $vf (@{$t_job->{"non_zero_files"}}) {
	246	$v_command .= "if ! [ -s $t_job_id/$vf ]; then echo \"zero size $t_job_id/$vf\"; exit; fi\n";
	247	}
	248
	249
	250	if (not -e $t_sh_file) {
	251	write_log("Write sh file to $t_sh_file");
	252	open(TSH, "> $t_sh_file") \|\| die "can not write to $t_sh_file\n";
	253	print TSH <<EOD;
	254	$t_execution->{"template"}
	255	$pe_parameter
	256
	257	my_host=`hostname`
	258	my_pid=\$\$
	259	my_core=$t_job->{"cores_per_cmd"}
	260	my_queue=$t_job->{"execution"}
	261	my_time_start=`date +%s`;
	262
	263	cd $pwd
	264	cd $t_sample_id
	265	mkdir $t_job_id
	266	if ! [ -f $f_start ]; then date +\%s > $f_start; fi
	267	$t_command
	268	$v_command
	269	date +\%s > $f_complete
	270	#times >> $f_cpu
	271
	272	my_time_end=`date +%s`;
	273	my_time_spent=\$((my_time_end-my_time_start))
	274	echo "sample=$t_sample_id job=$t_job_id host=\$my_host pid=\$my_pid queue=\$my_queue cores=\$my_core time_start=\$my_time_start time_end=\$my_time_end time_spent=\$my_time_spent" >> $f_cpu
	275
	276	EOD
	277	close(TSH);
	278	#validate_cmd_line($t_command, $t_sh_file, $t_sample_id);
	279	}
	280	} ########## foreach my $c0 (@t_commands)
	281	} ########## foreach $t_job (keys %NGS_batch_jobs)
	282
	283	die if ($verify_flag);
	284
	285	if ($this_task eq "log-cpu" ) { task_log_cpu(); exit 0;}
	286	elsif ($this_task eq "list-jobs" ) { task_list_jobs(); exit 0;}
	287	elsif ($this_task eq "snapshot" ) { task_snapshot(); exit 0;}
	288	elsif ($this_task eq "delete-jobs" ) { task_delete_jobs($second_opt); exit 0;}
	289	elsif ($this_task eq "write-sh" ) { exit 0;}
	290	elsif ($this_task ) { die "undefined task $this_task";}
	291
	292	################################################################################################
	293	# _____ _ _ _____ _____ _ _ _ _ _
	294	# \| __ \ \| \ \| \|/ ____\|/ ____\|\| \| \| \| \| \| (_) \| \|
	295	# \| \|__) \| _ _ __ \| \\| \| \| __\| (___ \| \|__ __ _\| \|_ ___\| \|__ _ ___ \| \|__ ___
	296	# \| _ / \| \| \| '_ \ \| . ` \| \| \|_ \|\___ \ \| '_ \ / _` \| __/ __\| '_ \ \| \|/ _ \\| '_ \/ __\|
	297	# \| \| \ \ \|_\| \| \| \| \| \| \|\ \| \|__\| \|____) \|\| \|_) \| (_\| \| \|\| (__\| \| \| \| \| \| (_) \| \|_) \__ \
	298	# \|_\| \_\__,_\|_\| \|_\| \|_\| \_\|\_____\|_____/ \|_.__/ \__,_\|\__\___\|_\| \|_\| \| \|\___/\|_.__/\|___/
	299	# ______ ______ _/ \|
	300	# \|______\| \|______\|__/
########## Run NGS_batch_jobs for each sample (banner text generated with http://patorjk.com/software/taag)
	302	################################################################################################
	303
	304
	305	my %execution_submitted = (); # number of submitted jobs (qsub) or threads (local sh)
	306	my $sleep_time = $sleep_time_min;
	307	while(1) {
	308	my $flag_job_done = 1;
	309
	310	########## reset execution_submitted to 0
	311	foreach $i (keys %NGS_executions) { $execution_submitted{$i} = 0; }
	312
	313	my $flag_qstat_xml_call = 0;
	314	foreach $t_job_id (keys %NGS_batch_jobs) {
	315	my $t_job = $NGS_batch_jobs{$t_job_id};
	316	my $t_execution = $NGS_executions{ $t_job->{"execution"} };
	317	my $exe_type = $t_execution->{type};
	318	$flag_qstat_xml_call = 1 if (($queue_system eq "SGE") and (($exe_type eq "qsub") or ($exe_type eq "qsub-pe")));
	319	}
	320	SGE_qstat_xml_query() if $flag_qstat_xml_call;
	321
	322	########## check and update job status for submitted jobs
	323	foreach $t_job_id (keys %NGS_batch_jobs) {
	324	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	325	my $t_job = $NGS_batch_jobs{$t_job_id};
	326	foreach $t_sample_id (@NGS_samples) {
	327	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	328	my $status = $t_sample_job->{'status'};
	329
	330	next if ($status eq "completed");
	331	########## check file system to update job status
	332	########## in case this is a restart run
	333	check_submitted_job($t_job_id, $t_sample_id);
	334	next if ($t_sample_job->{'status'} eq "completed");
	335	$flag_job_done = 0;
	336	}
	337	}
	338
	339	if ($flag_job_done) { write_log("job completed!"); last; }
	340
	341	########## check and update job status based on dependance
	342	foreach $t_job_id (keys %NGS_batch_jobs) {
	343	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	344	my $t_job = $NGS_batch_jobs{$t_job_id};
	345	foreach $t_sample_id (@NGS_samples) {
	346	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	347	my $status = $t_sample_job->{'status'};
	348
	349	next unless ($status eq "wait");
	350	my @t_infiles = @{ $t_sample_job->{'infiles'} };
	351	my @t_injobs = @{ $t_sample_job->{'injobs'} };
	352	my $t_ready_flag = 1;
	353
	354	foreach $i (@t_infiles) {
	355	next if (-s $i); #### non-zero size file
	356	$t_ready_flag = 0;
	357	last;
	358	}
	359
	360	foreach $i (@t_injobs) {
	361	next if ( $job_list{$i}{$t_sample_id}->{'status'} eq "completed"); #### injob completed
	362	$t_ready_flag = 0;
	363	last;
	364	}
	365	if ($t_ready_flag) {
	366	$t_sample_job->{"status"} = "ready";
	367	write_log("$t_job_id,$t_sample_id: change status to ready");
	368	}
	369	}
	370	}
	371
	372	########## submit local sh jobs
	373	my $has_submitted_some_jobs = 0;
	374	foreach $t_job_id (keys %NGS_batch_jobs) {
	375	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	376	my $t_job = $NGS_batch_jobs{$t_job_id};
	377	my $t_execution = $NGS_executions{ $t_job->{"execution"} };
	378	my $t_execution_id = $t_job->{"execution"};
	379
	380	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	381	next unless ($t_execution->{'type'} eq "sh");
	382	next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"cores_per_node"} ); #### all cores are used
	383
	384	foreach $t_sample_id (@NGS_samples) {
	385	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	386	my $status = $t_sample_job->{'status'};
	387	next unless ($status eq "ready");
	388	next if ( ($execution_submitted{$t_execution_id} + $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"}) > $t_execution->{"cores_per_node"} ); #### no enough available cores
	389	#### now submitting
	390
	391	my $t_sh_file = $t_sample_job->{'sh_file'};
	392	my $t_sh_pid = "$t_sh_file.pids";
	393	for ($i=0; $i<$t_job->{"no_parallel"}; $i++) {
	394	$cmd = `sh $t_sh_file >/dev/null 2>&1 &`;
	395	}
	396	$cmd = `touch $t_sh_pid`;
	397	$t_sample_job->{'status'} = "submitted";
	398	write_log("$t_job_id,$t_sample_id: change status to submitted");
	399	$execution_submitted{ $t_execution_id } += $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};
	400	$has_submitted_some_jobs = 1;
	401	}
	402	}
	403
	404	########## submit qsub-pe jobs, multiple jobs may share same node
	405	foreach $t_job_id (keys %NGS_batch_jobs) {
	406	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	407	my $t_job = $NGS_batch_jobs{$t_job_id};
	408	my $t_execution = $NGS_executions{ $t_job->{"execution"} };
	409	my $t_execution_id = $t_job->{"execution"};
	410
	411	next unless ($t_execution->{'type'} eq "qsub-pe");
	412	next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"number_nodes"} ); #### resource full
	413
	414	my $t_cores_per_node = $t_execution->{"cores_per_node"};
	415	my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
	416	my $t_cores_per_job = $t_cores_per_cmd * $t_job->{"no_parallel"};
	417	my $t_nodes_per_job = $t_cores_per_job / $t_cores_per_node;
	418
	419	foreach $t_sample_id (@NGS_samples) {
	420	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	421	my $status = $t_sample_job->{'status'};
	422	next unless ($status eq "ready");
	423
	424	my $t_sh_file = $t_sample_job->{'sh_file'};
	425	my $t_sh_pid = "$t_sh_file.pids";
	426	open(TID, "> $t_sh_pid") \|\| die "can not write to $t_sh_pid";
	427
	428	for ($i=0; $i<$t_job->{"no_parallel"}; $i++) {
	429	my $t_stderr = "$t_sh_file.$i.stderr";
	430	my $t_stdout = "$t_sh_file.$i.stdout";
	431	$cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_file 2>$log_fileq`;
	432	my $qsub_id = 0;
	433	if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
	434	print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
	435	$execution_submitted{$t_execution_id} += $t_nodes_per_job;
	436	write_log("$t_sh_bundle submitted for sample $t_sample_id, qsubid $cmd");
	437	}
	438
	439	close(TID);
	440	$has_submitted_some_jobs = 1;
	441	$t_sample_job->{'status'} = "submitted";
	442	}
	443	} ########## END foreach $t_job_id (keys %NGS_batch_jobs)
	444
	445	########## submit qsub jobs
	446	foreach $t_job_id (keys %NGS_batch_jobs) {
	447	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	448	my $t_job = $NGS_batch_jobs{$t_job_id};
	449	my $t_execution = $NGS_executions{ $t_job->{"execution"} };
	450	my $t_execution_id = $t_job->{"execution"};
	451
	452	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	453	next unless ($t_execution->{'type'} eq "qsub");
	454	next if ( $execution_submitted{$t_execution_id} >= $t_execution->{"number_nodes"} ); #### resource full
	455
	456	my $t_cores_per_node = $t_execution->{"cores_per_node"};
	457	my $t_cores_per_cmd = $t_job->{"cores_per_cmd"};
	458	my $t_cores_per_job = $t_cores_per_cmd * $t_job->{"no_parallel"};
	459	my $t_nodes_per_job = POSIX::ceil($t_cores_per_job / $t_cores_per_node);
	460	my $t_cmds_per_node = int($t_cores_per_node / $t_cores_per_cmd);
	461	my $t_jobs_per_node = int($t_cores_per_node / $t_cores_per_job);
	462
	463	########## 1. this loop process jobs need 1 or more nodes per sample, ie. bundle within a sample, e.g. blast against refseq
	464	foreach $t_sample_id (@NGS_samples) {
	465	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	466	my $status = $t_sample_job->{'status'};
	467	next unless ($status eq "ready");
	468	next unless ($t_jobs_per_node <= 1); #### unless need >= 1 node, including jobs use between (51%-100%) cores per node
	469	last if ( ($execution_submitted{$t_execution_id} + $t_nodes_per_job) > $t_execution->{"number_nodes"}); #### no enough available queues
	470
	471	my $t_sh_file = $t_sample_job->{'sh_file'};
	472	my $t_sh_bundle = "$sh_bundle_dir/$t_job_id.$t_sample_id.$$.sh";
	473	my $t_stderr = "$t_sh_bundle.stderr";
	474	my $t_stdout = "$t_sh_bundle.stdout";
	475	my $t_sh_pid = "$t_sh_file.pids";
	476
	477	open(TID, "> $t_sh_pid") \|\| die "can not write to $t_sh_pid";
	478	open(BSH, "> $t_sh_bundle") \|\| die "can not write to $t_sh_bundle";
	479	print BSH <<EOD;
	480	$t_execution->{"template"}
	481	cd $pwd
	482	EOD
	483	for ($i=0; $i<$t_cmds_per_node; $i++) {
	484	print BSH "sh $t_sh_file &\n";
	485	print BSH "sleep 3\n";
	486	}
	487	print BSH "wait\n";
	488	close(BSH);
	489
	490	for ($i=0; $i<$t_nodes_per_job; $i++) {
	491	$cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_bundle 2>$log_fileq`;
	492	my $qsub_id = 0;
	493	if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
	494	print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
	495	$execution_submitted{$t_execution_id}++;
	496	write_log("$t_sh_bundle submitted for sample $t_sample_id, qsubid $cmd");
	497	}
	498	close(TID);
	499	$has_submitted_some_jobs = 1;
	500	$t_sample_job->{'status'} = "submitted";
	501	} ########## END foreach $t_sample_id (@NGS_samples)
	502
	503
	504	########## 2. this loop process jobs need less than 1 node per sample, ie. bundle jobs across samples, e.g. qc
	505	my @t_bundle = ();
	506	my $available_nodes = $t_execution->{"number_nodes"} - $execution_submitted{$t_execution_id};
	507	my $no_sample_can_be_processed = $available_nodes * $t_jobs_per_node;
	508	my @t_samples = ();
	509	my $t_batch_no = 0;
	510
	511	foreach $t_sample_id (@NGS_samples) { #### same loop as next, to find out @t_samples and last sample can run
	512	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	513	my $status = $t_sample_job->{'status'};
	514	next unless ($status eq "ready");
	515	next unless ($t_jobs_per_node > 1); #### unless a node can host 2 or more jobs
	516	last if ( $t_execution->{"number_nodes"} - $execution_submitted{$t_execution_id} <=0);
	517	push(@t_samples, $t_sample_id);
	518	}
	519	my $last_sample_can_run = $t_samples[-1];
	520	@t_samples = ();
	521
	522	foreach $t_sample_id (@NGS_samples) {
	523	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	524	my $status = $t_sample_job->{'status'};
	525	next unless ($status eq "ready");
	526	next unless ($t_jobs_per_node > 1); #### unless a node can host 2 or more jobs
	527	last if ( $t_execution->{"number_nodes"} - $execution_submitted{$t_execution_id} <=0);
	528	push(@t_samples, $t_sample_id);
	529
	530	#### bundle @t_samples to one qsub job
	531	if ((($#t_samples+1) == $t_jobs_per_node) or ($t_sample_id eq $last_sample_can_run)) {
	532	my $t_sh_bundle = "$sh_bundle_dir/$t_job_id.samples-$t_batch_no.$$.sh";
	533	my $t_stderr = "$t_sh_bundle.stderr";
	534	my $t_stdout = "$t_sh_bundle.stdout";
	535
	536	open(BSH, "> $t_sh_bundle") \|\| die "can not write to $t_sh_bundle";
	537	print BSH <<EOD;
	538	$t_execution->{"template"}
	539	cd $pwd
	540	EOD
	541	foreach $i (@t_samples) {
	542	my $t_sh_file = $job_list{$t_job_id}{$i}->{'sh_file'};
	543	for ($j=0; $j<$t_job->{"no_parallel"}; $j++) {
	544	print BSH "sh $t_sh_file &\n";
	545	print BSH "sleep 3\n";
	546	}
	547	}
	548	print BSH "wait\n";
	549	close(BSH);
	550
	551	$cmd = `qsub $t_execution->{"command_name_opt"} $t_job_id $t_execution->{"command_err_opt"} $t_stderr $t_execution->{"command_out_opt"} $t_stdout $t_sh_bundle 2>$log_fileq`;
	552	my $qsub_id = 0;
	553	if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
	554
	555	foreach $i (@t_samples) {
	556	my $t_sh_file = $job_list{$t_job_id}{$i}->{'sh_file'};
	557	my $t_sh_pid = "$t_sh_file.pids";
	558	open(TID, "> $t_sh_pid") \|\| die "can not write to $t_sh_pid";
	559	print TID "$qsub_id\n"; #### $cmd returns qsub id, write these ids to pid file for future qstat
	560	write_log("$t_sh_bundle submitted for sample $i, qsubid $cmd");
	561	close(TID);
	562	$job_list{$t_job_id}{$i}->{'status'} = "submitted";
	563	}
	564
	565	$has_submitted_some_jobs = 1;
	566	$execution_submitted{$t_execution_id}++;
	567	@t_samples = (); #### clear
	568	$t_batch_no++;
	569	}
	570	} ########## END foreach $t_sample_id (@NGS_samples)
	571	} ########## END foreach $t_job_id (keys %NGS_batch_jobs)
	572
	573
	574	#### if has submitted some jobs, reset waiting time, otherwise double waiting time
	575	print_job_status_summary();
	576	if ($has_submitted_some_jobs) {
	577	$sleep_time = $sleep_time_min;
	578	}
	579	else {
	580	$sleep_time = $sleep_time*2;
	581	$sleep_time = $sleep_time_max if ($sleep_time > $sleep_time_max);
	582	}
	583	write_log("sleep $sleep_time seconds");
	584	sleep($sleep_time);
	585	} ########## END while(1)
	586
	587	task_log_cpu();
	588	################################################################################
	589	########## END Run NGS_batch_jobs for each samples
	590	################################################################################
	591
	592	close(LOG);
	593	##########
	594
	595
	596	sub write_log {
	597	my @txt = @_;
	598	my $i;
	599	my $date = `date`; chop($date);
	600	foreach $i (@txt) {
	601	print LOG "$date $i\n";
	602	print STDERR "$date $i\n";
	603	}
	604	print LOG "\n";
	605	print STDERR "\n";
	606	}
	607	########## END write_log
	608
	609	sub SGE_qstat_xml_query {
	610	my ($i, $j, $k, $cmd, $ll);
	611	%qstat_xml_data = (); #### global
	612	$cmd = `qstat -f -xml`;
	613	if ($cmd =~ /<queue_info/) { #### dummy
	614	$qstat_xml_data{"NULL"}= ["NULL","NULL"];
	615	}
	616
	617	my @lls = split(/\n/, $cmd);
	618	$i = 2; #### skip first 2 lines
	619	for (; $i<$#lls+1; $i++) {
	620	if ($lls[$i] =~ /<job_list/) {
	621	my ($id, $name, $state);
	622	for (; $i<$#lls+1; $i++) {
	623	last if ($lls[$i] =~ /<\/job_list/);
	624	if ($lls[$i] =~ /<JB_job_number>(\d+)/) { $id = $1;}
	625	if ($lls[$i] =~ /<JB_name>([^<]+)/) { $name = $1;}
	626	if ($lls[$i] =~ /<state>([^<]+)/) {$state = $1;}
	627	}
	628	if (defined($id) and defined($name) and defined($state)) {
	629	$qstat_xml_data{$id} = [$name, $state];
	630	}
	631	}
	632	}
	633	}
	634
	635	########## check submitted job by checking pids, or qsub ids
	636	########## update job status from wait\|ready -> submitted if pid file exit (in case of restart of this script)
	637	########## update job status from wait\|ready\|submitted -> completed if sh calls or qsub calls finished
	638	########## these pids or qsub ids are done
	639	sub check_submitted_job {
	640	my ($t_job_id, $t_sample_id) = @_;
	641	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	642	my $t_job = $NGS_batch_jobs{$t_job_id};
	643	my $t_execution = $NGS_executions{ $t_job->{"execution"} };
	644
	645	my ($i, $j, $k, $flag, $ll, $cmd);
	646
	647	my $t_sh_file = $t_sample_job->{'sh_file'};
	648	my $t_sh_pid = "$t_sh_file.pids";
	649
	650	# status won't change unless there is a pid file
	651	return unless (-e $t_sh_pid);
	652
	653	my $status = $t_sample_job->{'status'};
	654	if (($status eq "wait") or ($status eq "ready")) {
	655	$t_sample_job->{'status'} = "submitted";
	656	write_log("$t_job_id,$t_sample_id: change status to submitted");
	657	}
	658
	659	my $exe_type = $t_execution->{type};
	660
	661	if ($exe_type eq "sh") {
	662	$cmd = `ps -ef \| grep "$t_sh_file" \| grep -v grep`;
	663	if ($cmd =~ /\w/) { # still running
	664	$execution_submitted{ $t_job->{"execution"} } += $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};
	665	}
	666	elsif (validate_job_files($t_job_id, $t_sample_id)) {
	667	$t_sample_job->{'status'} = "completed";
	668	write_log("$t_job_id,$t_sample_id: change status to completed");
	669	}
	670	else {
	671	$t_sample_job->{'status'} = "error";
	672	write_log("$t_job_id,$t_sample_id: change status to error");
	673	}
	674	return;
	675	}
	676	elsif (($exe_type eq "qsub") or ($exe_type eq "qsub-pe")) {
	677	my @pids = ();
	678	open(CHECK, $t_sh_pid) \|\| die "Can not open $t_sh_pid\n";
	679	while($ll = <CHECK>) {
	680	chop($ll); next unless ($ll =~ /\w/);
	681	push(@pids, $ll);
	682	}
	683	close(CHECK);
	684
	685	my $finish_flag = 1;
	686	foreach $i (@pids) {
	687	if (($queue_system eq "SGE") and %qstat_xml_data) {
	688	if (defined($qstat_xml_data{$i})) {
	689	$t_sample_job->{'status'} = "running" if (($qstat_xml_data{$i}->[1] eq "r") and ($t_sample_job->{'status'} eq "submitted"));
	690	$finish_flag = 0;
	691	$execution_submitted{ $t_job->{"execution"} } ++;
	692	}
	693	}
	694	elsif ($queue_system eq "SGE") {
	695	$cmd = `qstat -j $i \| grep job_number`;
	696	if ($cmd =~ /$i/) {
	697	$finish_flag = 0;
	698	$execution_submitted{ $t_job->{"execution"} } ++;
	699	}
	700	}
	701	else {
	702	$cmd = `qstat -r $i \| grep $i`;
	703	$j = (split(/\D/,$cmd))[0];
	704	if ($j == $i) { # this job is running
	705	$finish_flag = 0;
	706	$execution_submitted{ $t_job->{"execution"} } ++;
	707	}
	708	}
	709	}
	710	if ($finish_flag == 1) {
	711	if (validate_job_files($t_job_id, $t_sample_id)) {
	712	$t_sample_job->{'status'} = "completed";
	713	write_log("$t_job_id,$t_sample_id: change status to completed");
	714	}
	715	else {
	716	$t_sample_job->{'status'} = "error";
	717	write_log("$t_job_id,$t_sample_id: change status to error");
	718	}
	719	}
	720	return;
	721	}
	722	else {
	723	die "unknown execution type: $exe_type\n";
	724	}
	725	}
	726	########## END sub check_submitted_job
	727
	728
	729	# WF.start.date and WF.complete.date need to have non-zero size
	730	sub validate_job_files {
	731	my ($t_job_id, $t_sample_id) = @_;
	732	my ($i, $j, $k);
	733	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	734
	735	return 0 unless (-s $t_sample_job->{'start_file'} );
	736	return 0 unless (-s $t_sample_job->{'complete_file'} );
	737	return 0 unless (-s $t_sample_job->{'cpu_file'} );
	738
	739	return 1; #### pass
	740	}
	741	########## END validate_job_files
	742
	743
	744	sub print_job_status_summary {
	745	my ($t_job_id, $t_sample_id);
	746	my ($i, $j, $k);
	747
	748	my %job_status = ();
	749	my $job_total = 0;
	750	foreach $t_job_id (keys %NGS_batch_jobs) {
	751	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	752	foreach $t_sample_id (@NGS_samples) {
	753	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	754	my $status = $t_sample_job->{'status'};
	755	$job_status{$status}++;
	756	$job_total++;
	757	}
	758	}
	759
	760	print STDERR "total jobs: $job_total,";
	761	foreach $i (sort keys %job_status) {
	762	print STDERR "$i: $job_status{$i},";
	763	}
	764	print STDERR "\n";
	765	}
	766	########## END print_job_status_summary
	767
	768
	769	sub validate_cmd_line {
	770	my ($i, $j, $k);
	771	my ($t_command, $t_sh_file, $t_sample_id) = @_;
	772	my @cmds = split(/\n/,$t_command);
	773
	774	my @warn_path = ();
	775	foreach $i (@cmds) {
	776	my ($key_cmd, @opts) = split(/\s+/, $i);
	777	if ($key_cmd =~ /\//) {
	778	if (not -e $key_cmd) { push(@warn_path, $key_cmd); }
	779	}
	780	@opts = grep {/\//} @opts;
	781	foreach $j (@opts) {
	782	my @opts1 = split(/,\|;\|>\|<\|\\|/,$j);
	783	foreach $k (@opts1) {
	784	$k = "$t_sample_id/$k" unless (($k =~ /^\//) or ($k =~ /^\./));
	785	if (not -e $k) { push(@warn_path, $k); }
	786	}
	787	}
	788	}
	789
	790	if (@warn_path) {
	791	print STDERR "File or program doesn't exist in $t_sh_file: ", join(" ", @warn_path), "\n";
	792	}
	793
	794	}
	795	########## END validate_cmd_line
	796
	797	sub add_subset_jobs_by_dependency {
	798	my ($i, $j, $k, $ll, $t_job_id, $t_sample_id, $t_job);
	799
	800	while(1) {
	801	my $num_subset_jobs = scalar keys %subset_jobs;
	802
	803	foreach $t_job_id (keys %subset_jobs) {
	804	$t_job = $NGS_batch_jobs{$t_job_id};
	805	my @t_injobs = @{$t_job->{"injobs"}};
	806
	807	for $j (@t_injobs) {
	808	$subset_jobs{$j} = 1;
	809	}
	810	}
	811
	812	last if ($num_subset_jobs == scalar keys %subset_jobs);
	813	}
	814	}
	815	########## END add_subset_jobs_by_dependency
	816
	817
	818	sub task_level_jobs {
	819	my ($i, $j, $k, $ll, $t_job_id, $t_sample_id, $t_job);
	820	my %job_level = ();
	821
	822	while(1) {
	823	my $change_flag = 0;
	824
	825	foreach $t_job_id (keys %NGS_batch_jobs) {
	826	$t_job = $NGS_batch_jobs{$t_job_id};
	827	my @t_injobs = @{$t_job->{"injobs"}};
	828
	829	if (@t_injobs) {
	830	my $max_level_injob;
	831	foreach $j (@t_injobs) {
	832	next unless defined ($job_level{$j});
	833	$max_level_injob = $job_level{$j} if ($job_level{$j} > $max_level_injob);
	834	}
	835
	836	next unless (defined($max_level_injob));
	837	$max_level_injob++; #### one more level
	838	if (not defined ($job_level{$t_job_id})) {
	839	$job_level{$t_job_id}=$max_level_injob;
	840	$change_flag = 1;
	841	}
	842	elsif ($max_level_injob > $job_level{$t_job_id}) {
	843	$job_level{$t_job_id}=$max_level_injob;
	844	$change_flag = 1;
	845	}
	846	}
	847	else {
	848	if (not defined ($job_level{$t_job_id})) {
	849	$job_level{$t_job_id}=1;
	850	$change_flag = 1;
	851	}
	852	}
	853	}
	854	last unless ($change_flag);
	855	}
	856
	857	foreach $t_job_id (sort keys %NGS_batch_jobs) {
	858	$NGS_batch_jobs{$t_job_id}->{"job_level"} = $job_level{$t_job_id};
	859	}
	860	}
	861	########## END task_list_jobs
	862
	863	sub task_snapshot {
	864	my ($t_job_id, $t_sample_id);
	865	my ($i, $j, $k);
	866
	867	if ($this_task) {
	868	my $flag_qstat_xml_call = 0;
	869	foreach $t_job_id (keys %NGS_batch_jobs) {
	870	my $t_job = $NGS_batch_jobs{$t_job_id};
	871	my $t_execution = $NGS_executions{ $t_job->{"execution"} };
	872	my $exe_type = $t_execution->{type};
	873	$flag_qstat_xml_call = 1 if (($queue_system eq "SGE") and (($exe_type eq "qsub") or ($exe_type eq "qsub-pe")));
	874	}
	875	SGE_qstat_xml_query() if $flag_qstat_xml_call;
	876
	877	foreach $t_sample_id (@NGS_samples) {
	878	foreach $t_job_id (keys %NGS_batch_jobs) {
	879	check_submitted_job($t_job_id, $t_sample_id);
	880	}
	881	}
	882	}
	883
	884	my $max_len_sample = 0;
	885	foreach $t_sample_id (@NGS_samples) {
	886	$max_len_sample = length($t_sample_id) if (length($t_sample_id) > $max_len_sample);
	887	}
	888	my $max_len_job = 0;
	889	foreach $t_job_id (@NGS_batch_jobs) {
	890	$max_len_job = length($t_job_id) if (length($t_job_id) > $max_len_job);
	891	}
	892
	893	print <<EOD;
	894	Job status:
	895	.\twait
	896	-\tsubmitted
	897	r\trunning
	898	+\tcompleted
	899	!\terror
	900	EOD
	901
	902	for ($i=$max_len_job-1; $i>=0; $i--) {
	903	print ' 'x$max_len_sample, "\t";
	904	foreach $t_job_id (@NGS_batch_jobs) {
	905	print " ", ($i<length($t_job_id) ? substr(reverse($t_job_id), $i, 1):" ");
	906	}
	907	print "\n";
	908	}
	909
	910	foreach $t_sample_id (@NGS_samples) {
	911	print "$t_sample_id\t";
	912	foreach $t_job_id (@NGS_batch_jobs) {
	913	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	914	my $status = $t_sample_job->{'status'};
	915	if ($status eq "completed") { print " +";}
	916	elsif ($status eq "submitted") { print " -";}
	917	elsif ($status eq "running" ) { print " r";}
	918	elsif ($status eq "wait" ) { print " .";}
	919	elsif ($status eq "error" ) { print " !";}
	920	else { print " _";}
	921	}
	922	print "\n";
	923	}
	924	}
	925	########## END task_snapshot
	926
	927	sub task_list_jobs {
	928	my ($i, $j, $k, $ll, $t_job_id, $t_sample_id, $t_job);
	929	foreach $t_job_id (@NGS_batch_jobs) {
	930	$t_job = $NGS_batch_jobs{$t_job_id};
	931	#my @t_infiles = @{$t_job->{"infiles"}};
	932	my @t_injobs = @{$t_job->{"injobs"}};
	933
	934	#print "\tInput_files:", join(",", @t_infiles) if @t_infiles;
	935	print "$t_job_id\tIn_jobs:[" , join(",", @t_injobs), "]\tJob_level:$t_job->{'job_level'}\n";
	936	}
	937	}
	938	########## END task_list_jobs
	939
	940	sub file1_after_file2 {
	941	my ($file1, $file2) = @_;
	942
	943	# if not exist file1, assume it is in future, so it is newer
	944	if (not -e ($file1)) {return 0;}
	945	if (not -e ($file2)) {return 0;}
	946
	947	my $mtime1 = (stat($file1))[9];
	948	my $mtime2 = (stat($file2))[9];
	949
	950	return ( ($mtime1 > $mtime2) ? 1 : 0);
	951	}
	952	######## END file1_after_file2
	953
	954	sub file1_same_or_after_file2 {
	955	my ($file1, $file2) = @_;
	956
	957	# if not exist file1, assume it is in future, so it is newer
	958	if (not -e ($file1)) {return 0;}
	959	if (not -e ($file2)) {return 0;}
	960
	961	my $mtime1 = (stat($file1))[9];
	962	my $mtime2 = (stat($file2))[9];
	963
	964	return ( ($mtime1 >= $mtime2) ? 1 : 0);
	965	}
	966	######## END file1_after_file2
	967
	968
	969	sub task_delete_jobs {
	970	my $opt = shift;
	971	my ($i, $j, $k, $ll, $t_job_id, $t_sample_id);
	972	my ($mode, $c) = split(/:/, $opt);
	973	my $tmp_sh = "NGS-$$.sh";
	974
	975	open(TMPSH, "> $tmp_sh") \|\| die "can not write to file $tmp_sh";
	976	print TMPSH "#Please execute the following commands\n";
	977	foreach $t_sample_id (@NGS_samples) {
	978	my %job_to_delete_ids = ();
	979	if ($mode eq "jobids") {
	980	%job_to_delete_ids = map {$_, 1} split(/,/,$c);
	981	}
	982	elsif ($mode eq "run_after") {
	983	die "file $c doesn't exist!" unless (-e $c);
	984	foreach $t_job_id (keys %NGS_batch_jobs) {
	985	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	986	my $t_sh_file = $t_sample_job->{'sh_file'};
	987	my $t_sh_pid = "$t_sh_file.pids";
	988	next unless (-e $t_sh_pid); #### unless the job is submitted
	989	#$job_to_delete_ids{$t_job_id} = 1 if (file1_same_or_after_file2( $t_sample_job->{'start_file'} , $c));
	990	$job_to_delete_ids{$t_job_id} = 1 if (file1_same_or_after_file2( $t_sh_pid , $c));
	991
	992	}
	993	}
	994	else {
	995	die "unknown option for deleting jobs: $opt";
	996	}
	997
	998	# now %job_to_delete_ids are jobs need to be deleted
	999	# next find all jobs that depends on them, recrusively
	1000	my $no_jobs_to_delete = scalar keys %job_to_delete_ids;
	1001	while(1) {
	1002	foreach $t_job_id (keys %NGS_batch_jobs) {
	1003	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	1004	my $t_sh_file = $t_sample_job->{'sh_file'};
	1005	my $t_sh_pid = "$t_sh_file.pids";
	1006	next unless (-e $t_sh_pid); #### unless the job is submitted
	1007	my @t_injobs = @{ $t_sample_job->{'injobs'} };
	1008	foreach my $t_job_id_2 (@t_injobs) {
	1009	$job_to_delete_ids{$t_job_id} = 1 if ($job_to_delete_ids{$t_job_id_2});
	1010	}
	1011	}
	1012	last if ($no_jobs_to_delete == (scalar keys %job_to_delete_ids)); #### no more depending jobs
	1013	$no_jobs_to_delete = scalar keys %job_to_delete_ids;
	1014	}
	1015
	1016	if ($no_jobs_to_delete) {
	1017	print TMPSH "#jobs to be deleted for $t_sample_id: ", join(",", keys %job_to_delete_ids), "\n";
	1018	print "#jobs to be deleted for $t_sample_id: ", join(",", keys %job_to_delete_ids), "\n";
	1019	foreach $t_job_id (keys %job_to_delete_ids) {
	1020	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	1021	my $t_sh_file = $t_sample_job->{'sh_file'};
	1022	my $t_sh_pid = "$t_sh_file.pids";
	1023	print TMPSH "\\rm -rf $pwd/$t_sample_id/$t_job_id\n";
	1024	print TMPSH "\\rm $t_sh_pid\n";
	1025	print TMPSH "\\rm $t_sh_file..std\n";
	1026
	1027	#### find the qsub ids to be deleted
	1028	my $qids = `cat $t_sh_pid`; $qids =~ s/\n/ /g; $qids =~ s/\s+/ /g;
	1029	print TMPSH "qdel $qids\n";
	1030	}
	1031	}
	1032	}
	1033	close(TMPSH);
	1034	print "The script is not delete the file, please run $tmp_sh to delete files!!!\n\n";
	1035	}
	1036	########## END task_list_jobs
	1037
	1038	sub task_log_cpu {
	1039	my ($i, $j, $k, $ll, $t_job_id, $t_sample_id);
	1040
	1041	my %cpu_info;
	1042	foreach $t_job_id (keys %NGS_batch_jobs) {
	1043	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	1044	my $t_job = $NGS_batch_jobs{$t_job_id};
	1045	foreach $t_sample_id (@NGS_samples) {
	1046
	1047	$cpu_info{$t_job_id}{$t_sample_id} = [$t_wall, $t_cpu];
	1048	}
	1049	}
	1050
	1051	foreach $t_sample_id (@NGS_samples) {
	1052	my $f_cpu = "$pwd/$t_sample_id/WF.cpu";
	1053	open(CPUOUT, "> $f_cpu") \|\| die "Can not open $f_cpu";
	1054	print CPUOUT "#job_name\tCores\tWall(s)\tWall_time\tCPU(s)\tCPU_time\n";
	1055	my $min_start = 1402092131 * 999999;
	1056	my $max_end = 0;
	1057	my $sum_cpu = 0;
	1058	foreach $t_job_id (keys %NGS_batch_jobs) {
	1059	if ($subset_flag) {next unless ($subset_jobs{$t_job_id});}
	1060	my $t_job = $NGS_batch_jobs{$t_job_id};
	1061	my $t_core = $t_job->{"cores_per_cmd"} * $t_job->{"no_parallel"};
	1062
	1063	my $t_sample_job = $job_list{$t_job_id}{$t_sample_id};
	1064	my $f_start = $t_sample_job->{'start_file'};
	1065	my $f_complete = $t_sample_job->{'complete_file'};
	1066	my $f_cpu = $t_sample_job->{'cpu_file'};
	1067	my $t_start = `cat $f_start`; $t_start =~ s/\s//g; $min_start = $t_start if ($t_start < $min_start);
	1068	my $t_end = `cat $f_complete`; $t_end =~ s/\s//g; $max_end = $t_end if ($t_end > $max_end);
	1069	my $t_wall = int($t_end - $t_start);
	1070	$t_wall = 0 unless ($t_wall>0);
	1071
	1072	my $t_cpu = 0;
	1073	if (open(TCPU, $f_cpu)) {
	1074	while($ll = <TCPU>) {
	1075	chop($ll);
	1076	if ($ll =~ /^(\d+)m(\d+)/) {
	1077	$t_cpu += $1 * 60;
	1078	}
	1079	}
	1080	close(TCPU);
	1081	}
	1082	$sum_cpu += $t_cpu;
	1083
	1084	my $t_walls = time_str1($t_wall);
	1085	my $t_cpus = time_str1($t_cpu);
	1086	print CPUOUT "$t_job_id\t$t_core\t$t_wall\t$t_walls\t$t_cpu\t$t_cpus\n";
	1087	}
	1088	my $t_wall = ($max_end - $min_start); $t_wall = 0 unless ($t_wall>0);
	1089	my $t_walls = time_str1($t_wall);
	1090	my $sum_cpus= time_str1($sum_cpu);
	1091	print CPUOUT "total\t-\t$t_wall\t$t_walls\t$sum_cpu\t$sum_cpus\n";
	1092	close(CPUOUT);
	1093	}
	1094	}
	1095	######### END task_log_cpu
	1096
	1097	sub time_str1 {
	1098	my $s = shift;
	1099	my $str = "";
	1100
	1101	$str .= int($s/3600); $str .= "h"; $s = $s % 3600;
	1102	$str .= int($s/60); $str .= "m"; $s = $s % 60;
	1103	$str .= $s; $str .= "s";
	1104
	1105	return $str;
	1106	}
	1107	########## END time_str1;
	1108
	1109
	1110
	1111
	1112
	1113
	1114	sub usage {
	1115	<<EOD;
	1116
	1117	# =============================== NG-Omics-WF ==================================
	1118	# _ _ _____ ____ _ __ ________
	1119	# \| \\ \| \|/ ____\| / __ \\ (_) \\ \\ / / ____\|
	1120	# \| \\\| \| \| __ ______\| \| \| \|_ __ ___ _ ___ ___ _____\\ \\ /\\ / /\| \|__
	1121	# \| . ` \| \| \|_ \|______\| \| \| \| '_ ` _ \\\| \|/ __/ __\|______\\ \\/ \\/ / \| __\|
	1122	# \| \|\\ \| \|__\| \| \| \|__\| \| \| \| \| \| \| \| (__\\__ \\ \\ /\\ / \| \|
	1123	# \|_\| \\_\|\\_____\| \\____/\|_\| \|_\| \|_\|_\|\\___\|___/ \\/ \\/ \|_\|
	1124	#
	1125	# =========================== Next Generation Omics data workflow tools ========
	1126
	1127	To run workflow:
	1128	$0 -s sample_file -i workflow_file
	1129
	1130	Options:
	1131
	1132	-i workflow configration file, required
	1133
	1134	-s sample data file, required unless -S is present
	1135	File format example
	1136	#Sample data file example, TAB or space delimited for following lines
	1137	Sample_ID1 sample_data_0 sample_data_1
	1138	Sample_ID2 sample_data_0 sample_data_1
	1139	Sample_ID3 sample_data_0 sample_data_1
	1140
	1141	-S sample data from command line, required unless -s is present
	1142	format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
	1143
	1144	-j run sub sets of jobs, optional, the workflow will run all jobs by default
	1145	e.g. -j qc or -j qc,fastqc
	1146
	1147	-t parameter file, optional, replace default paramters in workflow configration file
	1148	File format example
	1149	#parameter file example, TAB or space delimited for following lines
	1150	CMDOPT JobID_A:opt0:opt1:opt2
	1151	CMDOPT JobID_B:opt0:opt1
	1152
	1153	-T parameter from command line
	1154	format: JobID_A:opt0:opt1:opt2,JobID_B:opt0:opt1
	1155
	1156	-r root directory of NGS-tools
	1157
	1158	-J optional tasks
	1159	write-sh: write sh files and quite
	1160	log-cpu: gathering cpu time for each run for each sample
	1161	list-jobs: list jobs
	1162	snapshot: snapshot current job status
	1163	delete-jobs: delete jobs, must supply jobs delete syntax by option -Z
	1164	e.g. -J delete-jobs -Z jobids:assembly,blast ---delete assembly,blast and all jobs depends on them
	1165	-J delete-jobs -Z run_after:filename ---delete jobs that has start time (WF.start.date) after this file, and all depending jobs
	1166
	1167	-Z secondary parameter used by other options, such as -J
	1168
	1169	-Q queue system, default SGE
	1170	can be PBS, SGE
	1171
	1172	Question and comments:
	1173	http://weizhongli-lab.org/ngomicswf liwz\@sdsc.edu
	1174
	1175	EOD
	1176	}
	1177
	1178
	1179
	1180	############################################################################################
	1181	# _______ ________ _________ ___________________ ________ .____ _________
	1182	# \ \ / _____/ / _____/ \__ ___/\_____ \ \_____ \ \| \| / _____/
	1183	# / \| \/ \ ___ \_____ \ ______ \| \| / \| \ / \| \\| \| \_____ \
	1184	#/ \| \ \_\ \/ \ /_____/ \| \| / \| \/ \| \ \|___ / \
	1185	#\____\|__ /\______ /_______ / \|____\| \_______ /\_______ /_______ \/_______ /
	1186	# \/ \/ \/ \/ \/ \/ \/
	1187	############################################################################################
	1188

+200

-0

usecases/Miseq-16S/README less more

	0	CD-HIT usecases: CD-HIT-OTU-MiSeq (http://cd-hit.org)
	1
	2	Please also check https://github.com/weizhongli/cdhit/wiki,
	3	which offers most up-to-date documents.
	4
	5
	6	================================================================================================
	7	Introduction of CD-HIT-OTU-MiSeq
	8	================================================================================================
	9	This use case is developed for clustering 16S rDNA sequences sequenced with MiSeq
	10	platform into OTUs for microbiome studies.
	11	In recent years, Illumina MiSeq sequencers became dominant in 16S rDNA sequencing. The
	12	Paired End (PE) reads need to be assembled first. However, many reads cannot be accurately
	13	assembled because of the poor quality at the 3’ ends of both PE reads in the overlapping region.
	14	This causes many sequences to be discarded in the analysis. CD-HIT-OTU-MiSeq has unique
	15	features to cluster MiSeq 16S sequences.
	16
	17	* The package can cluster PE reads without joining them into contigs.
	18	* Users can choose a high quality portion of the PE reads for analysis
	19	(e.g. first 200 / 150 bases from forward / reverse reads), according to base quality profile.
	20	* We implemented a tool that can splice out the target region (e.g. V3-V4) from a full-length
	21	16S reference database into the PE sequences. CD-HIT-OTU-MiSeq can cluster the spliced PE
	22	reference database together with samples, so we can derive Operational Taxonomic Units (OTUs)
	23	and annotate these OTUs concurrently.
	24	* Chimeric sequences are effectively identified through a de novo approach.
	25
	26	The most important unique feature of CD-HIT-OTU-MiSeq is to only use high quality region at
	27	the 5’ ends of R1 and R2 reads. For example, the effective clustering read length can be 200 bases
	28	for R1 and 150 bases for R2. The effective portions of PE reads are clustered together with
	29	spliced PE sequences from the reference database to derive OTUs (Figure).
	30
	31
	32	================================================================================================
	33	Installation
	34	================================================================================================
	35	1. Install CD-HIT package
	36	* download current CD-HIT at https://github.com/weizhongli/cdhit/releases,
	37	for example cd-hit-v4.6.2-2015-0511.tar.gz
	38	* unpack the file with “tar xvf cd-hit-v4.6.2-2015-0511.tar.gz --gunzip”
	39	* change dir by “cd cd-hit-v4.6.2-2015-0511”
	40	* compile the programs by “make” with multi-threading (default),
	41	or by “make openmp=no” without multi-threading (on old systems without OpenMP)
	42	* cd cd-hit-auxtools
	43	* compile cd-hit-auxtools by “make”
	44	* CD-HIT-OTU-MiSeq scripts are inside a folder like cd-hit-v4.6.2-2015-0511/usecases/Miseq-16S
	45
	46
	47	2. Install Trimmomatic
	48	CD-HIT-OTU-MiSeq uses Trimmomatic for sequence quality control. It can be downloaded from
	49	http://www.usadellab.org/cms/?page=trimmomatic or https://github.com/timflutre/trimmomatic.
	50	We also have a copy at http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
	51
	52
	53	3. Modify NG-Omics-Miseq-16S.pl
	54	Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
	55	$CD_HIT_dir = "PATH_to_cd-hit";
	56	$NGS_prog_trimmomatic = "PATH_to_trimmomatic/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
	57
	58	4. Download reference dataset
	59	Reference database can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
	60	The reference database Greengene-13-5-99.fasta.gz was re-formatted from original Greengene database,
	61	so that sequences with more specific annotations are at the beginning of the file. Please gunzip after
	62	download.
	63
	64	You can also download Greengene directly. You should download Greengene from
	65	http://greengenes.secondgenome.com/downloads, or ftp://greengenes.microbio.me/.
	66	Please download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file.
	67	You may find gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta.
	68
	69	There is a script: usecases/Miseq-16S/greengene-ann1.pl, please run this script to re-format greengene:
	70	PATH_to_cd-hit/usecases/Miseq-16S/greengene-ann1.pl -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o Greengene-13-5-99.fasta
	71
	72	5. Download sample datasets
	73	Sample datasets can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
	74	The Miseq-otu-example.tar.gz contains two Miseq 16S samples. You can download and unpack to test.
	75
	76
	77	================================================================================================
	78	Usage of CD-HIT-OTU-MiSeq
	79	================================================================================================
	80	1. Prepare fastq files and sample file
	81	Most projects have multiple samples sequenced at the same variable regions.
	82	After your samples are sequenced, your sequencing center should give you two paired-end fastq files
	83	for each sample. Put them in a working directory in a similar way as the testing datasets,
	84	where the R1.fq and R2.fq are placed in a folder for each sample. The folder name is the sample name.
	85	So in the working directory, you should have files:
	86
	87	sample_name_1/R1.fq
	88	sample_name_1/R2.fq
	89	sample_name_2/R1.fq
	90	sample_name_2/R2.fq
	91	...
	92	sample_name_N/R1.fq
	93	sample_name_N/R2.fq
	94
	95
	96	2. Prepare sample file
	97	Next is to prepare a SAMPLE_file, a text file, in the working directory. The file should look like:
	98
	99	sample_name_1 R1.fq R2.fq
	100	sample_name_2 R1.fq R2.fq
	101	...
	102	sample_name_N R1.fq R2.fq
	103
	104
	105	3. Prepare reference database
	106	We implemented a tool that can splice out the target amplicon region (e.g. V3-V4) from a
	107	full-length 16S rRNA reference sequence database, such as Greengene, RDP and Silva,
	108	into PE sequences. If there are multiple samples in a project sequenced with the same
	109	amplicon of same variable region, only one spliced reference database is needed.
	110	Please run:
	111
	112	Path_to_cd-hit_dir/usecases/Miseq-16S/16S-ref-db-PE-splice.pl -i sample_name_1/R1.fq -j sample_name_2/R2.fq -d Greengene-13-5-99.fasta -o gg_13_5-PE99.150-100 -p 150 -q 100 -c 0.99
	113
	114	Where Greengene-13-5-99.fasta is our re-formatted Greengene sequence file.
	115	-p 150 specify the effective clustering read length for R1 to be 150
	116	-q 100 specify the effective clustering read length for R2 to be 100
	117	-p and -q options need to be consistent with the parameters used for OTU clustering in step 4
	118	see next section for suggestions on choosing the effective clustering read length
	119
	120	This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE99.150-100-R2.
	121
	122
	123	4. Run sequence QC and OTU clustering for each sample
	124	In the working directory, run
	125
	126	PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
	127
	128	where: 150 and 100 are the effective lengths,
	129	see next section for suggestions on choosing the effective clustering read length
	130	0.97 is the OTU clustering cutoff,
	131	0.0001 is the abundance cutoff,
	132	75 is the length for chimeric checking at each R1 and R2 read
	133	PATH_to-gg_13_5-PE99.150-100-R1 and PATH_to-gg_13_5-PE99.150-100-R2 need to be full path
	134	e.g. /home/user/myproj/PATH_to-gg_13_5-PE99.150-100-R1
	135
	136	This command will generate shell scripts for QC and for OTU for each sample.
	137	The scripts will be in WF-sh folder. You can first run all the qc.sample_name.sh and after all
	138	these jobs finished you then run all otu.sample_name.sh
	139
	140	NG-Omics-WF.pl https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
	141	tool developed in our group. It is not fully released yet, since we need more time to document
	142	this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples.
	143	First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36 to match the
	144	number of CPU cores of your computer, then run
	145
	146	nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
	147
	148	After the job finished, the OTU results will be in sample_name/otu folder, important files include
	149	OTU.clstr: file lists all clusters and sequences
	150	chimeric-small-clusters-list.txt: list of chimeric reads and low abundance reads not used
	151
	152
	153	5. Pool all samples together
	154	If you have multiple samples, you don't just want to stop here. It is important
	155	to pool all samples together and re-run OTU clustering so that all samples can be
	156	compared, run
	157
	158	PATH_to_cd-hit-dir/usecases/pool_samples.pl -s SAMPLE_file -o pooled
	159
	160	This will pool sequences from all samples. We can handle hundreds of samples or more without problems.
	161
	162
	163	6. Cluster pooled samples, run
	164
	165	PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
	166
	167	This command will generate a script WF-sh/otu-pooled.pooled.sh, you can
	168	run this sh script. When it is finished, OTUs will be in the pooled directory:
	169	OTU.clstr: file listing all clusters and sequences from all samples in CD-HIT format
	170	OTU.txt: spreadsheet listing the number of sequences in each OTU for each sample; it also shows the annotation for each OTU.
	171	chimeric-small-clusters-list.txt: list of chimeric reads and low abundance reads not used
	172
	173
	174	================================================================================================
	175	Choose effective clustering read length
	176	================================================================================================
	177	The key of this method is to use the high quality portion of reads from both R1 and R2, so how
	178	to choose effective clustering read length depends on the actual quality of the PE reads. In our
	179	paper five pairs of effective clustering read lengths (225, 175), (200, 150), (175, 125),
	180	(150, 100) and (125, 75) were selected for samples sequenced at V34 or V45.
	181	Two pairs of effective clustering read lengths (150, 100) and (125, 75) were used for
	182	samples of V4 region. All these settings gave good results.
	183
	184	You can try some different settings and compare the results. Also, programs such as FASTQC
	185	(http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) can be used to scan the raw reads
	186	to help choose the effective clustering read length of R1 and R2.
	187
	188
	189
	190
	191	================================================================================================
	192	Other topics
	193	================================================================================================
	194
	195	Questions, comments to the author Weizhong Li, liwz@sdsc.edu
	196
	197
	198
	199

+222

-0

usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl less more

	0	#!/usr/bin/perl
	1
	2	use Getopt::Std;
	3	my $script_name = $0;
	4	my $script_dir = $0;
	5	$script_dir =~ s/[^\/]+$//;
	6	chop($script_dir);
	7	$script_dir = "./" unless ($script_dir);
	8
	9	getopts("i:j:o:p:c:s:t:m:e:Z:a:f:d:R:",\%opts);
	10	die usage() unless ($opts{i} and $opts{o});
	11
	12	my $input = $opts{i};
	13	my $input2 = $opts{j};
	14	my $dir = $opts{o};
	15	my $abs_cutoff = $opts{a}; $abs_cutoff = 0.00005 unless ($abs_cutoff); #5e-5
	16	my $otu_cutoff = $opts{c}; $otu_cutoff = 0.97 unless ($otu_cutoff);
	17	my $chimera_f = $opts{m}; $chimera_f = "true" unless ($chimera_f);
	18	my $debug_mode = $opts{Z};
	19	my $fast_mode = $opts{f}; #### use cd-hit-dup for stage 1 and 2 clustering
	20	my $cdhit_opt = $opts{d};
	21	my $restart_n = $opts{R}; $restart_n = 0 unless (defined($restart_n));
	22	my $LOGf = "$dir/OTU.log";
	23	my $cd_hit_dup = "$script_dir/../../cd-hit-auxtools/cd-hit-dup"; die "no $cd_hit_dup" unless (-e $cd_hit_dup);
	24	my $cd_hit_est = "$script_dir/../../cd-hit-est"; die "no $cd_hit_est" unless (-e $cd_hit_est);
	25
	26	my ($i, $j, $k, $str, $cmd, $ll);
	27	$cmd = `mkdir -p $dir`;
	28	open(LOG, "> $LOGf") \|\| die "can not write to $LOGf";
	29	my $f2 = "$dir/seq";
	30
	31	################################################################################
	32	#### Stage 0 ----------- clustering at 100% - stage 0
	33	################################################################################
	34	my $clstr = "$f2.dup.clstr";
	35	my $clstr2 = "$f2.dup2.clstr";
	36	if ($restart_n <= 0) {
	37	nice_run("$cd_hit_dup -i $input -i2 $input2 -o $f2.dup -o2 $f2.dup.2 -u 100 -d 0 -m false -f $chimera_f > $f2.dup2.log");
	38	nice_run("cat $f2.dup.clstr $f2.dup2.clstr > $f2-stage0.clstr.tmp");
	39	nice_run("$script_dir/cd-hit/clstr_sort_by.pl < $f2-stage0.clstr.tmp > $f2-stage0.clstr; rm -f $f2-stage0.clstr.tmp");
	40	nice_run("$script_dir/clstr_sort_rep.pl $f2-stage0.clstr $input > $f2-stage0-rep.fa");
	41	#
	42	# /home/oasis/data/etc/git/cdhit/cd-hit-auxtools/cd-hit-dup -i qc/R1.fa -i2 qc/R2.fa -o otu/seq.dup -o2 otu/seq.dup.2 -u 100 -d 0 -f true > otu/seq.dup.log # no work
	43	# /home/oasis/data/etc/git/cdhit/cd-hit-auxtools/cd-hit-dup -i qc/R1.fa -i2 qc/R2.fa -o otu/seq.dup -o2 otu/seq.dup.2 -u 100 -d 0 > otu/seq.dup.log
	44	#
	45	# what if cd-hit-est
	46	# /home/oasis/data/etc/git/cdhit/cd-hit-est -i qc/R1.fa -j qc/R2.fa -o otu/seq.nr -op otu/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 -cx 100 -cy 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.log
	47	# /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr -o otu/seq.nr.R1 -r 0 -cx 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.R1.log
	48	# /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr.2 -o otu/seq.nr.R2 -r 0 -cx 100 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.nr.R2.log
	49
	50	# /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.nr -j otu/seq.nr.2 -o otu/seq.99 -op otu/seq.99.2 -P 1 -r 0 -cx 100 -cy 100 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > otu/seq.99.log
	51	# /home/oasis/data/etc/git/cdhit/cd-hit-est -i otu/seq.99 -j otu/seq.99.2 -o otu/seq.97 -op otu/seq.97.2 -P 1 -r 0 -cx 100 -cy 100 -c 0.97 -n 10 -G 1 -b 5 -T 1 -M 8000 -d 0 -p 1 > otu/seq.97.log
	52	# do not sort 99.clstr, always trust cd-hit-dup ordered sequences
	53	# /home/oasis/data/etc/git/cdhit/clstr_rev.pl otu/seq.nr.clstr otu/seq.99.clstr \| /home/oasis/data/etc/git/cdhit/clstr_sort_by.pl > otu/seq.99-full.clstr
	54	# /home/oasis/data/etc/git/cdhit/clstr_rev.pl otu/seq.99-full.clstr otu/seq.97.clstr \| /home/oasis/data/etc/git/cdhit/clstr_sort_by.pl > otu/seq.97-full.clstr
	55	#
	56	# combine ref
	57	# /home/oasis/data/etc/git/cdhit/cd-hit-est -i seq.99.wref.R1 -o seq.97.wref.R1only -r 0 -cx 100 -c 0.97 -n 10 -b 5 -T 1 -M 8000 -d 1 -p 1 -G 0 -A 50 -g 1
	58	#
	59	}
	60	if (not $debug_mode) {
	61	my $no1 = count_seqs_from_fasta_file($input);
	62	my $no_clstr = count_clstrs_from_clstr_file($clstr);
	63	my $no_clstr2 = count_clstrs_from_clstr_file($clstr2);
	64	print LOG "Number_contigs\t$no1\n";
	65	print LOG "Number_unique_contigs\t$no_clstr\n";
	66	print LOG "Number_unique_chimaric_contigs\t$no_clstr2\n";
	67	}
	68
	69	################################################################################
	70	#### Stage 1 ---------- clustering at 99.25% #### distance 0.75%
	71	################################################################################
	72	my $seq_n = `grep -c "^>" $input`; $seq_n =~ s/\D//g;
	73	my $cutoff = int($seq_n * $abs_cutoff);
	74	my $c1 = 0.9925;
	75	if ($restart_n <= 1) {
	76	if ($fast_mode) {
	77	nice_run("$script_dir/cd-hit-auxtools/cd-hit-dup -i $f2-stage0-rep.fa -o $f2-stage1 -d 0 -m false -e 3 > $f2-stage1.log");
	78	}
	79	else {
	80	nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage0-rep.fa -o $f2-stage1 -c $c1 -n 10 -l 11 -p 1 -d 0 -g 1 -b 3 $cdhit_opt > $f2-stage1.log");
	81	}
	82	nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage0.clstr $f2-stage1.clstr \| $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage1-all.clstr");
	83	nice_run("$script_dir/clstr_sort_rep.pl $f2-stage1-all.clstr $f2-stage1 > $f2-stage1-rep.fa");
	84	}
	85	if (not $debug_mode) {
	86	$no_clstr = count_clstrs_from_clstr_file("$f2-stage1.clstr");
	87	print LOG "Stage1 clustering at $c1\n";
	88	print LOG "Number_clusters_stage1\t$no_clstr\n";
	89	}
	90
	91	################################################################################
	92	#### Stage 2 ---------- clustering at 98.50% #### distance 1.50%
	93	################################################################################
	94	$c1 = 0.985;
	95	if ($restart_n <= 2) {
	96	if ($fast_mode) {
	97	nice_run("$script_dir/cd-hit-auxtools/cd-hit-dup -i $f2-stage1-rep.fa -o $f2-stage2 -d 0 -m false -e 6 > $f2-stage2.log");
	98	}
	99	else {
	100	nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage1-rep.fa -o $f2-stage2 -c $c1 -n 10 -l 11 -p 1 -d 0 -g 1 -b 3 $cdhit_opt > $f2-stage2.log");
	101	}
	102	nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage1-all.clstr $f2-stage2.clstr \| $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage2-all.clstr");
	103	nice_run("$script_dir/clstr_sort_rep.pl $f2-stage2-all.clstr $f2-stage2 > $f2-stage2-rep.fa");
	104	}
	105	if (not $debug_mode) {
	106	$no_clstr = count_clstrs_from_clstr_file("$f2-stage2.clstr");
	107	print LOG "Stage2 clustering at $c1\n";
	108	print LOG "Number_clusters_stage2\t$no_clstr\n";
	109	}
	110
	111
	112	################################################################################
	113	#### Stage pre-3 ---------- filtering
	114	################################################################################
	115
	116	if ($restart_n <= 3) {
	117	nice_run("$script_dir/clstr_select_rep.pl size $cutoff 999999999 < $f2-stage2-all.clstr > $f2-stage2-rep-big.ids");
	118	nice_run("$script_dir/fetch_fasta_by_ids.pl $f2-stage2-rep-big.ids $f2-stage2-rep.fa > $f2-stage2-rep-big.fa");
	119	nice_run("$script_dir/fetch_fasta_exclude_ids.pl $f2-stage2-rep-big.ids $f2-stage2-rep.fa > $f2-stage2-rep-small.fa");
	120
	121	if (-s $clstr2) {
	122	nice_run("$script_dir/clstr_select_rep.pl size 1 999999999 < $clstr2 > $dir/chimaric.ids"); ## save chimaric ids
	123	nice_run("$script_dir/fetch_fasta_exclude_ids.pl $dir/chimaric.ids $f2-stage2-rep-big.fa > $f2-stage2-rep-big-good.fa"); ## exclude chimaric reads from $t1-pri-rep.fa
	124	nice_run("rm -f $f2-stage2-rep-big.fa");
	125
	126	nice_run("$script_dir/fetch_fasta_exclude_ids.pl $dir/chimaric.ids $f2-stage2-rep-small.fa > $f2-stage2-rep-small-good.fa");
	127	nice_run("rm -f $f2-stage2-rep-small.fa");
	128	}
	129	else {
	130	nice_run("mv $f2-stage2-rep-big.fa $f2-stage2-rep-big-good.fa");
	131	nice_run("mv $f2-stage2-rep-small.fa $f2-stage2-rep-small-good.fa");
	132	}
	133	}
	134
	135	if (not $debug_mode) {
	136	print LOG "Min_clstr_size\t$cutoff\n";
	137	my $no_seq = count_seqs_from_fasta_file("$f2-stage2-rep-big-good.fa");
	138	print LOG "Number_clstrs_above_min_size\t$no_seq\n";
	139	}
	140
	141	################################################################################
	142	#### Stage 3 ---------- clustering at 97%
	143	################################################################################
	144	$c1 = $otu_cutoff;
	145	if ($restart_n <= 3) {
	146	nice_run("$script_dir/cd-hit/cd-hit-est -i $f2-stage2-rep-big-good.fa -o $f2-stage3 -c $c1 -n 8 -l 11 -p 1 -d 0 -g 1 -b 5 $cdhit_opt > $f2-stage3.log");
	147	nice_run("$script_dir/cd-hit/clstr_rev.pl $f2-stage2-all.clstr $f2-stage3.clstr \| $script_dir/cd-hit/clstr_sort_by.pl > $f2-stage3-all.clstr");
	148	nice_run("$script_dir/clstr_sort_rep.pl $f2-stage3-all.clstr $f2-stage3 > $f2-stage3-rep.fa");
	149	nice_run("mv -f $f2-stage3-all.clstr $dir/OTU.clstr");
	150	nice_run("$script_dir/cd-hit-otu-table-faa.pl -i $dir/OTU.clstr -s $f2-stage3-rep.fa -o $dir/OTU-dist.txt -f $dir/OTU.fa");
	151	}
	152
	153	if (not $debug_mode) {
	154	$no_clstr = count_clstrs_from_clstr_file("$dir/OTU.clstr");
	155	$no_seq = count_seqs_from_clstr_file("$dir/OTU.clstr");
	156	print LOG "OTU clustering at $c1\n";
	157	print LOG "Number_OTUs\t$no_clstr\n";
	158	print LOG "Number_seqs_in_OTUs\t$no_seq\n";
	159	my ($tu,$ts,$cu,$cs)=times(); my $tt=$tu+$ts+$cu+$cs;
	160	print LOG "Total_CPU_time\t$tt\n";
	161	}
	162	close(LOG);
	163
	164
	165	sub usage {
	166	<<EOF
	167	Usage:
	168	$script_name -i contig_fasta_file -o output_dir -a abundance_cutoff -c OTU_cutoff -m check_chimera_flag
	169
	170	Options:
	171	-i input fasta file of contig
	172	-o output dir
	173	-c OTU cutoff, default 0.97
	174	-m whether to perform chimera checking (true/false), default true
	175	-a abundance cutoff, default 0.00005
	176	small clusters < this size will be considiered as noise and will be removed
	177	if total input sequence is 50,000, then clusters < 2 (i.e. singletons) are removed
	178	-f 1 or 0, default 0
	179	if set to 1, then use cd-hit-dup instead of cd-hit-est for stage 1 and 2 clustering
	180	which is very fast
	181	-R restart flag, if re-run at different abundance cutoff value or something,
	182	with this parameter, program can skip the first n step and restart at certain step
	183	values:
	184	0 default, start from the scratch cd-hit-dup
	185	1 cd-hit-est at 99.25
	186	2 cd-hit-est at 98.75
	187	3 filtering and cd-hit-est at 97%
	188
	189	EOF
	190	}
	191	###### END usage
	192
	193	sub nice_run {
	194	my $str = shift;
	195	print STDERR "$str\n";
	196	my $cmd = `$str` unless ($debug_mode);
	197	return $cmd;
	198	}
	199	##########
	200
	201	sub count_clstrs_from_clstr_file {
	202	my $clstr = shift;
	203	my $n = `grep -c "^>" $clstr`;
	204	$n =~ s/\s//g;
	205	return $n;
	206	}
	207
	208	sub count_seqs_from_clstr_file {
	209	my $clstr = shift;
	210	my $n = `grep -cv "^>" $clstr`;
	211	$n =~ s/\s//g;
	212	return $n;
	213	}
	214
	215	sub count_seqs_from_fasta_file {
	216	my $faa = shift;
	217	my $n = `grep -c "^>" $faa`;
	218	$n =~ s/\s//g;
	219	return $n;
	220	}
	221

+82

-0

usecases/Miseq-16S/clstr_2_OTU_table.pl less more

	0	#!/usr/bin/perl
	1	#
	2	use Getopt::Std;
	3	getopts("i:s:S:o:f:j:",\%opts);
	4
	5	my $input = $opts{i}; $input = "OTU.clstr" unless $input;
	6	my $output = $opts{o}; $output = "OTU.txt" unless ($output);
	7	my ($i, $j, $k, $str, $cmd, $ll);
	8
	9	my %count = ();
	10	my %count_t = ();
	11	my %count_s = ();
	12	my $OTU_2_ann = ();
	13	my $tree_flag = 0; #### for greengene header format
	14	# >4360486\|k__Bacteria;.p__Firmicutes;.c__Clostridia;.o__Clostridiales;.f__Lachnospiraceae;.g__Roseburia;.s__faecis
	15	open(TMP, $input) \|\| die "can not open $input";
	16	my $OTU=0;
	17	while($ll=<TMP>){
	18	if ($ll =~ /^>/) {
	19	$OTU++;
	20	}
	21	else {
	22	chop($ll);
	23	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	24	my $id = $2;
	25	if ($id =~ /^Sample\\|([^\\|]+)\\|/) {
	26	$sample_id = $1;
	27	$sample_id{$sample_id}=1;
	28	$count{$OTU}{$sample_id}++;
	29	$count_t{$OTU}++;
	30	$count_s{$sample_id}++;
	31	}
	32	else {
	33	$OTU_2_ann{$OTU} = $id;
	34	$tree_flag = 1 if ($id =~ /\\|k__Bacteria;.p__/);
	35	}
	36	}
	37	else {
	38	die "format error $ll";
	39	}
	40	}
	41	}
	42	close(TMP);
	43
	44	my @sample_ids = sort keys %sample_id;
	45
	46	open(OUT1, "> $output") \|\| die "can not write $output";
	47	print OUT1 "OTU";
	48	foreach $sample_id (@sample_ids){
	49	print OUT1 "\t$sample_id";
	50	}
	51	if ($tree_flag) {
	52	print OUT1 "\t", join("\t", qw/Kingdom Phylum Class Order Family Genus Species/);
	53	}
	54	#print OUT1 "\tTotal\n";
	55	print OUT1 "\tAnnotation\n";
	56
	57	for ($i=1; $i<=$OTU; $i++){
	58	$ann = "None";
	59	if ($OTU_2_ann{$i}) { $ann = $OTU_2_ann{$i}; }
	60	print OUT1 "OTU$i";
	61	foreach $sample_id (@sample_ids){
	62	$k = $count{$i}{$sample_id}? $count{$i}{$sample_id} : 0;
	63	print OUT1 "\t$k";
	64	}
	65	if ($tree_flag) {
	66	my ($tax_k, $tax_p, $tax_c, $tax_o, $tax_f, $tax_g, $tax_s);
	67	if ($ann =~ /k__(\w+)/) {$tax_k = $1} else {$tax_k = "";}
	68	if ($ann =~ /p__(\w+)/) {$tax_p = $1} else {$tax_p = "";}
	69	if ($ann =~ /c__(\w+)/) {$tax_c = $1} else {$tax_c = "";}
	70	if ($ann =~ /o__(\w+)/) {$tax_o = $1} else {$tax_o = "";}
	71	if ($ann =~ /f__(\w+)/) {$tax_f = $1} else {$tax_f = "";}
	72	if ($ann =~ /g__(\w+)/) {$tax_g = $1} else {$tax_g = "";}
	73	if ($ann =~ /s__(\w+)/) {$tax_s = $1} else {$tax_s = "";}
	74	print OUT1 "\t", join("\t", ($tax_k, $tax_p, $tax_c, $tax_o, $tax_f, $tax_g, $tax_s));
	75	}
	76	#print OUT1 "\t$count_t{$i}";
	77	print OUT1 "\t$ann\n";
	78	}
	79	close(OUT1);
	80
	81

+237

-0

usecases/Miseq-16S/filter-chimeric-and-small.pl less more

	0	#!/usr/bin/perl
	1
	2	use Getopt::Std;
	3	my $script_name = $0;
	4	my $script_dir = $0;
	5	$script_dir =~ s/[^\/]+$//;
	6	chop($script_dir);
	7	$script_dir = "./" unless ($script_dir);
	8
	9	getopts("k:i:j:o:p:c:s:t:m:e:Z:a:f:d:R:g:",\%opts);
	10	die usage() unless ($opts{k} and $opts{i} and $opts{j} and $opts{a} and $opts{f} and $opts{g} and $opts{o});
	11
	12	my $input0 = $opts{k}; ## nr.clstr
	13	my $input = $opts{i}; ## R1 only clstr
	14	my $input2 = $opts{j}; ## R2 only clstr
	15	my $clstr_99 = $opts{a}; ## seq.99.clstr #### can be any 2nd -preclustering e.g. 98.5%
	16	my $seq_99 = $opts{f}; ## seq.99 - fasta file R1
	17	my $seq_992 = $opts{g}; ## seq.99 - fasta file R2
	18	my $output = $opts{o}; ## seq.99f
	19	my $abs_cutoff = $opts{c}; $abs_cutoff = 0.0001 unless ($abs_cutoff);
	20	my $output_2 = "$output.2"; ## seq.99f.2 -- R2
	21	my $output_cls = "$output.clstr"; ## seq.99f.clstr
	22	my $output_log = "$output.log"; ## seq.99f.log
	23
	24	my ($i, $j, $k, $str, $cmd, $ll);
	25
	26	my $num_total_seq;
	27	my %seq_nr_size;
	28	my %seqs_of_nr;
	29	open(LOG, "> $output_log") \|\| die "can not open $output_log";
	30	open(TMP, $input0) \|\| die "can not open $input0";
	31	if (1) {
	32	my $rep;
	33	while($ll=<TMP>){
	34	if ($ll =~ /^>/) {
	35	$rep = "";
	36	}
	37	else {
	38	chop($ll);
	39	my $id;
	40	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	41	$id = $2;
	42	$num_total_seq++;
	43	if ($ll =~ /\*$/) { $rep=$id; $seq_nr_size{$rep}=0; $seqs_of_nr{$rep} = [];}
	44	$seq_nr_size{$rep}++ if ($rep);
	45	push(@{$seqs_of_nr{$rep}}, $id) if ($rep);
	46	}
	47	}
	48	}
	49	}
	50	close(TMP);
	51
	52	my %seq_R1_clstr;
	53	my %seq_R2_clstr;
	54	foreach my $f (($input, $input2)) {
	55	open(TMP, $f) \|\| die "can not open $f";
	56	my $rep;
	57
	58	while($ll=<TMP>){
	59	if ($ll =~ /^>/) {
	60	$rep = "";
	61	}
	62	else {
	63	chop($ll);
	64	my $id;
	65	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	66	$id = $2;
	67	if ($ll =~ /\*$/) {
	68	$rep=$id;
	69	}
	70	if ($rep) {
	71	if ($f eq $input) { $seq_R1_clstr{$id} = $rep;}
	72	else { $seq_R2_clstr{$id} = $rep;}
	73	}
	74	}
	75	}
	76	}
	77	close(TMP);
	78	}
	79
	80	#### open $clstr_99 first time
	81	open(TMP, $clstr_99) \|\| die "can not open $clstr_99";
	82	%rep_2_otu = ();
	83	$OTU = -1;
	84	while($ll=<TMP>){
	85	if ($ll =~ /^>/) {
	86	$OTU++;
	87	}
	88	else {
	89	my $id;
	90	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	91	$id = $2;
	92	$rep_2_otu{$id} = $OTU;
	93	}
	94	}
	95	}
	96	close(TMP);
	97
	98	my %chimeric_ids = ();
	99	#### those ids are candidates, if they are recurited by other non-chimeric clusters,
	100	#### then they are not chimeric anymore
	101	foreach $i (keys %seq_R1_clstr) {
	102	my $rep1 = $seq_R1_clstr{$i};
	103	my $rep2 = $seq_R2_clstr{$i};
	104
	105	next if ($rep1 eq $rep2);
	106	next unless ($seq_nr_size{$rep1} >= $seq_nr_size{$i}*2);
	107	next unless ($seq_nr_size{$rep2} >= $seq_nr_size{$i}*2);
	108
	109	my $OTU1 = $rep_2_otu{$rep1};
	110	my $OTU2 = $rep_2_otu{$rep2};
	111	next if ($OTU1 eq $OTU2);
	112	$chimeric_ids{$i} = 1;
	113	}
	114
	115	#### parse seq.99.clstr
	116	my $cutoff_clstr_size = int($num_total_seq * $abs_cutoff);
	117	$cutoff_clstr_size = 1 unless ($cutoff_clstr_size >= 1); #### singleton will be removed
	118	#print LOG "cutoff_clstr_size\t$cutoff_clstr_size\n";
	119
	120	open(TMP, $clstr_99) \|\| die "can not open $clstr_99";
	121	open(OUT, "> $output_cls") \|\| die "can not write to $output_cls";
	122	my %good_ids = ();
	123	my @seqs_this_cls = ();
	124	if (1) {
	125	my $clstr_txt = "";
	126	my $clstr_size = 0;
	127	my $rep;
	128
	129	while($ll=<TMP>){
	130	if ($ll =~ /^>/) {
	131	if ($clstr_txt) {
	132	if (($clstr_size > $cutoff_clstr_size) and (not $chimeric_ids{$rep})) {
	133	print OUT $clstr_txt;
	134	$good_ids{$rep} = 1;
	135	}
	136	elsif ( $chimeric_ids{$rep} ) {
	137	foreach $j (@seqs_this_cls) {
	138	foreach $i ( @{ $seqs_of_nr{$j} } ) {
	139	print LOG "$i\tChimeric_cluster\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\tOTU1:$rep_2_otu{$seq_R1_clstr{$rep}}\tOTU2:$rep_2_otu{$seq_R2_clstr{$rep}}\n";
	140	}
	141	}
	142	}
	143	else {
	144	foreach $j (@seqs_this_cls) {
	145	foreach $i ( @{ $seqs_of_nr{$j} } ) {
	146	print LOG "$i\tSmall_cluster\t$rep\t$clstr_size\n";
	147	}
	148	}
	149	}
	150	}
	151	$clstr_size = 0;
	152	$clstr_txt = $ll;
	153	$rep = "";
	154	@seqs_this_cls=();
	155	}
	156	else {
	157	$clstr_txt .= $ll;
	158	chop($ll);
	159	my $id;
	160	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	161	$id = $2;
	162	$clstr_size += $seq_nr_size{$id};
	163	$rep=$id if ($ll =~ /\*$/);
	164	push(@seqs_this_cls, $id);
	165	}
	166	}
	167	}
	168	if ($clstr_txt) {
	169	if (($clstr_size > $cutoff_clstr_size) and (not $chimeric_ids{$rep})) {
	170	print OUT $clstr_txt;
	171	$good_ids{$rep} = 1;
	172	}
	173	elsif ( $chimeric_ids{$rep} ) {
	174	foreach $j (@seqs_this_cls) {
	175	foreach $i ( @{ $seqs_of_nr{$j} } ) {
	176	print LOG "$i\tChimeric_cluster\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\tOTU1:$rep_2_otu{$seq_R1_clstr{$rep}}\tOTU2:$rep_2_otu{$seq_R2_clstr{$rep}}\n";
	177	}
	178	}
	179	}
	180	else {
	181	foreach $j (@seqs_this_cls) {
	182	foreach $i ( @{ $seqs_of_nr{$j} } ) {
	183	print LOG "$i\tSmall_cluster\t$rep\t$clstr_size\n";
	184	}
	185	}
	186	}
	187	}
	188	}
	189	close(TMP);
	190	close(OUT);
	191
	192	foreach my $f (($seq_99, $seq_992)) {
	193	my $fout = ($f eq $seq_99) ? $output : $output_2;
	194
	195	open(TMP, $f) \|\| die "can not open $f";
	196	open(OUT, ">$fout") \|\| die "can not write to $fout";
	197
	198	my $flag = 0;
	199	while($ll = <TMP>) {
	200	if ($ll =~ /^>/) {
	201	$gi = substr($ll,1);
	202	chop($gi);
	203	$gi =~ s/\s.+$//;
	204	$flag = ( $good_ids{$gi} ) ? 1 : 0;
	205	}
	206	print OUT $ll if ($flag);
	207	}
	208
	209	close(TMP);
	210	close(OUT);
	211	}
	212
	213
	214	close(LOG);
	215
	216	sub usage {
	217	<<EOF
	218	Usage:
	219	$script_name -k seq.nr.clstr -i seq.nr.R1.clstr -j seq.nr.R2.clstr -c 0.0001 -a seq.99.clstr -f seq.99 -g seq.99.2 -o seq.99f
	220
	221	Options:
	222	-k input seq.nr.clstr
	223	-i input seq.nr.R1.clstr
	224	-j input seq.nr.R2.clstr
	225	-a input seq.99.clstr
	226	-f input seq.99
	227	-g input seq.99.2
	228	-o output
	229	-c abundance cutoff, default $abs_cutoff
	230	small clusters < this size will be considiered as noise and will be removed
	231	if total input sequence is 50,000, then clusters < 2 (i.e. singletons) are removed
	232
	233	EOF
	234	}
	235	###### END usage
	236

+207

-0

usecases/Miseq-16S/filter-chimeric-by-ref.pl less more

	0	#!/usr/bin/perl
	1
	2	use Getopt::Std;
	3	my $script_name = $0;
	4	my $script_dir = $0;
	5	$script_dir =~ s/[^\/]+$//;
	6	chop($script_dir);
	7	$script_dir = "./" unless ($script_dir);
	8
	9	getopts("k:i:j:o:p:c:s:t:m:e:Z:a:f:d:R:g:",\%opts);
	10	die usage() unless ($opts{i} and $opts{j} and $opts{a} and $opts{f} and $opts{g} and $opts{o});
	11
	12	my $input = $opts{i}; ## R1 only clstr
	13	my $input2 = $opts{j}; ## R2 only clstr
	14	my $clstr_99 = $opts{a}; ## seq.97f-full.clstr #### can be any 2nd -preclustering e.g. 98.5%
	15	my $seq_99 = $opts{f}; ## seq.99 - fasta file R1
	16	my $seq_992 = $opts{g}; ## seq.99 - fasta file R2
	17	my $output = $opts{o}; ## seq.99f
	18	my $abs_cutoff = $opts{c}; $abs_cutoff = 0.01 unless ($abs_cutoff); #### small cluster will be checked for chimeric
	19	my $output_2 = "$output.2"; ## seq.99f.2 -- R2
	20	my $output_cls = "$output.clstr"; ## seq.99f.clstr
	21	my $output_log = "$output.log"; ## seq.99f.log
	22
	23	my ($i, $j, $k, $str, $cmd, $ll);
	24
	25	my $num_total_seq;
	26	my %seq_nr_size;
	27	my %seqs_of_rep;
	28	open(LOG, "> $output_log") \|\| die "can not open $output_log";
	29	open(TMP, $clstr_99) \|\| die "can not open $clstr_99";
	30	if (1) {
	31	my $rep;
	32	while($ll=<TMP>){
	33	if ($ll =~ /^>/) {
	34	$rep = "";
	35	}
	36	else {
	37	chop($ll);
	38	my $id;
	39	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	40	$id = $2;
	41	$num_total_seq++ if ($id =~ /^Sample/);
	42	if ($ll =~ /\*$/) { $rep=$id; $seq_nr_size{$rep}=0; $seqs_of_rep{$rep} = [];}
	43	$seq_nr_size{$rep}++ if ($rep);
	44	push(@{$seqs_of_rep{$rep}}, $id) if ($rep);
	45	}
	46	}
	47	}
	48	}
	49	close(TMP);
	50
	51
	52	my %seq_R1_clstr;
	53	my %seq_R2_clstr;
	54	foreach my $f (($input, $input2)) {
	55	open(TMP, $f) \|\| die "can not open $f";
	56	my $rep;
	57
	58	while($ll=<TMP>){
	59	if ($ll =~ /^>/) {
	60	$rep = "";
	61	}
	62	else {
	63	chop($ll);
	64	my $id;
	65	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	66	$id = $2;
	67	if ($ll =~ /\*$/) {
	68	$rep=$id;
	69	}
	70	if ($rep and ($id =~ /^Sample/) ) {
	71	if ($f eq $input) { $seq_R1_clstr{$id} = $rep;}
	72	else { $seq_R2_clstr{$id} = $rep;}
	73	}
	74	}
	75	}
	76	}
	77	close(TMP);
	78	}
	79
	80	my $cutoff_clstr_size = int($num_total_seq * $abs_cutoff);
	81	$cutoff_clstr_size = 1 unless ($cutoff_clstr_size >= 1);
	82	#print LOG "cutoff_clstr_size\t$cutoff_clstr_size\n";
	83
	84	my %chimeric_ids = ();
	85	#### those ids are candidates, if they are recurited by other non-chimeric clusters,
	86	#### then they are not chimeric anymore
	87	foreach $i (keys %seq_nr_size) {
	88	next unless ($i =~ /^Sample/);
	89	my $rep1 = $seq_R1_clstr{$i};
	90	my $rep2 = $seq_R2_clstr{$i};
	91	next unless ($rep1 and $rep2);
	92
	93	next if ($rep1 eq $rep2);
	94	next if ($rep1 eq $i);
	95	next if ($rep2 eq $i);
	96	next if ($seq_nr_size{$i} > $cutoff_clstr_size);
	97	if (defined($seq_nr_size{$rep1})) { next unless ($seq_nr_size{$rep1} >= $seq_nr_size{$i}*2); }
	98	if (defined($seq_nr_size{$rep2})) { next unless ($seq_nr_size{$rep2} >= $seq_nr_size{$i}*2); }
	99
	100	$chimeric_ids{$i} = 1;
	101	}
	102
	103	#### parse seq.97fwref.clstr
	104	#### do chimeric checking for sample-only clusters
	105	open(TMP, $clstr_99) \|\| die "can not open $clstr_99";
	106	open(OUT, "> $output_cls") \|\| die "can not write to $output_cls";
	107	my %good_ids = ();
	108	if (1) {
	109	my $clstr_txt = "";
	110	my $clstr_size = 0;
	111	my $rep;
	112	my $refonly = 1;
	113
	114	while($ll=<TMP>){
	115	if ($ll =~ /^>/) {
	116	if ($clstr_txt) {
	117	if ( not $refonly ) {
	118	if (not $chimeric_ids{$rep}) {
	119	print OUT $clstr_txt;
	120	$good_ids{$rep} = 1;
	121	}
	122	elsif ( $chimeric_ids{$rep} ) {
	123	foreach $i ( @{ $seqs_of_rep{$rep} }) {
	124	print LOG "Chimeric_cluster\t$i\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\n";
	125	}
	126	}
	127	}
	128	}
	129	$clstr_size = 0;
	130	$clstr_txt = $ll;
	131	$rep = "";
	132	$refonly = 1;
	133	}
	134	else {
	135	$clstr_txt .= $ll;
	136	chop($ll);
	137	my $id;
	138	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	139	$id = $2;
	140	$clstr_size++;
	141	$rep=$id if ($ll =~ /\*$/);
	142	$refonly = 0 if ($id =~ /^Sample/);
	143	}
	144	}
	145	}
	146	if ($clstr_txt) {
	147	if ( not $refonly ) {
	148	if (not $chimeric_ids{$rep}) {
	149	print OUT $clstr_txt;
	150	$good_ids{$rep} = 1;
	151	}
	152	elsif ( $chimeric_ids{$rep} ) {
	153	foreach $i ( @{ $seqs_of_rep{$rep} }) {
	154	print LOG "Chimeric_cluster\t$i\t$rep\t$clstr_size\tP1:$seq_R1_clstr{$rep}\tP2:$seq_R2_clstr{$rep}\n";
	155	}
	156	}
	157	}
	158	}
	159
	160	}
	161	close(TMP);
	162	close(OUT);
	163
	164	foreach my $f (($seq_99, $seq_992)) {
	165	my $fout = ($f eq $seq_99) ? $output : $output_2;
	166
	167	open(TMP, $f) \|\| die "can not open $f";
	168	open(OUT, ">$fout") \|\| die "can not write to $fout";
	169
	170	my $flag = 0;
	171	while($ll = <TMP>) {
	172	if ($ll =~ /^>/) {
	173	$gi = substr($ll,1);
	174	chop($gi);
	175	$gi =~ s/\s.+$//;
	176	$flag = ( $good_ids{$gi} ) ? 1 : 0;
	177	}
	178	print OUT $ll if ($flag);
	179	}
	180
	181	close(TMP);
	182	close(OUT);
	183	}
	184
	185	close(LOG);
	186
	187	sub usage {
	188	<<EOF
	189	Usage:
	190	$script_name -i seq.nr.R1.clstr -j seq.nr.R2.clstr -c 0.0001 -a seq.97f-full.clstr -f seq.99 -g seq.99.2 -o seq.99f
	191
	192	Options:
	193	-i input seq.nr.R1.clstr
	194	-j input seq.nr.R2.clstr
	195	-a input seq.97f-full.clstr
	196	-f input seq.99
	197	-g input seq.99.2
	198	-o output cluster without chimeric cluster, without ref-only cluster
	199	-c abundance cutoff, default $abs_cutoff
	200	small clusters < this size will be checked for chimeric and be removed if is chimeric
	201	if total input sequence is 50,000, then clusters < 2 (i.e. singletons) are checked
	202
	203	EOF
	204	}
	205	###### END usage
	206

+51

-0

usecases/Miseq-16S/filter-nontop-ref.pl less more

	0	#!/usr/bin/perl
	1
	2	use Getopt::Std;
	3	my $script_name = $0;
	4	my $script_dir = $0;
	5	$script_dir =~ s/[^\/]+$//;
	6	chop($script_dir);
	7	$script_dir = "./" unless ($script_dir);
	8
	9	my ($i, $j, $k, $str, $cmd, $ll);
	10
	11	my $clstr = "";
	12	my $best_ref = "";
	13	my $best_score = 0;
	14
	15	my $refonly = 1;
	16	while($ll=<>){
	17	if ($ll =~ /^>/) {
	18	if ($clstr) {
	19	print $clstr;
	20	print $best_ref if ($best_ref);
	21	}
	22
	23	$clstr = $ll;
	24	$best_ref = "";
	25	$best_score = 0;
	26	}
	27	else {
	28	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	29	my $id = $2;
	30	if ($id =~ /^Sample/) {
	31	$clstr .= $ll;
	32	}
	33	elsif ( $ll =~ /\/([\d\|\.]+)%$/) {
	34	my $iden = $1;
	35	if ($iden > $best_score) {
	36	$best_score = $iden;
	37	$best_ref = $ll;
	38	}
	39	}
	40	}
	41	else {
	42	print STDERR "format err: $ll";
	43	}
	44	}
	45	}
	46
	47	if ($clstr) {
	48	print $clstr;
	49	print $best_ref if ($best_ref);
	50	}

+34

-0

usecases/Miseq-16S/filter-refonly-cluster.pl less more

	0	#!/usr/bin/perl
	1
	2	use Getopt::Std;
	3	my $script_name = $0;
	4	my $script_dir = $0;
	5	$script_dir =~ s/[^\/]+$//;
	6	chop($script_dir);
	7	$script_dir = "./" unless ($script_dir);
	8
	9	my ($i, $j, $k, $str, $cmd, $ll);
	10
	11	my $num_total_seq;
	12	my %seq_nr_size;
	13
	14	if (1) {
	15	my $clstr = "";
	16	my $refonly = 1;
	17	while($ll=<>){
	18	if ($ll =~ /^>/) {
	19	print $clstr unless ($refonly);
	20	$clstr = $ll;
	21	$refonly = 1;
	22	}
	23	else {
	24	$clstr .= $ll;
	25	my $id;
	26	if ($ll =~ /\d+(aa\|nt), >(.+)\.\.\./) {
	27	$id = $2;
	28	$refonly = 0 if ($id =~ /^Sample/);
	29	}
	30	}
	31	}
	32	}
	33

+75

-0

usecases/Miseq-16S/greengene-ann1.pl less more

	0	#!/usr/bin/perl
	1	## =========================== NGS tools ==========================================
	2	## NGS tools for metagenomic sequence analysis
	3	## May also be used for other type NGS data analysis
	4	##
	5	## Weizhong Li, UCSD
	6	## liwz@sdsc.edu
	7	## http://weizhongli-lab.org/
	8	## ================================================================================
	9
	10	use Getopt::Std;
	11	getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
	12	die usage() unless ($opts{i} and $opts{j} and $opts{o});
	13	my ($i, $j, $k, $cmd);
	14	my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
	15	my ($len, $lena, $lenb);
	16
	17	my $file1 = $opts{i};
	18	my $fasta = $opts{j};
	19	my $output = $opts{o};
	20
	21	my %id_2_ann;
	22	open(TMP, $file1) \|\| die "can not open $file1";
	23	while($ll=<TMP>){
	24	chop($ll);
	25	my ($id, $txt) = split(/\s+/, $ll, 2);
	26	$txt =~ s/ /./g;
	27	$id_2_ann{$id} = $txt;
	28	}
	29	close(TMP);
	30
	31	my %id_2_seq = ();
	32	my $id = "";
	33	open(TMP, $fasta) \|\| die "can not open $fasta";
	34	while($ll=<TMP>){
	35	if ($ll =~ /^>(\d+)/) {
	36	chop($ll);
	37	$id = $1;
	38	$ann = $id_2_ann{$id};
	39	$id = "$id\|$ann" if ($ann);
	40	}
	41	else {
	42	$id_2_seq{$id} .= $ll;
	43	}
	44	}
	45
	46	close(TMP);
	47
	48	my @ids = keys %id_2_seq;
	49	@ids = sort {length($b) <=> length($a) } @ids;
	50
	51	open(OUT, "> $output") \|\| die "can not write to $output";
	52	foreach $id (@ids) {
	53	print OUT ">$id\n$id_2_seq{$id}";
	54	}
	55	close(OUT);
	56
	57
	58
	59	sub usage {
	60	<<EOD;
	61	This script formats Greengene FASTA file for CD-HIT-OTU-MiSeq. You should download Greengene sequences
	62	from http://greengenes.secondgenome.com/downloads, or ftp://greengenes.microbio.me/.
	63	download file like greengenes_release/gg_13_5/gg_13_5_otus.tar.gz, unpack the tar file. You may find
	64	gg_13_5_otus/taxonomy/99_otu_taxonomy.txt and gg_13_5_otus/rep_set/99_otus.fasta
	65
	66	Run this script as $0 -i gg_13_5_otus/taxonomy/99_otu_taxonomy.txt -j gg_13_5_otus/rep_set/99_otus.fasta -o gg_13_5_processed.fasta
	67
	68	Options:
	69	======================
	70	-i path for gg_13_5_otus/taxonomy/99_otu_taxonomy.txt
	71	-j path for gg_13_5_otus/rep_set/99_otus.fasta
	72	-o output FASTA file of formatted Greengene reference DB
	73	EOD
	74	}

+78

-0

usecases/Miseq-16S/pool_samples.pl less more

	0	#!/usr/bin/perl
	1	#
	2	use Getopt::Std;
	3	getopts("s:S:o:f:j:",\%opts);
	4
	5	die usage() unless ($opts{s} or $opts{S});
	6
	7	my $output = $opts{o};
	8	$output = "pooled" unless ($output);
	9	my $sample_in = $opts{s};
	10	my $sample_command_in = $opts{S}; #### ',' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq;sample2:R1.fq:R2.fq or sample1;sample2;sample3
	11	my $job = $opts{j};
	12	$job = "otu" unless ($job);
	13
	14	my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
	15
	16	my ($i, $j, $k, $cmd);
	17	$cmd = `mkdir $output` unless (-e $output);
	18
	19	foreach $i (@file_list) {
	20	if (-e "$output/$i") {
	21	die "output dir $output & file $output/$i already exist, please remove all files from $output and re-run\n";
	22	}
	23	}
	24
	25	######## parse NGS_samples
	26	my @NGS_samples = ();
	27	if (defined($sample_in)) {
	28	open(TMP, $sample_in) \|\| die "can not open $sample_in";
	29	while($ll=<TMP>){
	30	next if ($ll =~ /^#/);
	31	next unless ($ll =~ /^\w/); chop($ll);
	32	my ($id, @data) = split(/\s+/,$ll);
	33	push(@NGS_samples, $id);
	34	}
	35	close(TMP);
	36	}
	37	elsif (defined($sample_command_in)) {
	38	my @lls = split(/,/, $sample_command_in);
	39	foreach $ll (@lls) {
	40	my ($id, @data) = split(/:/, $ll);
	41	push(@NGS_samples, $id);
	42	}
	43	}
	44	else {
	45	die "no input samples";
	46	}
	47
	48	foreach $i (@file_list) {
	49	my $target = "$output/$i";
	50	foreach $j (@NGS_samples) {
	51	my $source = "$j/$job/$i";
	52	if (-e $source) {
	53	print STDERR "cat $source >> $target\n";
	54	$cmd = `cat $source >> $target`;
	55	}
	56	else {
	57	print STDERR "Warning, $source missing\n";
	58	}
	59	}
	60	}
	61
	62	sub usage {
	63	<<EOD;
	64	$0 -s sample_file -o output_dir
	65	-s sample data file, required unless -S is present
	66	File format example
	67	#Sample data file example, TAB or space delimited for following lines
	68	Sample_ID1 sample_data_0 sample_data_1
	69	Sample_ID2 sample_data_0 sample_data_1
	70	Sample_ID3 sample_data_0 sample_data_1
	71
	72	-S sample data from command line, required unless -s is present
	73	format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
	74
	75	EOD
	76	}
	77