Commit 2b64e48a993ba43ab8000046f34c5a34a17d2376 - libnhgri-blastall-perl

Imported Upstream version 0.66 Jens Preussner 11 years ago

16 changed file(s) with 2665 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all

+1719

-0

Blastall.pm less more

	0	package NHGRI::Blastall;
	1	require 5.004;
	2
	3	###########################################################################
	4	# See the bottom of this file for the POD documentation. Search for the
	5	# string '=head'.
	6	#
	7	# You can run this file through either pod2man or pod2html to produce pretty
	8	# documentation in manual or html file format (these utilities are part of the
	9	# Perl 5 distribution).
	10	#
	11	# The most recent version and complete docs are available at:
	12	# ftp://ftp.nhgri.nih.gov/pub/software/
	13	###########################################################################
	14	# This software is "United States Government Work" under the
	15	# terms of the United States Copyright Act. It was written as part of the
	16	# authors' official duties for the United States Government and thus
	17	# cannot be copyrighted. This software/database is freely available to
	18	# the public for use without a copyright notice. Restrictions cannot be
	19	# placed on its present or future use.
	20	#
	21	# Although all reasonable efforts have been taken to ensure the accuracy
	22	# and reliability of the software and data, the National Human Genome
	23	# Research Institute (NHGRI) and the U.S. Government does not and cannot
	24	# warrant the performance or results that may be obtained by using this
	25	# software or data. NHGRI and the U.S. Government disclaims all
	26	# warranties as to performance, merchantability or fitness for any
	27	# particular purpose.
	28	#
	29	# In any work or product derived from this material, proper attribution
	30	# of the authors as the source of the software or data should be made,
	31	# using http://genome.nhgri.nih.gov/blastall as the citation.
	32	###########################################################################
	33
	34	use strict;
	35	use vars qw($VERSION $REVISION $BLASTALL $BLASTCL3 $BL2SEQ %WU_BLAST
	36	$FORMATDB %DEFAULT_FILTERS);
	37	use IO::File;
	38	use Carp qw(carp croak);
	39
	40	$VERSION = '0.67';
	41	$REVISION = '# $Id: Blastall.pm,v 1.3 2002/01/15 20:09:08 jpearson Exp $';
	42
	43	# you can set these variables to the full path of the corresponding binaries
	44	$BLASTALL = 'blastall';
	45	$BLASTCL3 = 'blastcl3';
	46	$BL2SEQ = 'bl2seq';
	47	$FORMATDB = 'formatdb';
	48
	49	# you can adjust the locations of wu-blast programs
	50	# for instance 'wu_blastn' => '/usr/local/wu/blastn',
	51	# I haven't added support for wu-blastall (hopefully I will get to this)
	52	%WU_BLAST = ('wu_blastn' => 'wu-blastn',
	53	'wu_blastp' => 'wu-blastp',
	54	'wu_blastx' => 'wu-blastx',
	55	'wu_tblastn' => 'wu-tblastn',
	56	'wu_tblastx' => 'wu-tblastx'
	57	);
	58
	59	###########################################################################
	60	# You probably shouldn't be mucking below here...
	61	###########################################################################
	62	%DEFAULT_FILTERS = ('id' => '=~'
	63	,'defline' => '=~'
	64	,'subject_length' => '>'
	65	,'scores' => '>'
	66	,'expects' => '<'
	67	,'identities' => '>'
	68	,'match_lengths' => '>'
	69	,'subject_strands' => 'eq'
	70	,'query_strands' => 'eq'
	71	,'query_frames' => '=='
	72	,'subject_frames' => '=='
	73	);
	74
	75	sub new {
	76	my $this = shift;
	77	my %args = @_;
	78	my $class = ref($this) \|\| $this;
	79	my $self = \%args;
	80	bless $self, $class;
	81	return($self);
	82	}
	83	sub blastall {
	84	my $self = shift;
	85	my $rh_opts = _get_opts(@_);
	86
	87	my ($ra_args,$tmp_file) = _get_args($rh_opts);
	88	unshift @{$ra_args}, $BLASTALL;
	89
	90	my $rc = 0xffff & system(@{$ra_args});
	91	if ($rc == 0) {
	92	$self->{options} = $rh_opts;
	93	push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
	94	$self->{report_file} = $self->{options}->{o} \|\| $tmp_file;
	95	return(1);
	96	} else {
	97	foreach (@{$ra_args}) { print STDERR "$_ "; }
	98	print STDERR "\n";
	99	carp qq!the above system call to $BLASTALL failed.
	100	Possible problems include
	101	1. Is blastall in your path or is \$BLASTALL the full path to blastall?
	102	2. Do you have the DATABASE installed
	103	3. Are you pointing to the correct location of the query sequence
	104	4. Is your DATABASE corrupted
	105	5. Are you running the correct program for the type of database and
	106	query sequence you are running. e.g. BLASTP -> PROTEIN
	107	6. Perhaps try blastcl3 instead of blastall.
	108	!;
	109	return undef;
	110	}
	111	}
	112	sub formatdb {
	113	my $self = shift;
	114	my $rh_opts = _get_opts(@_);
	115
	116	my $ra_args = _get_formatdb_args($rh_opts);
	117	unshift @{$ra_args}, $FORMATDB;
	118
	119	my $rc = 0xffff & system(@{$ra_args});
	120	if ($rc == 0) {
	121	$self->{options} = $rh_opts;
	122	my $ra_indexes = _get_formatdb_indexes($rh_opts);
	123	push @{$self->{indexes}}, @{$ra_indexes};
	124	#$self->{indexes} = _get_formatdb_indexes($rh_opts);
	125	$self->{tmp_file} = [];
	126	return(1);
	127	} else {
	128	foreach (@{$ra_args}) { print STDERR "$_ "; }
	129	print STDERR "\n";
	130	carp qq!the above system call to $FORMATDB failed.
	131	Possible problems include
	132	1. Is formatdb in your path or is \$FORMATDB the full path to formatdb?
	133	!;
	134	return undef;
	135	}
	136	}
	137	sub remove_formatdb_indexes {
	138	my $self = shift;
	139	if ($self->{indexes}) {
	140	foreach my $i (@{$self->{indexes}}) {
	141	unlink $i or carp "could not remove $i:$!";
	142	}
	143	}
	144	}
	145	sub blast_one_to_many {
	146	my $self = shift;
	147	my $rh_opts = _get_opts(@_);
	148	my ($ra_args,$tmp_file) = _get_args($rh_opts);
	149	if ($rh_opts->{p} eq 'blastp' or $rh_opts->{p} eq 'blastx') {
	150	$self->formatdb(p => 'T', i => $rh_opts->{d});
	151	} else {
	152	$self->formatdb(p => 'F', i => $rh_opts->{d});
	153	}
	154	$self->blastall($rh_opts);
	155	$self->remove_formatdb_indexes();
	156	}
	157	sub bl2seq {
	158	my $self = shift;
	159	my $rh_opts = _get_opts(@_);
	160
	161	my ($ra_args,$tmp_file) = _get_args($rh_opts);
	162	unshift @{$ra_args}, $BL2SEQ;
	163
	164	my $rc = 0xffff & system(@{$ra_args});
	165	if ($rc == 0) {
	166	$self->{options} = $rh_opts;
	167	push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
	168	$self->{report_file} = $self->{options}->{o} \|\| $tmp_file;
	169	return(1);
	170	} else {
	171	foreach (@{$ra_args}) { print STDERR "$_ "; }
	172	print STDERR "\n";
	173	carp qq!the above system call to $BL2SEQ failed.
	174	Possible problems include
	175	1. Is bl2seq in your path or is \$BL2SEQ the full path to bl2seq?
	176	2. Are you running the correct program for the type of sequences you
	177	are comparing? e.g. BLASTP -> PROTEIN
	178	!;
	179	return undef;
	180	}
	181	}
	182	sub wu_blastall {
	183	my $self = shift;
	184	my $rh_opts = _get_opts(@_);
	185
	186	my ($ra_args,$tmp_file) = _get_wu_args($rh_opts);
	187	my $command = join ' ', @{$ra_args};
	188	my $output = $rh_opts->{o} \|\| $tmp_file;
	189	$command .= " > $output";
	190	#print "$command\n";
	191
	192	my $rc = 0xffff & system("$command");
	193	if ($rc == 0) {
	194	$self->{options} = $rh_opts;
	195	push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
	196	$self->{report_file} = $self->{options}->{o} \|\| $tmp_file;
	197	return(1);
	198	} else {
	199	print STDERR "$command\n";
	200	carp qq!the above system call failed.!;
	201	return undef;
	202	}
	203	}
	204	sub net_blastall {
	205	carp "net_blastall is depricated please use blastcl3 instead\n";
	206	blastcl3(@_);
	207	}
	208	sub blastcl3 {
	209	{
	210	local $BLASTALL = $BLASTCL3;
	211	blastall(@_);
	212	}
	213	}
	214	sub mask {
	215	my $self = shift;
	216	my $rh_opts;
	217	if (@_ > 1) {
	218	my %opts = @_;
	219	$rh_opts = \%opts;
	220	} else {
	221	$rh_opts = shift;
	222	}
	223	my ($rc);
	224	# type can be wu_blastall, blastcl3, blastall or net_blastall(depricated)
	225	my $type_of_blast = $rh_opts->{type} \|\| 'blastall';
	226	delete $rh_opts->{type};
	227	my ($ra_args,$tmp_file) = _get_args($rh_opts);
	228	unshift @{$ra_args}, $BLASTALL;
	229	my $m = new NHGRI::Blastall;
	230	$rh_opts->{o} = $rh_opts->{o} \|\| $tmp_file;
	231	if ($type_of_blast eq 'net_blastall') {
	232	$m->net_blastall($rh_opts);
	233	} elsif ($type_of_blast eq 'blastcl3') {
	234	$m->blastcl3($rh_opts);
	235	} elsif ($type_of_blast eq 'wu_blastall') {
	236	$m->wu_blastall($rh_opts);
	237	} else {
	238	# 2002-01-12 JVP added "return undef unless"
	239	return undef unless ($m->blastall($rh_opts));
	240	}
	241	my $report_file = $m->{report_file};
	242	my $fa_file = $m->{options}->{i};
	243	my($fa_line,$sequence) = _get_sequence($fa_file);
	244	my($ra_mask_positions) = _get_mask_positions($report_file);
	245	my $masked_sequence = _get_masked_sequence($sequence,$ra_mask_positions);
	246	return ("$fa_line\n$masked_sequence", $ra_mask_positions) if (wantarray);
	247	return ("$fa_line\n$masked_sequence");
	248	}
	249	sub print_report {
	250	my $self = shift;
	251	my $report_file = $self->{report_file};
	252	return undef unless ($report_file);
	253	open REPORT, $report_file or return(0);
	254	my $report = join ('',<REPORT>);
	255	print $report;
	256	return;
	257	}
	258	sub filter {
	259	my $self = shift;
	260	my $rh_criteria;
	261	if (@_ > 1) {
	262	my %criteria = @_;
	263	$rh_criteria = \%criteria;
	264	} else {
	265	$rh_criteria = shift;
	266	}
	267
	268	my @filtered_results = ();
	269	$self->_parse_report() unless ($self->{been_parsed});
	270
	271	my ($ra_stripped_values,$ra_cmp_ops) =
	272	_get_stripped_values_and_comparison_operators($rh_criteria);
	273	foreach my $entry (@{$self->{results}}) {
	274	if ($entry = _entry_passes_filter($entry,$ra_stripped_values,
	275	$ra_cmp_ops,$rh_criteria)) {
	276	push @filtered_results,$entry;
	277	}
	278	}
	279	$self->{results} = \@filtered_results;
	280	return @filtered_results;
	281	}
	282	sub unfilter {
	283	my $self = shift;
	284	$self->_parse_report();
	285	}
	286	sub result {
	287	my($self,@r) = @_;
	288	my $ra_results = [];
	289
	290	$self->_parse_report() unless ($self->{been_parsed});
	291	return $self->_all_results unless @r;
	292	if (@r == 1) {
	293	$ra_results = $self->_result_for_one(@r);
	294	} elsif (@r == 2) {
	295	$ra_results = $self->_result_for_two(@r);
	296	} else {
	297	return undef;
	298	}
	299	return(@{$ra_results});
	300	}
	301	sub read_report {
	302	my $self = shift;
	303	my $file = shift;
	304	$self->{report_file} = $file;
	305	}
	306	sub get_database_description {
	307	my $self = shift;
	308	$self->_parse_report() unless ($self->{been_parsed});
	309	return $self->{database_description};
	310	}
	311	sub get_database_sequence_count {
	312	my $self = shift;
	313	$self->_parse_report() unless ($self->{been_parsed});
	314	return $self->{database_sequence_count};
	315	}
	316	sub get_database_letter_count {
	317	my $self = shift;
	318	$self->_parse_report() unless ($self->{been_parsed});
	319	return $self->{database_letter_count};
	320	}
	321	sub get_blast_program {
	322	my $self = shift;
	323	$self->_parse_report() unless ($self->{been_parsed});
	324	return $self->{program};
	325	}
	326	sub get_blast_version {
	327	my $self = shift;
	328	$self->_parse_report() unless ($self->{been_parsed});
	329	return $self->{blast_version};
	330	}
	331	sub get_report {
	332	my $self = shift;
	333	return $self->{report_file};
	334	}
	335	sub DESTROY {
	336	my $self = shift;
	337	foreach (@{$self->{tmp_file}}) {
	338	unlink $_ if (-e $_);
	339	}
	340	}
	341
	342	# END OF PUBLIC METHODS
	343	# BEGIN PRIVATE METHODS
	344
	345	sub _get_opts {
	346	my @args = @_;
	347	my $rh_opts;
	348
	349	if (@args == 1) {
	350	$rh_opts = shift;
	351	} else {
	352	my %opts = @args;
	353	$rh_opts = \%opts;
	354	}
	355	return $rh_opts;
	356	}
	357	sub _get_formatdb_indexes {
	358	my $rh_opts = shift;
	359	my @indexes = ();
	360	my $x = 'p';
	361	my $name = $rh_opts->{i};
	362	$name = $rh_opts->{n} if ($rh_opts->{n});
	363	$x = 'n' if ($rh_opts->{p} =~ /f/i);
	364	push @indexes, "${name}.${x}hr", "${name}.${x}in", "${name}.${x}sq";
	365	if ($rh_opts->{o} && $rh_opts->{o} =~ /t/i) {
	366	push @indexes, "${name}.${x}nd", "${name}.${x}ni",
	367	"${name}.${x}sd", "${name}.${x}si";
	368	}
	369	return \@indexes;
	370	}
	371	sub _write_report_to_disk {
	372	my $report_file = shift;
	373	my $results = shift;
	374	open OUT, ">$report_file" or croak "cannot open $report_file:$!";
	375	print OUT $results;
	376	}
	377
	378	sub _get_matrix_parameters {
	379	return { PAM30 => [9, 1],
	380	PAM70 => [10, 1],
	381	BLOSUM45 => [14, 2],
	382	BLOSUM80 => [10, 1],
	383	BLOSUM62 => [11, 1],
	384	BLOSUM45 => [14, 2]
	385	};
	386	}
	387	sub _get_genetic_codes {
	388	return { 1 => "Standard (1)",
	389	2 => "Vertebrate Mitochondrial (2)",
	390	3 => "Yeast Mitochondrial (3)",
	391	4 => "Mold Mitochondrial; ... (4)",
	392	5 => "Invertebrate Mitochondrial (5)",
	393	6 => "Ciliate Nuclear; ... (6)",
	394	9 => "Echinoderm Mitochondrial (9)",
	395	10 => "Euplotid Nuclear (10)",
	396	11 => "Bacterial (11)",
	397	12 => "lternative Yeast Nuclear (12)",
	398	13 => "Ascidian Mitochondrial (13)",
	399	14 => "Flatworm Mitochondrial (14)",
	400	15 => "Blepharisma Macronuclear (15)"
	401	};
	402	}
	403
	404	sub _get_masked_sequence {
	405	my $sequence = shift;
	406	my $ra_mask_positions = shift;
	407	my ($i,$iplus);
	408	return $sequence unless ($ra_mask_positions); # nothing masked
	409
	410	if ($ra_mask_positions->[0] &&
	411	$ra_mask_positions->[0] > $ra_mask_positions->[1]) {
	412	@{$ra_mask_positions} = reverse @{$ra_mask_positions};
	413	}
	414
	415	for ($i=0; $i < @{$ra_mask_positions}; $i=$i+2) {
	416	$iplus = ($i + 1);
	417	$sequence = _mask_from_x_to_y($sequence,$ra_mask_positions->[$i],
	418	$ra_mask_positions->[$iplus]);
	419	}
	420	return $sequence;
	421	}
	422
	423	sub _mask_from_x_to_y {
	424	my $sequence = shift;
	425	my $x = shift;
	426	my $y = shift;
	427	my $length = ($y - $x);
	428	substr($sequence, $x, $length) = ('N' x $length);
	429	return($sequence);
	430	}
	431
	432
	433	sub _get_mask_positions {
	434	my $report_file = shift;
	435	my @mask_positions = ();
	436
	437	open REPORT, $report_file or carp "cannot open $report_file:$!";
	438	while (<REPORT>) {
	439	if (/^Query:\s+(\d+)\s+.*\s+(\d+)$/) {
	440	push @mask_positions,($1,$2);
	441	}
	442	}
	443	return(\@mask_positions);
	444	}
	445
	446
	447	sub _get_sequence {
	448	my $fasta_file = shift;
	449	my @fasta_lines = ();
	450
	451	open FASTA, $fasta_file or warn "cannot open $fasta_file:$!";
	452	chomp(@fasta_lines = <FASTA>);
	453	my $first_line = shift @fasta_lines;
	454	my $seq = join '',@fasta_lines;
	455	return ($first_line,$seq);
	456	}
	457	sub _result_for_one {
	458	my $self = shift;
	459	my $attribute = shift;
	460	my $count = 0;
	461	my $ra_results = [];
	462	$attribute = _map_attribute($attribute);
	463	foreach (@{$self->{results}}) {
	464	if (ref($self->{results}->[$count]->{$attribute}) eq 'ARRAY'){
	465	push @{$ra_results},@{$self->{results}->[$count]->{$attribute}};
	466	} else {
	467	push @{$ra_results},$self->{results}->[$count]->{$attribute};
	468	}
	469	$count++;
	470	}
	471	return($ra_results);
	472	}
	473	sub _result_for_two {
	474	my $self = shift;
	475	my $attribute = shift;
	476	my $id = shift;
	477	my $ra_results = [];
	478	$attribute = _map_attribute($attribute);
	479	for (my $i = 0; $i < @{$self->{results}}; $i++) {
	480	next unless ($self->{results}->[$i]->{id} eq $id);
	481	if (ref($self->{results}->[$i]->{$attribute}) eq 'ARRAY'){
	482	push @{$ra_results},@{$self->{results}->[$i]->{$attribute}};
	483	} else {
	484	push @{$ra_results},$self->{results}->[$i]->{$attribute};
	485	}
	486	}
	487	return($ra_results);
	488	}
	489	sub _map_attribute {
	490	my $attribute = shift;
	491	my %attribute_map = ('ids' => 'id',
	492	'score' => 'scores',
	493	'def_line' => 'defline',
	494	'def_lines' => 'defline',
	495	'def-line' => 'defline',
	496	'def-lines' => 'defline',
	497	'deflines' => 'defline',
	498	'expect' => 'expects',
	499	'identity' => 'identities',
	500	'gap' => 'gaps',
	501	'subject_lengths' => 'subject_length',
	502	'subject-length' => 'subject_length',
	503	'subject-lengths' => 'subject_length',
	504	'subjectlength' => 'subject_length',
	505	'subjectlengths' => 'subject_length',
	506	'length' => 'subject_length',
	507	'lengths' => 'subject_length',
	508	'match_length' => 'match_lengths',
	509	'matchlengths' => 'match_lengths',
	510	'matchlength' => 'match_lengths',
	511	'match-length' => 'match_lengths',
	512	'match-lengths' => 'match_lengths',
	513	'query-frame' => 'query_frames',
	514	'queryframe' => 'query_frames',
	515	'query-frames' => 'query_frames',
	516	'queryframes' => 'query_frames',
	517	'query_frame' => 'query_frames',
	518	'query-frame' => 'query_frames',
	519	'subjectframe' => 'subject_frames',
	520	'subject-frames' => 'subject_frames',
	521	'subjectframes' => 'subject_frames',
	522	'subject_frame' => 'subject_frames',
	523	);
	524
	525	$attribute = lc($attribute);
	526	$attribute = $attribute_map{$attribute} \|\| $attribute;
	527	return($attribute);
	528	}
	529	sub _all_results {
	530	my $self = shift;
	531	return () unless defined($self) && $self->{results};
	532	return () unless @{$self->{results}};
	533	return @{$self->{results}};
	534	}
	535	sub _entry_passes_filter {
	536	my $entry = shift;
	537	my $ra_stripped_values = shift;
	538	my $ra_cmp_ops = shift;
	539	my $rh_criteria = shift;
	540	my @fields = keys %$rh_criteria;
	541	my ($i,$code,$pass,@codes,$ra_safe_vals);
	542
	543	for($i=0; $i < @$ra_stripped_values; $i++) {
	544	if (ref($entry->{$fields[$i]}) eq "ARRAY") {
	545	my $n = 0;
	546	my (@sorted,$best_result);
	547	foreach (@{$entry->{$fields[$i]}}) {
	548	if ($ra_cmp_ops->[$i] =~ /\~/) {
	549	s/"/'/g; # cannot nest double quotes in regex
	550	$code = '$pass' . " = 1 if (\"$_\" $ra_cmp_ops->[$i] ";
	551	$ra_stripped_values->[$n] =~ s/\//\\\//g;
	552	$code .= "/$ra_stripped_values->[$n]/i)";
	553	} else {
	554	$ra_safe_vals = _check_for_unsafe_e($entry->{$fields[$i]});
	555	if ($ra_cmp_ops->[$i] =~ />/) {
	556	@sorted = sort { $b <=> $a } @{$ra_safe_vals};
	557	} else {
	558	@sorted = sort { $a <=> $b } @{$ra_safe_vals};
	559	}
	560
	561	$best_result = shift @sorted;
	562	$code = '$pass' . qq! = 1 if ("$best_result" !;
	563	$code .= qq!$ra_cmp_ops->[$i] '$ra_stripped_values->[$i]')!;
	564	}
	565	_set_passing_positions_flags ($entry,
	566	$ra_safe_vals,
	567	$ra_cmp_ops->[$i],
	568	$ra_stripped_values->[$i]);
	569	push @codes, $code;
	570	$n++;
	571	}
	572	foreach (@codes) {
	573	$pass = 0;
	574	eval $_;
	575	carp "eval on $_ failed: $@" if $@;
	576	}
	577	return undef unless $pass;
	578	} else {
	579	if ($ra_cmp_ops->[$i] =~ /\~/) {
	580	# cannot nest double quotes in regex
	581	$entry->{$fields[$i]} =~ s/"/'/g;
	582	$code = '$pass' . qq! = 1 if ("$entry->{$fields[$i]}" !;
	583	$code .= "$ra_cmp_ops->[$i] ";
	584	$ra_stripped_values =~ s/\//\\\//g;
	585	$code .= "/$ra_stripped_values->[$i]/i)\;";
	586	} else {
	587	$entry->{$fields[$i]} =~ s/^e-(\d*)$/1e-$1/;
	588	$code = '$pass' . qq! = 1 if ("$entry->{$fields[$i]}" !;
	589	$code .= "$ra_cmp_ops->[$i] ";
	590	$code .= qq!'$ra_stripped_values->[$i]')\;!;
	591	}
	592	$pass = 0;
	593
	594	eval $code;
	595	return undef unless $pass;
	596	croak "eval on $code failed: $@" if $@;
	597	}
	598	}
	599	return $entry;
	600	}
	601
	602	sub _set_passing_positions_flags {
	603	my $entry = shift;
	604	my $ra_safe_vals = shift;
	605	my $cmp_op = shift;
	606	my $cmp_val = shift;
	607	my $count = 0;
	608	my $code = '';
	609	my @array_positions = ();
	610
	611	$entry->{passing_positions} = [];
	612	foreach (@{$ra_safe_vals}) {
	613	$code = qq^if ("$_" $cmp_op "$cmp_val") { ^;
	614	$code .= q^ $entry->{passing_positions}->[$count] = "pass"; ^;
	615	$code .= q^} else { ^;
	616	$code .= q^ $entry->{passing_positions}->[$count] = "fail"; }^;
	617	eval $code;
	618	carp "eval on $code failed: $@" if $@;
	619	$count++;
	620	}
	621	return;
	622	}
	623	###########################################################################
	624	# sub _check_for_unsafe_e #
	625	###########################################################################
	626	# When sorting BLAST values that have an initial e (e.g. `e-122')
	627	# they will not sort. so we need to put a 1 if front (e.g. `1e-122')
	628	###########################################################################
	629	sub _check_for_unsafe_e {
	630	my $ra_unsafe_vals = shift;
	631	my @safe_vals = ();
	632
	633	foreach (@{$ra_unsafe_vals}) {
	634	$_ =~ s/^e-(\d*)$/1e-$1/;
	635	push @safe_vals,$_;
	636	}
	637	return \@safe_vals;
	638	}
	639
	640	sub _get_stripped_values_and_comparison_operators {
	641	my $rh_criteria = shift;
	642	my ($key,$value,@stripped_values,@cmp_ops);
	643	my $count = 0;
	644
	645	while (($key,$value) = each %$rh_criteria) {
	646	$key = lc $key;
	647	($stripped_values[$count],$cmp_ops[$count]) =
	648	_check_for_comparison_operators($key,$value);
	649	$count++;
	650	}
	651	return(\@stripped_values,\@cmp_ops);
	652	}
	653	sub _check_for_comparison_operators {
	654	my $key = shift;
	655	my $value = shift;
	656	my ($cmp_op,$new_value);
	657	if ($value =~ /^(<=\|>=\|=~\|~\|!~\|>\|<\|=\|==)(.*)/) {
	658	if ($1 eq '~') {
	659	$cmp_op = '=~';
	660	} elsif ($1 eq '=') {
	661	$cmp_op = '==';
	662	} else {
	663	$cmp_op = $1;
	664	}
	665	$new_value = $2;
	666	} else {
	667	$cmp_op = $DEFAULT_FILTERS{$key};
	668	$new_value = $value;
	669	}
	670	return($new_value,$cmp_op);
	671	}
	672	sub _parse_report {
	673	my $self = shift;
	674	my $report_file = $self->{report_file};
	675	$self->{been_parsed} = 1;
	676	$self->{results} = [];
	677	unless ($self->{report_file_handle}) {
	678	open REPORT, $report_file or carp "cannot open $report_file:$!";
	679	$self->{report_file_handle} = \*REPORT;
	680	}
	681	my $hit_new_seq = 0;
	682	my $hit_a_seq = 0;
	683	my $hit_first_seq_line = 0;
	684	my $hit_first_hsp = 0;
	685	my $hit_summaries = 0;
	686	my $hit_query = 0;
	687	my $hit_database = 0;
	688	my ($id,$defline,$subject_length,@scores,@expects,@identities,
	689	@match_lengths,@query_starts,@subject_starts,$strand,$q_strand,
	690	$last_end_point,$q_last_end_point,@subject_strands,
	691	@query_strands,$query,$query_length,$e,@query_seqs,
	692	@subject_seqs,$query_seq,$subject_seq,@query_frames,@subject_frames,
	693	@query_ends, @subject_ends);
	694	my $id_regex = $self->{-DB_ID_REGEX}
	695	\|\| '[^\\|]+(?:\\|[^\\|,\s]*){1,10}';
	696	$id_regex = "($id_regex)"; # add capturing parens
	697	while (<REPORT>) {
	698	$hit_summaries = 1 if (/^Sequences producing significant alignments/);
	699	$hit_a_seq = 1 if (/^>/);
	700
	701	# Gather Query defline and Length
	702	if ($hit_query == 0 && /^Query=\s+(.*)/) {
	703	$query = $1;
	704	$hit_query = 1;
	705	} elsif ($hit_query == 1) {
	706	if (/$([0-9,]+) letters$/) {
	707	$query_length = $1;
	708	$query_length =~ s/,//g;
	709	$hit_query = 0;
	710	} else {
	711	chomp;
	712	$query .= $_;
	713	}
	714	}
	715	# Gather Database information
	716	if (/^Database:\s+(.*)/) {
	717	$self->{database_description} = $1;
	718	$hit_database = 1;
	719	} elsif ($hit_database &&
	720	/^\s+([\d,]+) sequences; ([\d,]+) total letters/) {
	721	$self->{database_sequence_count} = $1;
	722	$self->{database_letter_count} = $2;
	723	$hit_database = 0;
	724	} elsif ($hit_database && /^\s(.)$/) {
	725	$self->{database_description} .= $1;
	726	}
	727
	728	if (!$self->{program} && $_ =~ /^(T?BLAST[XNP]) ([\d.]+)?/) {
	729	$self->{program} = $1;
	730	$self->{blast_version} = $2;
	731	}
	732	$self->{options}->{p} = lc($1) if (/^(T?BLAST[XNP]?)/);
	733	next unless ($hit_a_seq);
	734	if (/^>$id_regex\s+(.*)/) {
	735	if ($id) {
	736	# catches the starting point for the previous reverse strand
	737	push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
	738	push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
	739	push @subject_starts,$last_end_point if ($strand eq 'minus');
	740	push @subject_ends,$last_end_point if ($strand eq 'plus');
	741	push @subject_seqs,$subject_seq if ($subject_seq);
	742	push @query_seqs,$query_seq if ($query_seq);
	743
	744	$self->_push_new_results($id,$defline,$subject_length,
	745	$query,$query_length,
	746	\@scores,\@expects,\@identities,\@match_lengths,
	747	\@query_starts,\@query_ends,\@subject_starts,\@subject_ends,
	748	\@subject_strands,\@query_strands,\@subject_seqs,
	749	\@query_seqs,\@query_frames,\@subject_frames);
	750	}
	751	# initialize arrays for new sample
	752	@scores = (); @expects = (); @identities = (); @match_lengths = ();
	753	@query_starts = (); @query_ends = (); @subject_strands = ();
	754	@query_strands = (); @subject_starts = (); @subject_ends = ();
	755	$strand = ''; $q_strand = ''; $last_end_point = '';
	756	$q_last_end_point = ''; $subject_seq = ''; $query_seq = '';
	757	@subject_seqs = (); @query_seqs = (); @query_frames = ();
	758	@subject_frames = (); $hit_new_seq = 1;
	759	$id = $1;
	760	$defline = $2;
	761	} elsif (/\s+Length = (\d+)/) {
	762	$hit_new_seq = 0;
	763	$subject_length = $1;
	764	} elsif ($hit_new_seq == 1) {
	765	chomp();
	766	$defline .= $_;
	767	} elsif (/^\sScore[^=]=\s([0-9\.-])[^E]Expect[^=]=\s([0-9\.e-])/) {
	768	# catches the starting point for the previous reverse strand
	769	push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
	770	push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
	771	push @subject_starts,$last_end_point if ($strand eq 'minus');
	772	push @subject_ends,$last_end_point if ($strand eq 'plus');
	773	push @subject_seqs,$subject_seq if ($subject_seq);
	774	push @query_seqs,$query_seq if ($query_seq);
	775	$subject_seq = ''; $query_seq = '';
	776
	777	push @scores,$1;
	778	push @expects,$2;
	779	$hit_first_seq_line = 0;
	780	} elsif (/^\sIdentities[^=]=\s*(\d+)\/(\d+)/) {
	781	my $identity = $1 / $2;
	782	push @identities,$identity;
	783	push @match_lengths,$2;
	784	$hit_first_seq_line = 0;
	785	} elsif (/^\s*Frame = ([-,+]\d)(?: \/ ([-,+]\d))?/) {
	786	push @query_frames, $1;
	787	push @subject_frames, $2 if ($2);
	788	} elsif (/Query:\s+(\d+)\s+(\S+)\s+(\d+)/) {
	789	if (($1 > $3) && $hit_first_seq_line == 0) {
	790	$q_strand = 'minus';
	791	push @query_ends,$1;
	792	} elsif (($1 < $3) && $hit_first_seq_line == 0) {
	793	$q_strand = 'plus';
	794	push @query_starts,$1;
	795	}
	796	push @query_strands,$q_strand if ($hit_first_seq_line == 0);
	797	$q_last_end_point = $3;
	798	$query_seq .= $2;
	799	$hit_first_seq_line++;
	800	} elsif (/^Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)/) {
	801	if (($1 > $3) && $hit_first_seq_line == 1) {
	802	$strand = 'minus';
	803	push @subject_ends,$1;
	804	} elsif (($1 < $3) && $hit_first_seq_line == 1) {
	805	$strand = 'plus';
	806	push @subject_starts,$1;
	807	}
	808	push @subject_strands,$strand if ($hit_first_seq_line == 1);
	809	$last_end_point = $3;
	810	$subject_seq .= $2;
	811	$hit_first_seq_line++;
	812	}
	813	}
	814	# catch the last one.
	815	my $ra_safe_expects = _check_for_unsafe_e(\@expects);
	816	if ($id) {
	817	# catches the starting point for the previous reverse strand
	818	push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
	819	push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
	820	push @subject_starts,$last_end_point if ($strand eq 'minus');
	821	push @subject_ends,$last_end_point if ($strand eq 'plus');
	822	push @subject_seqs,$subject_seq if ($subject_seq);
	823	push @query_seqs,$query_seq if ($query_seq);
	824	$self->_push_new_results($id,$defline,$subject_length,
	825	$query,$query_length,\@scores,$ra_safe_expects,\@identities,
	826	\@match_lengths,\@query_starts,\@query_ends,\@subject_starts,
	827	\@subject_ends,\@subject_strands,\@query_strands,
	828	\@subject_seqs,\@query_seqs,\@query_frames,\@subject_frames);
	829	}
	830	}
	831
	832	sub _push_new_results {
	833	my $self = shift;
	834	my ($id,$defline,$subject_length,$query,$query_length,$ra_scores,
	835	$ra_expects,$ra_identities,$ra_match_lengths,$ra_query_starts,
	836	$ra_query_ends,$ra_subject_starts,$ra_subject_ends,$ra_subject_strands,
	837	$ra_query_strands,$ra_subject_seqs,$ra_query_seqs,$ra_query_frames,
	838	$ra_subject_frames) = @_;
	839
	840	my $rh_hash = {'id' =>$id
	841	, 'defline' =>$defline
	842	, 'subject_length' =>$subject_length
	843	, 'query' =>$query
	844	, 'query_length' =>$query_length
	845	, 'scores' => [ @{$ra_scores} ]
	846	, 'expects' => [ @{$ra_expects} ]
	847	, 'identities' => [ @{$ra_identities} ]
	848	, 'match_lengths' => [ @{$ra_match_lengths} ]
	849	, 'query_starts' => [ @{$ra_query_starts} ]
	850	, 'query_ends' => [ @{$ra_query_ends} ]
	851	, 'subject_starts' => [ @{$ra_subject_starts} ]
	852	, 'subject_ends' => [ @{$ra_subject_ends} ]
	853	, 'subject_strands' => [ @{$ra_subject_strands} ]
	854	, 'query_strands' => [ @{$ra_query_strands} ]
	855	, 'subject_seqs' => [ @{$ra_subject_seqs} ]
	856	, 'query_seqs' => [ @{$ra_query_seqs} ]
	857	, 'query_frames' => []
	858	, 'subject_frames' => []
	859	};
	860	# only true in blastx or tblastx
	861	if ($ra_query_frames && $self->{options}->{p} =~ /blastx/i) {
	862	push @{$rh_hash->{query_frames}}, @{$ra_query_frames};
	863	}
	864	# only true in tblastn
	865	if ($ra_query_frames && $self->{options}->{p} =~ /tblastn/i) {
	866	push @{$rh_hash->{subject_frames}}, @{$ra_query_frames};
	867	}
	868	# only true in tblastx
	869	if ($self->{options}->{p} =~ /tblastx/i) {
	870	push @{$rh_hash->{subject_frames}}, @{$ra_subject_frames};
	871	}
	872	push @{$self->{results}}, $rh_hash;
	873	}
	874	sub _get_args {
	875	my $rh_opts = shift;
	876	my $tmp_dir = _get_temp_directory();
	877	my ($key,$value,$outfile);
	878	my @args = ();
	879	my $tmp_file = "";
	880	while (($key,$value) = each %$rh_opts) {
	881	push @args, "-$key$value";
	882	$outfile = $value if ($key eq "o");
	883	}
	884	unless ($outfile) {
	885	my $base = sprintf("%s/%s-%d-%d-0", $tmp_dir, ,"Blastall", $$, time());
	886	$tmp_file = _generate_temp_file($base);
	887	push @args, "-o$tmp_file";
	888	}
	889	return(\@args,$tmp_file);
	890	}
	891	sub _get_formatdb_args {
	892	my $rh_opts = shift;
	893	my @args = ();
	894	foreach my $k (keys %{$rh_opts}) {
	895	push @args, "-${k}$rh_opts->{$k}";
	896	}
	897	return \@args;
	898	}
	899	sub _get_wu_args {
	900	my $rh_opts = shift;
	901	my ($key,$value);
	902	my $tmp_dir = _get_temp_directory();
	903	my $base_name = sprintf("%s/%s-%d-%d-0", $tmp_dir, ,"Blastall", $$, time());
	904	my $tmp_file = _generate_temp_file($base_name);
	905	my $program = $WU_BLAST{$rh_opts->{p}} \|\| $WU_BLAST{wu_blastn};
	906	my $database = $rh_opts->{d} \|\| 'nr';
	907	my $infile = $rh_opts->{i};
	908	my $outfile = $rh_opts->{o};
	909	my @args = ($program, $database, $infile);
	910	while (($key,$value) = each %${rh_opts}) {
	911	next if ($key eq 'p' \|\| $key eq 'd' \|\| $key eq 'i' \|\| $key eq 'o');
	912	if ($value eq '!' \|\| $value eq '') {
	913	push @args, "-$key";
	914	} else {
	915	push @args, "-$key=$value";
	916	}
	917	}
	918	# $rh_opts->{o} ? push @args, "> $rh_opts->{o}" : push @args, "> $tmp_file";
	919	return(\@args,$tmp_file);
	920	}
	921	sub _get_temp_directory {
	922	return($ENV{TMPDIR}) if ($ENV{TMPDIR});
	923	return($ENV{TEMP}) if ($ENV{TEMP});
	924	return($ENV{TMP}) if ($ENV{TMP});
	925	return('/tmp') if (-d '/tmp' && -w '/tmp');
	926	return('/var/tmp') if (-d '/var/tmp' && -w '/var/tmp');
	927	return('/usr/tmp') if (-d '/usr/tmp' && -w '/usr/tmp');
	928	return('/temp') if (-d '/temp' && -w '/temp');
	929	return('.') if (-w '.');
	930	$_ = 'Cannot find a place to write tempfiles. ';
	931	$_ .= 'Please set the TMPDIR environmental variable. ';
	932	return undef;
	933	}
	934	sub _generate_temp_file {
	935	my $base_name = shift;
	936	my $count = 0;
	937	while ($count < 1000) {
	938	return ($base_name) unless (-e $base_name); # don't clobber existing
	939	$base_name =~ s/-?(\d+)?$/"-" . (1 + $1)/e;
	940	$count++;
	941	}
	942	return undef;
	943	}
	944
	945	1;
	946	__END__
	947
	948	=head1 NAME
	949
	950	NHGRI::Blastall - Perl extension for running and parsing NCBI's BLAST 2.x
	951
	952	=head1 SYNOPSIS
	953
	954	=over 4
	955
	956	=head1 DESCRIPTION
	957
	958	If you have NCBI's BLAST2 or WU-BLAST installed locally and your
	959	environment is already setup you can use Perl's object-oriented
	960	capabilities to run your BLASTs. Also if you have a blastcl3 binary
	961	from the toolkit (or binaries from our FTP site) you can run BLAST over
	962	the network. There are also methods to blast single sequences against
	963	each other using the bl2seq binaries (also in the toolkit and binaries).
	964	You can blast one sequence against a library of sequences using the
	965	blast_one_to_many method. You can format databases with formatdb method.
	966	You can also have NHGRI::Blastall read existing BLAST
	967	reports. If you have a database of repetitive DNA or other DNA you would
	968	like to mask out, you can use the mask method to mask the data against
	969	these databases. You can then use either the filter or result methods
	970	to parse the report and access the various elements of the data.
	971
	972
	973	=item RUNNING NEW BLASTS
	974
	975	use NHGRI::Blastall;
	976	my $b = new NHGRI::Blastall;
	977	# If you are running NCBI's Local BLAST
	978	$b->blastall( p => 'blastn',
	979	d => 'nr',
	980	i => 'infile',
	981	o => 'outfile'
	982	);
	983	# If you are running NCBI's blastcl3 network client
	984	$b->blastcl3( p => 'blastn',
	985	d => 'nr',
	986	i => 'infile',
	987	o => 'outfile'
	988	);
	989	# If you are running WU-BLAST locally
	990	$b->wu_blastall( p => 'blastn',
	991	d => 'nr',
	992	nogap => '!', #use ! for arguments w/o parameter
	993	i => 'infile',
	994	o => 'outfile'
	995	);
	996
	997	See BLASTALL for more info
	998
	999	=item BLASTING 2 SEQUENCES
	1000
	1001	use NHGRI::Blastall;
	1002	my $b = new NHGRI::Blastall;
	1003	$b->bl2seq(i => 'seq1',
	1004	j => 'seq2',
	1005	p => 'tblastx'
	1006	);
	1007
	1008	See BL2SEQ for more info
	1009
	1010	=item BLASTING 1 SEQUENCE AGAINST A FASTA LIBRARY OF SEQUENCES
	1011
	1012	# a library is a FASTA file with multiple FASTA formatted sequences
	1013	use NHGRI::Blastall;
	1014	my $b = new NHGRI::Blastall;
	1015	$b->blast_one_to_many(i => 'seq1',
	1016	d => 'seq2.lib',
	1017	p => 'tblastx',
	1018	);
	1019
	1020	See BLAST_ONE_TO_MANY for more info
	1021
	1022	=item INITIALIZING EXISTING BLAST REPORTS
	1023
	1024	use NHGRI::Blastall;
	1025	my $b = new NHGRI::Blastall;
	1026	$b->read_report('/path/to/report');
	1027
	1028	=item MASKING SEQUENCES
	1029
	1030	use NHGRI::Blastall;
	1031	my $b = new NHGRI::Blastall;
	1032	$masked_seq = $b->mask( type => 'wu_blastall',
	1033	p => 'blastn',
	1034	d => 'alu',
	1035	i => 'infile'
	1036	);
	1037
	1038	See MASKING for more info
	1039
	1040	=item CREATING BLAST INDEXES
	1041
	1042	use NHGRI::Blastall;
	1043	my $b = new NHGRI::Blastall;
	1044	$b->formatdb( i => 'est',
	1045	p => 'F',
	1046	o => 'T',
	1047	);
	1048
	1049	See FORMATDB for more info
	1050
	1051	=item PRINTING REPORTS
	1052
	1053	$b->print_report();
	1054	# this method only opens the report and prints. It does not print
	1055	# summary reports
	1056
	1057	=item FILTERING BLAST RESULTS
	1058
	1059	@hits = $b->filter( scores => '38.2',
	1060	identities => '.98'
	1061	);
	1062
	1063	# returns an array of hash references.
	1064	See HASHREF for more info on manipulating the results.
	1065	See FILTERING for more info on using the filter method
	1066
	1067	=item GETTING AT ELEMENTS
	1068
	1069	@ids = $b->result('id');
	1070	@scores = $b->result('scores',$ids[0]); # second param must be an id
	1071
	1072	See RESULT for more info on using the result method
	1073	See ELEMENTS for element names
	1074
	1075	=item GETTING AT ALL THE DATA
	1076
	1077	@results = $b->result(); # returns an array of hashes
	1078
	1079	See HASHREF for information on the array of hashes that is returned.
	1080	See DUMP RESULTS to see how to work with the array of hashes
	1081
	1082	=item ADJUSTING THE DEFLINE REGEX
	1083
	1084	$b = new NHGRI::Blastall (-DB_ID_REGEX => '[^ ]+');
	1085
	1086	See DB_ID_REGEX for more info
	1087
	1088	=back
	1089
	1090	=head1 BLASTALL
	1091
	1092	This method provides a simple object oriented frontend to BLAST.
	1093	This module works with either NCBI's blastall binary distributed with
	1094	BLAST 2.x, WU-BLAST or over the web through NCBI's Web Site.
	1095	The blastall function accepts as a parameter an anonymous hash with keys
	1096	that are the command line options (See BLASTALL OPTIONS) and values
	1097	which are the corresponding values to those options. You may want to
	1098	set the BLASTALL variable in Blastall.pm to the full path of your
	1099	`blastall' binary, especially if you will be running scripts as cron
	1100	jobs or if blastall is not in the system path.
	1101
	1102	=head1 BLASTALL OPTIONS
	1103
	1104	For wu_blastall you need to use NCBI type switches for the following
	1105	[CB<-i>] for infile
	1106	[CB<-o>] for outfile
	1107	[CB<-p>] for program
	1108	[CB<-d>] for database
	1109	the rest of the parameters MUST be the parameters available through WU-BLAST
	1110	(e.g. -sump, -nogap -compat1.4, etc.)
	1111	use a `!' to specify that an argument has no parameters. See the example
	1112	at the top of the manpage.
	1113
	1114	These are the options that NCBI's blastall and binary accepts and these
	1115	are the same options that are accepted by the blastall and blastcl3 methods.
	1116	NOTE: You must set the proper environmental variables for the blastall
	1117	method to work (BLASTDB,BLASTMAT).
	1118
	1119	=over 4
	1120
	1121	=item
	1122
	1123	B<p> => Program Name
	1124
	1125	=item
	1126
	1127	B<d> => Database default=nr
	1128
	1129	=item
	1130
	1131	B<i> => QueryFile
	1132
	1133	=item
	1134
	1135	B<e> => Expectation vaule (E) default=10.0
	1136
	1137	=item
	1138
	1139	B<m> => alignment view default=0
	1140	0 = pairwise,
	1141	1 = master-slave showing identities,
	1142	2 = master-slave no identities,
	1143	3 = flat master-slave, show identities,
	1144	4 = flat master-slave, no identities,
	1145	5 = master-slave no identities and blunt ends,
	1146	6 = flat master-slave, no identities and blunt ends
	1147
	1148	=item
	1149
	1150	B<o> => BLAST report Output File default=stdout
	1151
	1152	=item
	1153
	1154	B<F> => Filter query sequence default=T
	1155	(DUST with blastn, SEG with others)
	1156
	1157	=item
	1158
	1159	B<G> => Cost to open a gap default=0
	1160	(zero invokes default behavior)
	1161
	1162	=item
	1163
	1164	B<E> => Cost to extend a gap default=0
	1165	(zero invokes default behavior)
	1166
	1167	=item
	1168
	1169	B<X> => X dropoff value for gapped alignment (in bits) default=0
	1170	(zero invokes default behavior)
	1171
	1172	=item
	1173
	1174	B<I> => Show GI's in deflines default=F
	1175
	1176	=item
	1177
	1178	B<q> => Penalty for a nucleotide mismatch (blastn only) default=-3
	1179
	1180	=item
	1181
	1182	B<r> => Reward for a nucleotide match (blastn only) default=1
	1183
	1184	=item
	1185
	1186	B<v> => Number of one-line descriptions (V) default=500
	1187
	1188	=item
	1189
	1190	B<b> => Number of alignments to show (B) default=250
	1191
	1192	=item
	1193
	1194	B<f> => Threshold for extending hits, default if zero default=0
	1195
	1196	=item
	1197
	1198	B<g> => Perfom gapped alignment (NA with tblastx) default=T
	1199
	1200	=item
	1201
	1202	B<Q> => Query Genetic code to use default=1
	1203
	1204	=item
	1205
	1206	B<D> => DB Genetic code (for tblast[nx] only) default=1
	1207
	1208	=item
	1209
	1210	B<a> => Number of processors to use default=1
	1211
	1212	=item
	1213
	1214	B<O> => SeqAlign file Optional
	1215
	1216	=item
	1217
	1218	B<J> => Believe the query defline default=F
	1219
	1220	=item
	1221
	1222	B<M> => Matrix default=BLOSUM62
	1223
	1224	=item
	1225
	1226	B<W> => Word size, default if zero default=0
	1227
	1228	=item
	1229
	1230	B<z> => Effective length of the database default=0
	1231	(use zero for the real size)
	1232
	1233	=item
	1234
	1235	B<K> => Number of best hits from a region to keep default=100
	1236
	1237	=item
	1238
	1239	B<L> => Length of region used to judge hits default=20
	1240
	1241	=item
	1242
	1243	B<Y> => Effective length of the search space default=0
	1244	(use zero for the real size)
	1245
	1246	=item
	1247
	1248	B<S> => Query strands to search against database default=3
	1249	(for blast[nx], and tblastx).
	1250	3 is both, 1 is top, 2 is bottom
	1251
	1252	=item
	1253
	1254	B<T> => Produce HTML output [T/F] default=F
	1255
	1256	=item
	1257
	1258	B<l> => Restrict search of database to list of GI's [String]
	1259
	1260	=back
	1261
	1262	NOTE: If you do not supply an `o' option (outfile),
	1263	the following environment variables are checked in order:
	1264	`TMPDIR', `TEMP', and `TMP'.
	1265	If one of them is set, outfiles are created relative to the
	1266	directory it specifies. If none of them are set, the first
	1267	possible one of the following directories is used:
	1268	/var/tmp , /usr/tmp , /temp , /tmp ,
	1269	This file is deleted after the NHGRI::Blastall object is destroyed.
	1270	It is recommended that you create a tmp directory in your home
	1271	directory and set one of the above environmental vars to point
	1272	to this directory and then set the permissions on this directory
	1273	to 0700. Writing to a "public" tmp directory can have
	1274	security ramifications.
	1275
	1276	=head1 BL2SEQ
	1277
	1278	This method uses the bl2seq binary (distributed with BLAST executables
	1279	and source) to BLAST one sequence against another sequence.
	1280	Like the blastall method the bl2seq method accepts the same options
	1281	that the bl2seq binary accepts. Run bl2seq without options from the
	1282	command line to get a full list of options. An important note about
	1283	the options, when running blastx 1st sequence should be nucleotide;
	1284	when running tblastn 2nd sequence should be nucleotide.
	1285
	1286	use NHGRI::Blastall;
	1287	my $b = new NHGRI::Blastall;
	1288	$b->bl2seq(i => 'seq1.nt',
	1289	j => 'seq2.aa',
	1290	p => 'blastx'
	1291	);
	1292
	1293
	1294	=head1 BLAST_ONE_TO_MANY
	1295
	1296	This method allows for blasting one sequence against a FASTA library
	1297	of sequences. Behind the scenes, BLAST indexes are created (in the
	1298	same directory as the FASTA library) using the provided FASTA library
	1299	and the one sequence is used to search against this database.
	1300	If the program completes successfully, the databases are removed.
	1301	To compare two sequences, use the bl2seq method which is faster and
	1302	less messy (no tmp indexes). This method accepts the same options
	1303	as the blastall binary with the d option corresponding to the
	1304	FASTA library.
	1305
	1306	use NHGRI::Blastall;
	1307	my $b = new NHGRI::Blastall;
	1308	$b->blast_one_to_many(i => 'seq.aa',
	1309	d => 'seq.nt.lib',
	1310	e => '0.001',
	1311	p => 'tblastn',
	1312	);
	1313
	1314	=head1 MASKING
	1315
	1316	Screens DNA sequences in fasta format against the database specified
	1317	in the blastall 'd' option. The mask method accepts the same parameters
	1318	as the blastall method. Any matches to the masking database will be
	1319	substituted with "N"s. The mask method returns the masked sequence.
	1320	Performs similar function as xblast, an old NCBI program written in C.
	1321
	1322	Set the type parameter to wu_blastall, blastcl3 or blastall
	1323	depending on your configuration.
	1324
	1325	$masked_seq = $b->mask( type => 'blastcl3', # defaults to blastall
	1326	p => 'blastn',
	1327	d => 'alu',
	1328	i => 'infile'
	1329	);
	1330
	1331	To get the mask coordinates back call the mask method in an array
	1332	context.
	1333
	1334	@mask = $b->mask(p => 'blastn',
	1335	d => 'alu',
	1336	i => 'infile'
	1337	);
	1338	$masked_seq = $mask[0]; # same as above masked seq
	1339	$ra_masked_coords = $mask[1]; # reference to array of mask coordinates
	1340
	1341	=head1 FORMATDB
	1342
	1343	This method creates BLAST indexes using the formatdb binary which
	1344	is distributed with BLAST. It accepts the same parameters as formatdb.
	1345	The remove_formatdb_indexes method will remove databases created
	1346	using the formatdb method (if called by the same object).
	1347	formatdb leaves a file called formatdb.log by default in the current
	1348	working directory (if it has permission). To change this behavior,
	1349	use the l option to direct the sequence to /dev/null or somewhere else.
	1350
	1351	use NHGRI::Blastall;
	1352	my $b = new NHGRI::Blastall;
	1353	$b->formatdb( i => 'swissprot',
	1354	p => 'T',
	1355	l => '/dev/null',
	1356	o => 'T',
	1357	);
	1358
	1359
	1360	=head1 DB_ID_REGEX
	1361
	1362	By default Blastall.pm expects FASTA deflines of BLAST databases to be
	1363	formatted like Genbank database (gi\|GINUMBER\|DATABASE\|ACCESSION\|SUMMARY).
	1364	The default regular expression is [^\\|]+(?:\\|[^\\|,\s]*){1,10}
	1365	When using non-genbankformatted deflines, it may become necessary to
	1366	adjust the regular expression that identifies the unique identifier
	1367	in a defline. This can be done with the -DB_ID_REGEX parameter to the
	1368	new method. For example
	1369
	1370	$b = new NHGRI::Blastall( -DB_ID_REGEX => '[^ ]+' );
	1371
	1372	=head1 FILTERING
	1373
	1374	The filter method accepts an anonymous hash in which the keys are
	1375	elements of the blast report and the values are limits that
	1376	are put on the result set.
	1377
	1378	The following are the Filter elements and their default operations.
	1379
	1380	id => regular expression match
	1381	defline => regular expression match
	1382	subject_length => greater than
	1383	scores => greater than
	1384	expects => less than
	1385	identities => greater than
	1386	match_length => greater than
	1387	subject_strand => equals
	1388	query_frames => equals
	1389	subject_frames => equals
	1390
	1391	so if you would like to limit your results to entries that have scores
	1392	greater than 38.2 and identities greater than 98% you would say...
	1393
	1394	@hits = $b->filter( scores => '38.2',
	1395	identities => '.98'
	1396	);
	1397
	1398	you can also override the defaults. if you would like only scores
	1399	that are less than 38.2 you could say...
	1400
	1401	@hits = $b->filter( scores => '<38.2' );
	1402
	1403	or if you wanted only identities that were equal to 1 and you didn't
	1404	care about the hits array you could say...
	1405
	1406	$b->filter( identities => '=1' );
	1407
	1408	Regular expression matches are case insensitive. If you wanted
	1409	only records with the word "human" in the defline you could say...
	1410
	1411	@hits = $b->filter( defline => 'HuMaN' );
	1412
	1413	After you run the filter method on an object the object only contains
	1414	those results which passed the filter. This will effect additional
	1415	calls to the filter method as well as calls to other methods
	1416	(e.g. result). To reset the NHGRI::Blastall object you can use
	1417	the unfilter method.
	1418
	1419	$b->unfilter();
	1420
	1421	See DUMP RESULTS for info on how to manipulate the array of hash refs.
	1422
	1423
	1424	=head1 RESULT
	1425
	1426
	1427	The result method has 3 possible invocations. The first invocation
	1428	is when it is called without parameters.
	1429
	1430	@results = $b->result();
	1431
	1432	This invocation returns an array of hash references.
	1433	See HASHREF for further explanation of this structure.
	1434
	1435	To get a list of all the ids do...
	1436
	1437	@ids = $b->result('id');
	1438
	1439	These ids can be used to get at specific elements. If 2 parameters
	1440	are present and the first one is an element (See ELEMENTS for a list
	1441	of ELEMENTS) and the second one is an id then the routine will
	1442	return a list of elements corresponding to the id.
	1443
	1444	@scores = $b->result('scores',$ids[0]); # second param must be an id
	1445
	1446	If more than 2 elements are passed the function will return undef.
	1447
	1448	=head1 ACCESSOR METHODS
	1449
	1450	=over 4
	1451
	1452	=item B<get_report>
	1453
	1454	returns the filename of the BLAST report.
	1455
	1456	=item B<get_database_description>
	1457
	1458	returns description given to the database during formatting of db.
	1459	e.g. All non-redundant GenBank CDStranslations+PDB+SwissProt+PIR
	1460
	1461	=item B<get_database_sequence_count>
	1462
	1463	returns the number of sequences in the database.
	1464
	1465	=item B<get_database_letter_count>
	1466
	1467	returns the number of total letters in the database.
	1468
	1469	=item B<get_blast_program>
	1470
	1471	returns the BLAST program name that appears at the top of the report.
	1472	either BLASTN, BLASTP, BLASTX, TBLASTN or TBLASTX
	1473
	1474	=item B<get_blast_version>
	1475
	1476	returns the version of the BLAST program that was used.
	1477
	1478	=back
	1479
	1480	=head1 ELEMENTS
	1481
	1482	=over 4
	1483
	1484	=item B<id>
	1485
	1486	an example of an id is `>gb\|U19386\|MMU19386' the initial `>'
	1487	is just a flag. The next characters up until the first pipe
	1488	is the database the subject was taken from. The next characters
	1489	up to the next pipe is the Genbank accession number. The last
	1490	characters are the locus. This element is used as a unique
	1491	identifier by the NHGRI::Blastall module.
	1492	(SCALAR)
	1493
	1494	=item B<defline>
	1495
	1496	The definition line taken from the subject
	1497	(SCALAR)
	1498
	1499	=item B<subject_length>
	1500
	1501	This is the length of the full subject, not the
	1502	length of the match.
	1503	(SCALAR)
	1504
	1505	=item B<scores>
	1506
	1507	This is score (in bits) of the match.
	1508	(ARRAY)
	1509
	1510	=item B<expects>
	1511
	1512	This is the statistical significance (`E value') for the match.
	1513	(ARRAY)
	1514
	1515	=item B<identities>
	1516
	1517	This is the number of identities divided by the match
	1518	length in decimal format. (Listed as a fraction and a percentage
	1519	in a BLAST report.)
	1520	(ARRAY)
	1521
	1522	=item B<match_lengths>
	1523
	1524	this is the number of base pairs that match up.
	1525	(ARRAY)
	1526
	1527	=item B<query_starts>
	1528
	1529	This is the number of the first base which matched
	1530	with the subject.
	1531	(ARRAY)
	1532
	1533	=item B<query_ends>
	1534
	1535	This is the number of the last base which matched
	1536	with the subject.
	1537	(ARRAY)
	1538
	1539	=item B<subject_starts>
	1540
	1541	This is the number of the first base which matched
	1542	with the query.
	1543	(ARRAY)
	1544
	1545	=item B<subject_ends>
	1546
	1547	This is the number of the last base which matched
	1548	with the query.
	1549	(ARRAY)
	1550
	1551	=item B<subject_strands>
	1552
	1553	This is either plus or minus depending on the orientation
	1554	of the subject sequence in the match.
	1555	(ARRAY)
	1556
	1557	=item B<query_strands>
	1558
	1559	This is either plus or minus depending on the orientation
	1560	of the query sequence in the match.
	1561	(ARRAY)
	1562
	1563	=item B<query_frames>
	1564
	1565	If you are running a blastx or tblastx search in which the
	1566	query_sequence is translated this is the frame the query
	1567	sequence matched.
	1568	(ARRAY)
	1569
	1570	=item B<subject_frames>
	1571
	1572	If you are running a tblastn or tblastx search in which the
	1573	subject sequence is translated, this is the frame where the
	1574	subject sequence matched.
	1575	(ARRAY)
	1576
	1577	=back
	1578
	1579	=head1 HASHREF
	1580
	1581	Each hash ref contains an id, defline and subject Length. Because
	1582	there can be multiple scores, expect values, Identities, match_lengths,
	1583	query_starts, query_strands and subject_starts, these are stored
	1584	as array references. The following is an array containing two hash
	1585	refs.
	1586
	1587	@hits = (
	1588	{'id' => '>gb\|U79716\|HSU79716',
	1589	'defline' => 'Human reelin (RELN) mRNA, complete cds',
	1590	'subject_length' => '11580',
	1591	'scores' => [ 684, 123 ],
	1592	'expects' => [ 0.0, 3e-26 ],
	1593	'identities' => [ .99430199, .89256198 ],
	1594	'match_lengths' => [ 351, 121 ],
	1595	'query_starts' => [ 3, 404 ],
	1596	'query_ends' => [ 303, 704 ],
	1597	'subject_starts' => [ 5858, 6259 ],
	1598	'subject_ends' => [ 6158, 6559 ],
	1599	'subject_strands' => [ 'plus', 'minus' ],
	1600	'query_strands' => [ 'plus', 'plus' ],
	1601	'query_frames' => [ '+1', '-3' ],
	1602	'subject_frames' => [ '+2', '-1' ],
	1603	},
	1604	{'id' => '>gb\|U24703\|MMU24703',
	1605	'defline' => 'Mus musculus reelin mRNA, complete cds',
	1606	'subject_length' => '11673',
	1607	'scores' => [ 319, 38.2 ],
	1608	'expects' => [ 2e-85, 1.2 ],
	1609	'identities' => [ .86455331, 1 ],
	1610	'match_lengths' => [ 347, 19 ],
	1611	'query_starts' => [ 3, 493 ],
	1612	'query_ends' => [ 303, 793 ],
	1613	'subject_starts' => [ 5968, 6457 ]
	1614	'subject_ends' => [ 6268, 6757 ],
	1615	'subject_strands' => [ 'plus', 'minus' ],
	1616	'query_strands' => [ 'plus', 'plus' ],
	1617	'query_frames' => [ '+3', '-3' ],
	1618	'subject_frames' => [ '+1', '-2' ],
	1619	}
	1620	);
	1621
	1622	See ELEMENTS for explanation of each element.
	1623	See DUMP RESULTS and/or the perlref(1) manpage for clues on working
	1624	with this structure.
	1625
	1626	=head1 DUMP RESULTS
	1627
	1628	When calling the result function or with no parameters, or calling the
	1629	filter function, an array of references to hashes is returned.
	1630	Each elment of the array is a reference to a hash containing 1 record.
	1631	See HASHREF for details on this structure. The following
	1632	routine will go through each element of the array of hashes and
	1633	then print out the element and it's corresponding value or values.
	1634	See perlref(1) for more info on references.
	1635
	1636	sub dump_results {
	1637	foreach $rh_r (@results) {
	1638	while (($key,$value) = each %$rh_r) {
	1639	if (ref($value) eq "ARRAY") {
	1640	print "$key: ";
	1641	foreach $v (@$value) {
	1642	print "$v ";
	1643	}
	1644	print "\n";
	1645	} else {
	1646	print "$key: $value\n";
	1647	}
	1648	}
	1649	}
	1650	}
	1651
	1652	=head1 AUTHOR
	1653
	1654	=over 4
	1655
	1656	=item
	1657
	1658	Joseph Ryan (jfryan@nhgri.nih.gov)
	1659
	1660	=back
	1661
	1662	=head1 CONTACT ADDRESS
	1663
	1664	If you have problems, questions, comments send to webblaster@nhgri.nih.gov
	1665
	1666	=head1 COPYRIGHT INFORMATION
	1667
	1668	This software/database is "United States Government Work" under the
	1669	terms of the United States Copyright Act. It was written as part of
	1670	the authors' official duties for the United States Government and thus
	1671	cannot be copyrighted. This software/database is freely available to
	1672	the public for use without a copyright notice. Restrictions cannot
	1673	be placed on its present or future use.
	1674
	1675	Although all reasonable efforts have been taken to ensure the
	1676	accuracy and reliability of the software and data, the National
	1677	Human Genome Research Institute (NHGRI) and the U.S. Government
	1678	does not and cannot warrant the performance or results that may be
	1679	obtained by using this software or data. NHGRI and the U.S.
	1680	Government disclaims all warranties as to performance,
	1681	merchantability or fitness for any particular purpose.
	1682
	1683	In any work or product derived from this material, proper
	1684	attribution of the authors as the source of the software or
	1685	data should be made, using http://genome.nhgri.nih.gov/blastall
	1686	as the citation.
	1687
	1688	=head1 ENVIRONMENT VARIABLES
	1689
	1690	=over 4
	1691
	1692	=item B<BLASTDB>
	1693
	1694	location of BLAST formated databases
	1695
	1696	=item B<BLASTMAT>
	1697
	1698	location of BLAST matrices
	1699
	1700	=item B<TMPDIR> B<TEMP> B<TMP>
	1701
	1702	If the `o' option is not passed to the blastall method than NHGRI::Blastall
	1703	looks for one of these vars (in order) to store the BLAST report. This
	1704	report is destroyed after the NHGRI::Blastall.pm object is destroyed.
	1705
	1706	=back
	1707
	1708	=head1 SEE ALSO
	1709
	1710	L<perl(1)> L<perlref(1)>
	1711
	1712	http://www.ncbi.nlm.nih.gov/BLAST/newblast.html
	1713
	1714	ftp://ncbi.nlm.nih.gov/blast/db/README
	1715
	1716	http://www.ncbi.nlm.nih.gov/BLAST/tutorial/Altschul-1.html
	1717
	1718	=cut

+126

-0

Changes less more

	0	VERSION 0.66
	1	---------------------------------------------------------------------
	2	fixed a section of code in masking logic which would cause a warning
	3	if user got no hits to their masking database.
	4
	5	VERSION 0.65
	6	---------------------------------------------------------------------
	7	added the ability to get the coordinates back from a mask.
	8	usually mask is called in a scalar context, but if you ask
	9	in an array context you will receive a masked sequence and
	10	a reference to an array of masked sequences.
	11
	12	VERSION 0.64 (never released)
	13	---------------------------------------------------------------------
	14	Version .63 fix fixed a masking problem but introduced a bug in blasting
	15	which would delete all BLAST reports after BLASTing. (version .63 was
	16	never released). This version re-fixes the masking problem without
	17	breaking anything else.
	18
	19	VERSION 0.63 (never released)
	20	---------------------------------------------------------------------
	21	fixed problem with masking in which masking would fail if you provided -o opt
	22	fixed typo in pod documentation.
	23
	24	VERSION 0.61
	25	---------------------------------------------------------------------
	26	fixed pod which caused make to gripe
	27	fixed reference in README
	28
	29	VERSION 0.60
	30	---------------------------------------------------------------------
	31	added bl2seq method which BLASTs 2 sequences against each other
	32	added blast_one_to_many method which BLASTs 1 sequence against
	33	a library of sequences.
	34	added formatdb method which creates BLASTable databases
	35	added remove_formatdb_indexes method which removes the BLASTable
	36	databases made by the formatdb method.
	37	cleaned up some of the code
	38	fixed a couple methods so they would work with other methods besides blastall
	39	updated documentation
	40
	41	VERSION 0.56
	42	---------------------------------------------------------------------
	43	added get_report method which returns the BLAST report file.
	44
	45	VERSION 0.55
	46	---------------------------------------------------------------------
	47	fixes a bug which would turn some 0.0 expect values to 1000
	48	if they did not appear in the summaries. This was to correct
	49	an old bug in NCBI BLAST but seems to not exist any longer.
	50
	51	VERSION 0.54
	52	---------------------------------------------------------------------
	53	added the blastcl3 method which uses the blastcl3 binary
	54	to make network BLASTs. This became necessary when NCBI
	55	changed their web interface (QBLAST) and broke my LWP
	56	routines.
	57
	58	VERSION 0.53
	59	---------------------------------------------------------------------
	60	fixed bug in wu-blast option processing.
	61
	62	VERSION 0.52
	63	---------------------------------------------------------------------
	64	Added check to make sure temporary directories are writable.
	65
	66	VERSION 0.51
	67	---------------------------------------------------------------------
	68	Added query_ends and subject_ends to the data parsed from the BLAST report
	69	instead of relying on query_starts and match_lengths to get the end.
	70
	71	VERSION 0.50
	72	---------------------------------------------------------------------
	73	fixed a number of bugs. Added the DB_ID_REGEX
	74	added some accessor methods to get the database description,
	75	number of sequences in the database, number of letters in the database,
	76	BLAST program and program version.
	77
	78	VERSION 0.44
	79	---------------------------------------------------------------------
	80
	81	Blastall now parses and stores "Frames" from blastx, tblastn and tblastx runs.
	82	Updated documentation.
	83
	84	VERSION 0.43
	85	---------------------------------------------------------------------
	86
	87	fixed some misleading documentation
	88
	89	VERSION 0.42
	90	----------------------------------------------------------------------
	91
	92	methods that required a hash_ref (wu-blastall, blastall, net-blastall and mask)
	93	will now accept a hash. so you can either say...
	94
	95	blastall({'p'=>'blastn', 'd'=>'nr', 'i'=>'infile'}); #with ref
	96	blastall('p'=>'blastn', 'd'=>'nr', 'i'=>'infile'); #without ref
	97
	98	Fixed the test suite so it tests for LWP modules before trying to run
	99	net-blastall. If it doesn't find LWP modules it looks for $BLASTDB
	100	environmental variable. If your BLASTDB environmental variable =~ /wu/
	101	than it will run wu-blastall. If the BLASTDB env var does exist but
	102	does not have the 'wu' pattern it tries to run NCBI's BLAST via the
	103	blastall method. If all else fails it gives you a warning and fails
	104	the test.
	105
	106
	107	VERSION 0.41
	108	----------------------------------------------------------------------
	109	Filter method used to remove sub entries but now does not.
	110	Now if query sequence has one significant hit to a subject sequence
	111	all hits to that sequence are shown.
	112	There is a bug(feature?).
	113	If one of the hits passes on Identity but not E-value
	114	and another hit does the opposite all hits are reported.
	115	Haven't decided what to do about this.
	116
	117	NEW IN VERSION 0.40
	118	----------------------------------------------------------------------
	119	* support for WU BLAST
	120	* support for WWW BLAST through NCBI
	121	HOWEVER: NCBI has recently changed there web protocol to QBLAST and this
	122	may present future problems.
	123	* added sample scripts in scripts directory for WU, WWW and NCBI BLASTs
	124	* documentation may be slightly behind. Sorry. Working on this...
	125

+74

-0

INSTALL less more

	0	To install this module, cd to the directory that contains this INSTALL
	1	file and type the following:
	2
	3	perl Makefile.PL
	4	make
	5	make test
	6	make install
	7
	8	----------------------------------------------------------------------------
	9	NHGRI::Blastall.pm expects that you have NCBI's blastall binary,
	10	wu-blast binaries, or NCBI's blastcl3 binary installed.
	11	If you do not have it you can download the NCBI binary
	12	for several platforms from ftp://ftp.ncbi.nlm.nih.gov/blast/executables/
	13	If you do not have the blastcl3 binary you can either compile the
	14	NCBI toolkit (ftp://ftp.ncbi.nlm.nih.gov/toolbox/) or download the Entrez
	15	binaries from (ftp://ncbi.nlm.nih.gov/entrez/CURRENT/).
	16	WU-BLAST is available from (http://blast.wustl.edu/).
	17
	18	----------------------------------------------------------------------------
	19	Certain methods in NHGRI::Blastall expect to find binaries in your path.
	20	METHOD EXPECTED BINARIES
	21	---------- ---------------------
	22	blastall() blastall
	23	blastcl3() blastcl3
	24	wu_blastall wu-blastn wu-blastp, wu-blastx, wu-tblastn, wu-tblastx
	25
	26
	27	----------------------------------------------------------------------------
	28	If you are using the blastall or wu_blastall methods
	29	you need to make sure the BLASTDB environmental variable is set.
	30	If you stored your BLAST formatted databases in /usr/local/ncbi/blast/db
	31	the command under bash, sh, or ksh would be...
	32
	33	BLASTDB=/usr/local/ncbi/blast/db;
	34	export BLASTDB;
	35
	36	under csh or tcsh it would be
	37
	38	setenv BLASTDB /usr/local/ncbi/blast/db
	39
	40	Also make sure you have a .ncbirc file in your home directory or in
	41	$NCBI/.ncbirc.
	42
	43	---------------------------------------------------------------------------
	44	If you are having problems using the blastall method you may want to test
	45	BLAST from the command line.
	46
	47	cd NHGRI-Blastall-0.xx
	48	blastall -d vector -p blastn -i t/fasta.seq
	49
	50	if this fails than the problem is probably one of the following
	51	a. you don't have the vector database installed
	52	b. you don't have the blastall binary in your path
	53	c. you don't have NCBI's BLAST2.xx installed on your machine
	54	d. the user that you are running the test under does not have
	55	the correct environment settings (see above)
	56
	57	------------------
	58
	59	If you have trouble installing NHGRI::Blastall.pm because you have
	60	insufficient access privileges to add to the perl library directory,
	61	you can still use NHGRI::Blastall.pm. You would want to do something
	62	like:
	63
	64	perl Makefile.PL LIB=~/lib
	65
	66	See perldoc ExtUtils::MakeMaker for more details on this.
	67
	68	If you have problems send mail to: webblaster@nhgri.nih.gov
	69
	70	Read the POD docs in Blastall.pm
	71
	72	try perldoc NHGRI::Blastall
	73

+16

-0

MANIFEST less more

	0	Blastall.pm
	1	Changes
	2	MANIFEST
	3	INSTALL
	4	Makefile.PL
	5	README
	6	t/blast.report
	7	t/CAB96192.aa
	8	t/basic.t
	9	scripts/wu/test.nt
	10	scripts/wu/basic.plx
	11	scripts/net/test.nt
	12	scripts/net/basic.plx
	13	scripts/ncbi/test.nt
	14	scripts/ncbi/basic.plx
	15	scripts/id_regex_test.pl

+10

-0

Makefile.PL less more

	0	use ExtUtils::MakeMaker;
	1	# See lib/ExtUtils/MakeMaker.pm for details of how to influence
	2	# the contents of the Makefile that is written.
	3	WriteMakefile(
	4	'NAME' => 'NHGRI::Blastall',
	5	'VERSION_FROM' => 'Blastall.pm', # finds $VERSION
	6	'dist' => {
	7	COMPRESS=> 'gzip -9f', SUFFIX=>'gz',
	8	}
	9	);

+80

-0

README less more

	0	Send bug reports to webblaster@nhgri.nih.gov
	1
	2	See Changes file for revision history.
	3
	4	---------------------------------------------------------------------------
	5	Warning: The parsing methods have been tested against GenBANK datasets.
	6	If you are using private databases that have non-standard
	7	deflines, you will want to adjust the defline id regex.
	8	See DB_ID_REGEX section of the pod documentation
	9	There is also a testing script to help develop a regular expression
	10	that fits your databases. Look in the scripts subdirectory of
	11	the distribution.
	12	---------------------------------------------------------------------------
	13
	14	If you have NCBI's BLAST2 or WU-BLAST installed locally and your
	15	environment is already setup you can use Perl's object-oriented
	16	capabilities to run your BLASTs. Also if you have a blastcl3 binary
	17	from the toolkit (or binaries from our FTP site) you can run BLAST over
	18	the network. There are also methods to blast single sequences against
	19	each other using the bl2seq binaries (also in the toolkit and binaries).
	20	You can blast one sequence against a library of sequences using the
	21	blast_one_to_many method. You can format databases with formatdb method.
	22	You can also have NHGRI::Blastall read existing BLAST
	23	reports. If you have a database of repetitive DNA or other DNA you would
	24	like to mask out, you can use the mask method to mask the data against
	25	these databases. You can then use either the filter or result methods
	26	to parse the report and access the various elements of the data.
	27
	28	If you need help installing NCBI's local blast check...
	29
	30	http://genome.nhgri.nih.gov/blastall/blast_install/
	31
	32	---------------------------------------------------------------------------
	33
	34	IT IS HIGHLY RECOMMENDED THAT YOU READ THROUGH THE POD DOCUMENTATION
	35	embedded in the module to learn how to use NHGRI::Blastall.
	36	It won't take you that long. After Installation...
	37
	38	perldoc NHGRI::Blastall
	39
	40	the on-line manual page is available at
	41
	42	http://genome.nhgri.nih.gov/blastall/Blastall.html
	43
	44	---------------------------------------------------------------------------
	45
	46	If you have problems, questions, comments send to webblaster@nhgri.nih.gov
	47
	48	---------------------------------------------------------------------------
	49
	50	PUBLIC DOMAIN NOTICE
	51
	52	This software/database is ``United States Government Work'' under the
	53	terms of the United States Copyright Act. It was written as part of
	54	the authors' official duties for the United States Government and thus
	55	cannot be copyrighted. This software/database is freely available to
	56	the public for use without a copyright notice. Restrictions cannot
	57	be placed on its present or future use.
	58
	59	Although all reasonable efforts have been taken to ensure the accuracy
	60	and reliability of the software and data, the National Human Genome
	61	Research Institute (NHGRI) and the U.S. Government does not and cannot
	62	warrant the performance or results that may be obtained by using this
	63	software or data. NHGRI and the U.S. Government disclaims all
	64	warranties as to performance, merchantability or fitness for any
	65	particular purpose.
	66
	67	In any work or product derived from this material, proper attribution
	68	of the authors as the source of the software or data should be made,
	69	using http://genome.nhgri.nih.gov/webblast as the citation.
	70
	71	---------------------------------------------------------------------------
	72
	73	Special thanks to Peter Chines for reporting a bunch of bugs and suggesting
	74	some excellent features.
	75
	76	Thanks to David Lapointe from UMass Medical School for his suggestion
	77	to use blastcl3 for network blasting.
	78
	79

+46

-0

scripts/id_regex_test.pl less more

	0	# this is a little script to help come up with a -DB_ID_REGEX
	1	# if you are using non-Genbank formatted databases than odds are
	2	# this regex that matches GENBANK deflines,
	3	#
	4	# [^\\|]+(?:\\|[^\\|,\s]*){1,10}
	5	#
	6	# that matches GENBANK formatted deflines
	7	#
	8	# sp\|Q54152\|XASA_SHIFL AMINO ACID ANTIPORTER...
	9	#
	10	# is not going to work for your database. You may want to use something
	11	# a bit more generic like,
	12	#
	13	# [^ ]+
	14	#
	15	# which will match everything before the first space
	16	# which would match H_DJ0592P03.Contig29 in this WashU defline
	17	#
	18	# H_DJ0592P03.Contig29 : preliminary Length:47736
	19	#
	20	# This script will let you test your regex.
	21	# THE KEY to picking a good regex is to make sure your regex
	22	# catches something which is UNIQUE to each sequence in the database.
	23	# set $REGEX to the value of the regex you would like to test and
	24	# set $BLAST_REPORT to a BLAST report against your formatted databases
	25	# After you find a regex you will need to add -DB_ID_REGEX => YOUR_REGEX
	26	# to your new declaration.
	27	#
	28	# $b = new NHGRI::Blastall(-DB_ID_REGEX => '[^ ]+');
	29	#
	30
	31	$REGEX = '[^ ]+';
	32	$BLAST_REPORT = '../t/blast.out';
	33
	34	use NHGRI::Blastall;
	35
	36	$b = new NHGRI::Blastall( -DB_ID_REGEX => $REGEX );
	37	$b->read_report( $BLAST_REPORT );
	38	my @results = $b->result();
	39
	40	foreach $rh_r (@results) {
	41	print "id: $rh_r->{'id'}\n";
	42	}
	43
	44
	45

+18

-0

scripts/ncbi/basic.plx less more

	0	#!/usr/local/bin/perl -w
	1
	2	use lib qw(/sysdev/users/jfryan/blastall.pm/NHGRI/Blastall);
	3	use strict;
	4	use NHGRI::Blastall;
	5
	6	my $b = new NHGRI::Blastall;
	7
	8	print "running ncbi blast...\n";
	9
	10	$b->blastall( p => 'blastn',
	11	d => 'est',
	12	e => 0.001,
	13	i => 'test.nt',
	14	o => 'blastn.est.output' } );
	15
	16	$b->print_report();
	17

+11

-0

scripts/ncbi/test.nt less more

	0	>gi\|3278015\|gb\|AI038821.1\|AI038821 ox96d03.x1 Soares_senescent_fibroblasts_NbHSF Homo sapiens cDNA clone IMAGE:1664165 3' similar to gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);contains element MER22 repetitive element ;, mRNA sequence [Homo sapiens]
	1	TTTTTGACCATCCAATAATTGGGTGGGATCCCATCTGTGCCCGACAAGGGCCCACAGAGGCCTGGGAGGG
	2	GAGCTAAGGGCTGGGGTTCCGGTGGCATTTGGGATGTTCAAGACAGTCTGTGCACAGCCTCCCTGGGAGG
	3	GTCTGCAGTCACCTCGGCCCACGGTCCCGGGGTGACTGGGCTCCAGCAGCCCTTCCTTCCTTCCTTGCTT
	4	CCGTCCTTCCTTCCTCCTCCTTCCGTCTGCACCTCCTTCCTGCATCCGGCACCTCCATGTCCTGAGCTTG
	5	TGCTGCGTCAGGAGAGCACACACTTGCAGCTCATGCAGCCGGGGCCACTCTCATCAGGAGGGTTCAGCTT
	6	CCGCAGCTTGTGCTGCCGGATCTCACGCACCAACGTGTAGAAGGCATCCTCCACTCTCTGCCGGGTCTTG
	7	GCCGATGTCTCGATGTANGGGATGCCGTAGCTTCGGGCGAGGTCCTGAGCCTGCCGAGATTCCACAGTGC
	8	GTGCAGCCAGGTCACACTTTGTCCCCACCAGCACCATGGGCACGTCATCCGAGTCCTTCACCCCGTTGAT
	9	CTGCTCCCTGGTACTGGTGATGTCCTCAAAAGACTGGTGTTGTGATGGAAACACCACAGGAAGCCCTCCC
	10	TGTCCCATGACTGGTCC

+17

-0

scripts/net/basic.plx less more

	0	#!/usr/local/bin/perl -w
	1
	2	use strict;
	3	use NHGRI::Blastall;
	4
	5	my $b = new NHGRI::Blastall;
	6
	7	print "running blastn over the network...\n";
	8
	9	$b->blastcl3( {'p' => 'blastn',
	10	'd' => 'est',
	11	'i' => 'test.nt',
	12	'o' => 'blastn.est.output' } );
	13
	14	$b->print_report();
	15
	16

+11

-0

scripts/net/test.nt less more

	0	>gi\|3278015\|gb\|AI038821.1\|AI038821 ox96d03.x1 Soares_senescent_fibroblasts_NbHSF Homo sapiens cDNA clone IMAGE:1664165 3' similar to gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);contains element MER22 repetitive element ;, mRNA sequence [Homo sapiens]
	1	TTTTTGACCATCCAATAATTGGGTGGGATCCCATCTGTGCCCGACAAGGGCCCACAGAGGCCTGGGAGGG
	2	GAGCTAAGGGCTGGGGTTCCGGTGGCATTTGGGATGTTCAAGACAGTCTGTGCACAGCCTCCCTGGGAGG
	3	GTCTGCAGTCACCTCGGCCCACGGTCCCGGGGTGACTGGGCTCCAGCAGCCCTTCCTTCCTTCCTTGCTT
	4	CCGTCCTTCCTTCCTCCTCCTTCCGTCTGCACCTCCTTCCTGCATCCGGCACCTCCATGTCCTGAGCTTG
	5	TGCTGCGTCAGGAGAGCACACACTTGCAGCTCATGCAGCCGGGGCCACTCTCATCAGGAGGGTTCAGCTT
	6	CCGCAGCTTGTGCTGCCGGATCTCACGCACCAACGTGTAGAAGGCATCCTCCACTCTCTGCCGGGTCTTG
	7	GCCGATGTCTCGATGTANGGGATGCCGTAGCTTCGGGCGAGGTCCTGAGCCTGCCGAGATTCCACAGTGC
	8	GTGCAGCCAGGTCACACTTTGTCCCCACCAGCACCATGGGCACGTCATCCGAGTCCTTCACCCCGTTGAT
	9	CTGCTCCCTGGTACTGGTGATGTCCTCAAAAGACTGGTGTTGTGATGGAAACACCACAGGAAGCCCTCCC
	10	TGTCCCATGACTGGTCC

+17

-0

scripts/wu/basic.plx less more

	0	#!/usr/local/bin/perl -w
	1
	2	use lib qw(/sysdev/users/jfryan/blastall.pm/NHGRI/Blastall);
	3	use strict;
	4	use NHGRI::Blastall;
	5
	6	my $b = new NHGRI::Blastall;
	7
	8	print "running wu-blastn...\n";
	9
	10	$b->wu_blastall( {'p' => 'blastn',
	11	'd' => 'est',
	12	'i' => 'test.nt',
	13	'o' => 'blastn.est.output' } );
	14
	15	$b->print_report();
	16

+11

-0

scripts/wu/test.nt less more

	0	>gi\|3278015\|gb\|AI038821.1\|AI038821 ox96d03.x1 Soares_senescent_fibroblasts_NbHSF Homo sapiens cDNA clone IMAGE:1664165 3' similar to gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);contains element MER22 repetitive element ;, mRNA sequence [Homo sapiens]
	1	TTTTTGACCATCCAATAATTGGGTGGGATCCCATCTGTGCCCGACAAGGGCCCACAGAGGCCTGGGAGGG
	2	GAGCTAAGGGCTGGGGTTCCGGTGGCATTTGGGATGTTCAAGACAGTCTGTGCACAGCCTCCCTGGGAGG
	3	GTCTGCAGTCACCTCGGCCCACGGTCCCGGGGTGACTGGGCTCCAGCAGCCCTTCCTTCCTTCCTTGCTT
	4	CCGTCCTTCCTTCCTCCTCCTTCCGTCTGCACCTCCTTCCTGCATCCGGCACCTCCATGTCCTGAGCTTG
	5	TGCTGCGTCAGGAGAGCACACACTTGCAGCTCATGCAGCCGGGGCCACTCTCATCAGGAGGGTTCAGCTT
	6	CCGCAGCTTGTGCTGCCGGATCTCACGCACCAACGTGTAGAAGGCATCCTCCACTCTCTGCCGGGTCTTG
	7	GCCGATGTCTCGATGTANGGGATGCCGTAGCTTCGGGCGAGGTCCTGAGCCTGCCGAGATTCCACAGTGC
	8	GTGCAGCCAGGTCACACTTTGTCCCCACCAGCACCATGGGCACGTCATCCGAGTCCTTCACCCCGTTGAT
	9	CTGCTCCCTGGTACTGGTGATGTCCTCAAAAGACTGGTGTTGTGATGGAAACACCACAGGAAGCCCTCCC
	10	TGTCCCATGACTGGTCC

-0

t/CAB96192.aa less more

	0	>gi\|8919844\|emb\|CAB96192.1\| PSI 9 KDa protein [Aloe vera]
	1	YLWHETTRSMGLSY

+30

-0

t/basic.t less more

	0	BEGIN { $\| = 1; print "1..4\n"; }
	1	END {print "not ok 1\n" unless $loaded;}
	2	use NHGRI::Blastall;
	3	$loaded = 1;
	4	print "ok 1\n";
	5
	6	my $t = 't';
	7
	8	my $b = new NHGRI::Blastall;
	9
	10	if ($b->read_report("$t/blast.report")) {
	11	print "ok 2\n";
	12	} else {
	13	print "not ok 2\n";
	14	}
	15
	16	if ($b->result('id')) {
	17	print "ok 3\n";
	18	} else {
	19	print "not ok 3\n";
	20	}
	21
	22	if ($b->filter( {'identities' => .70} )) {
	23	print "ok 4\n";
	24	} else {
	25	print "not ok 4\n";
	26	}
	27
	28
	29

+477

-0

t/blast.report less more

	0	BLASTN 2.0.10 [Aug-26-1999]
	1
	2
	3	Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer,
	4	Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997),
	5	"Gapped BLAST and PSI-BLAST: a new generation of protein database search
	6	programs", Nucleic Acids Res. 25:3389-3402.
	7
	8	Query= HUMRASH\|gi\|190890\|gb\|J00277 Human c-Ha-ras1 proto-oncogene.
	9	(6453 letters)
	10
	11	Database: Non-redundant Database of all other organisms
	12	GenBank+EMBL+DDBJ EST sequences: updated on Sat Dec 4 01:15:02 1999
	13	800,021 sequences; 338,270,040 total letters
	14
	15	Searching..................................................done
	16
	17	Score E
	18	Sequences producing significant alignments: (bits) Value
	19
	20	gb\|AI657850.1\|AI657850 fc23g10.y1 Zebrafish WashU MPIMG EST Dani... 139 2e-30
	21	gb\|AA497334\|AA497334 fa04e08.r1 Zebrafish ICRFzfls Danio rerio c... 123 1e-25
	22	gb\|AI877912.1\|AI877912 fc55a12.y1 Zebrafish WashU MPIMG EST Dani... 100 2e-18
	23	gb\|AI959079.1\|AI959079 fd24c02.y1 Zebrafish WashU MPIMG EST Dani... 90 2e-15
	24	gb\|AA141102\|AA141102 CK01231.5prime CK Drosophila melanogaster e... 90 2e-15
	25	gb\|AW148172.1\|AW148172 da13b03.x1 normalized Xenopus laevis gast... 70 2e-09
	26	dbj\|AU003591\|AU003591 AU003591 Bombyx mori p50(Daizo) Bombyx mor... 64 1e-07
	27	dbj\|AU004341\|AU004341 AU004341 Bombyx mori p50(Daizo) Bombyx mor... 64 1e-07
	28	gb\|AI575394.1\|AI575394 UI-R-G0-uh-e-12-0-UI.s1 UI-R-G0 Rattus no... 64 1e-07
	29	gb\|AI259307\|AI259307 LP02693.3prime LP Drosophila melanogaster l... 64 1e-07
	30	dbj\|AU004702\|AU004702 AU004702 Bombyx mori p50(Daizo) Bombyx mor... 64 1e-07
	31	gb\|AI295927\|AI295927 LP09693.5prime LP Drosophila melanogaster l... 62 5e-07
	32	gb\|AI405934\|AI405934 GH26087.5prime GH Drosophila melanogaster h... 62 5e-07
	33	gb\|AI238083\|AI238083 GH14071.5prime GH Drosophila melanogaster h... 62 5e-07
	34	gb\|AI626207.1\|AI626207 fc12c05.y1 Zebrafish WashU MPIMG EST Dani... 58 7e-06
	35	gb\|AA900048\|AA900048 UI-R-E0-da-e-01-0-UI.s1 UI-R-E0 Rattus norv... 58 7e-06
	36
	37	>gb\|AI657850.1\|AI657850 fc23g10.y1 Zebrafish WashU MPIMG EST Danio rerio cDNA 5' similar to
	38	gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);,
	39	mRNA sequence
	40	Length = 573
	41
	42	Score = 139 bits (70), Expect = 2e-30
	43	Identities = 145/170 (85%)
	44	Strand = Plus / Plus
	45
	46
	47	Query: 2052 ggaagcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgccggcc 2111
	48	\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\| \|\| \|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\| \|\| \|\| \|\| \|
	49	Sbjct: 194 ggaagcaggtggtgattgacggagaaacgtgtctactggacatcctggacactgcaggtc 253
	50
	51
	52	Query: 2112 aggaggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtg 2171
	53	\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \| \|\| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|
	54	Sbjct: 254 aggaggaatacagtgccatgagggaccagtacatgaggacaggagagggcttcctctgtg 313
	55
	56
	57	Query: 2172 tgtttgccatcaacaacaccaagtcttttgaggacatccaccagtacagg 2221
	58	\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\|
	59	Sbjct: 314 tctttgccatcaataacacaaagtcctttgaggacattcaccactacagg 363
	60
	61
	62	Score = 79.8 bits (40), Expect = 2e-12
	63	Identities = 94/112 (83%)
	64	Strand = Plus / Plus
	65
	66
	67	Query: 1664 atgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgacc 1723
	68	\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\| \|\| \|\| \|\| \|\| \|\|\|\|\| \|\| \|\| \|\| \|\|\|
	69	Sbjct: 73 atgaccgaatataagcttgtggtcgtgggagctggaggcgtaggcaaaagcgctctcacc 132
	70
	71
	72	Query: 1724 atccagctgatccagaaccattttgtggacgaatacgaccccactatagagg 1775
	73	\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\|\|\|\|\|
	74	Sbjct: 133 atccaactcatccagaaccactttgtggatgaatatgacccaactatagagg 184
	75
	76
	77	Score = 65.9 bits (33), Expect = 3e-08
	78	Identities = 60/69 (86%)
	79	Strand = Plus / Plus
	80
	81
	82	Query: 2371 cagggagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaa 2430
	83	\|\|\|\|\|\|\|\|\|\|\|\| \|\| \|\| \|\| \|\|\|\|\|\|\|\| \|\| \|\|\|\|\| \|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|
	84	Sbjct: 360 cagggagcagataaagcgagtaaaggactccgaggacgtccccatggttctggtggggaa 419
	85
	86
	87	Query: 2431 caagtgtga 2439
	88	\|\|\|\|\|\|\|\|
	89	Sbjct: 420 taagtgtga 428
	90
	91
	92	>gb\|AA497334\|AA497334 fa04e08.r1 Zebrafish ICRFzfls Danio rerio cDNA clone 1O24 5' similar
	93	to gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1
	94	(HUMAN);
	95	Length = 671
	96
	97	Score = 123 bits (62), Expect = 1e-25
	98	Identities = 143/170 (84%)
	99	Strand = Plus / Plus
	100
	101
	102	Query: 2052 ggaagcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgccggcc 2111
	103	\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\| \|\| \|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\| \|\| \|\| \|\| \|
	104	Sbjct: 452 ggaagcaggtggtgattgacggagaaacgtgtctactggacatcctggacactgcaggtc 511
	105
	106
	107	Query: 2112 aggaggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtg 2171
	108	\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \| \|\| \|\| \|\| \|\|\|\|\|\|\|\| \|\|\|\|
	109	Sbjct: 512 aggaggaatacagtgccatgagggaccagtacatgaggacaggagaaggcttcctctgtg 571
	110
	111
	112	Query: 2172 tgtttgccatcaacaacaccaagtcttttgaggacatccaccagtacagg 2221
	113	\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\|
	114	Sbjct: 572 tctttgccatcaataacacaaagtgctttgaggacattcaccactacagg 621
	115
	116
	117	Score = 71.9 bits (36), Expect = 5e-10
	118	Identities = 93/112 (83%)
	119	Strand = Plus / Plus
	120
	121
	122	Query: 1664 atgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgacc 1723
	123	\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\| \|\| \|\| \|\| \|\| \|\|\|\|\| \|\| \|\| \|\| \|\|\|
	124	Sbjct: 331 atgaccgaatataagcttgtggtcgtgggagctggaggcgtaggcaaaagcgctctcacc 390
	125
	126
	127	Query: 1724 atccagctgatccagaaccattttgtggacgaatacgaccccactatagagg 1775
	128	\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\| \| \|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\|\|\|\|\|
	129	Sbjct: 391 atccaactcatccagaaccactttgtgaatgaatatgacccaactatagagg 442
	130
	131
	132	>gb\|AI877912.1\|AI877912 fc55a12.y1 Zebrafish WashU MPIMG EST Danio rerio cDNA 5' similar to
	133	gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);,
	134	mRNA sequence
	135	Length = 647
	136
	137	Score = 99.6 bits (50), Expect = 2e-18
	138	Identities = 116/138 (84%)
	139	Strand = Plus / Plus
	140
	141
	142	Query: 2052 ggaagcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgccggcc 2111
	143	\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\| \|\|\|\|\|\|\|\| \|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\| \|\| \|\| \|\|\|\|
	144	Sbjct: 336 ggaagcaggtggtgattgacggcgagacgtgtctgctggacatcctggacactgcaggcc 395
	145
	146
	147	Query: 2112 aggaggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtg 2171
	148	\|\|\|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\| \| \|\|\|\|\|\|\|\|\|\|\|\| \| \|\| \|\| \|\|\|\|\| \|\|\|\|\| \|\| \|
	149	Sbjct: 396 aggaagagtacagcgcaatgagagaccagtacatgaggacaggagagggtttcctctgcg 455
	150
	151
	152	Query: 2172 tgtttgccatcaacaaca 2189
	153	\| \|\| \|\| \|\|\|\|\|\|\|\|\|\|
	154	Sbjct: 456 tcttcgctatcaacaaca 473
	155
	156
	157	Score = 73.8 bits (37), Expect = 1e-10
	158	Identities = 94/113 (83%)
	159	Strand = Plus / Plus
	160
	161
	162	Query: 1663 gatgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgac 1722
	163	\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\| \|\| \|\| \|\|\|\|\| \|\| \|\|\|\|\| \|\|\| \| \|\|
	164	Sbjct: 214 gatgactgagtataagctggttgttgtgggagcaggaggtgttgggaagagcgcgttaac 273
	165
	166
	167	Query: 1723 catccagctgatccagaaccattttgtggacgaatacgaccccactatagagg 1775
	168	\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|
	169	Sbjct: 274 aatccagctcatccagaatcactttgtggatgaatatgaccccactattgagg 326
	170
	171
	172	Score = 67.9 bits (34), Expect = 8e-09
	173	Identities = 52/58 (89%)
	174	Strand = Plus / Plus
	175
	176
	177	Query: 2375 gagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaaca 2432
	178	\|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|
	179	Sbjct: 506 gagcagatcaagcgtgtgaaggactcggatgatgttcccatggtcctagtggggaaca 563
	180
	181
	182	>gb\|AI959079.1\|AI959079 fd24c02.y1 Zebrafish WashU MPIMG EST Danio rerio cDNA 5' similar to
	183	gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);,
	184	mRNA sequence
	185	Length = 647
	186
	187	Score = 89.7 bits (45), Expect = 2e-15
	188	Identities = 129/157 (82%)
	189	Strand = Plus / Plus
	190
	191
	192	Query: 2040 aggattcctaccggaagcaggtggtcattgatggggagacgtgcctgttggacatcctgg 2099
	193	\|\|\|\|\|\|\| \|\|\| \| \|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\| \|\|\|\|\| \|\| \|\| \|\|\|\| \|\|\|\|\|\|\|
	194	Sbjct: 397 aggattcatacagaaagcaggttgtgattgatggagagacttgtttgctggatatcctgg 456
	195
	196
	197	Query: 2100 ataccgccggccaggaggagtacagcgccatgcgggaccagtacatgcgcaccggggagg 2159
	198	\| \|\| \|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\| \| \|\|\|\|\|\|\|\| \|\|\| \| \|\| \|\| \|\|\|\|
	199	Sbjct: 457 acactgcaggtcaggaggagtacagtgccatgagagaccagtatatgaggacaggagagg 516
	200
	201
	202	Query: 2160 gcttcctgtgtgtgtttgccatcaacaacaccaagtc 2196
	203	\| \|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|
	204	Sbjct: 517 gatttctttgtgtgtttgccatcaacaatgccaagtc 553
	205
	206
	207	>gb\|AA141102\|AA141102 CK01231.5prime CK Drosophila melanogaster embryo BlueScript
	208	Drosophila melanogaster cDNA clone CK01231 5prime similar
	209	to 0:
	210	Length = 551
	211
	212	Score = 89.7 bits (45), Expect = 2e-15
	213	Identities = 89/104 (85%)
	214	Strand = Plus / Plus
	215
	216
	217	Query: 1664 atgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgacc 1723
	218	\|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\| \|\| \|\| \|\| \|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|
	219	Sbjct: 181 atgacggaatacaaactggtcgtcgttggagccggaggcgtgggcaagtccgcgctcacc 240
	220
	221
	222	Query: 1724 atccagctgatccagaaccattttgtggacgaatacgaccccac 1767
	223	\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|
	224	Sbjct: 241 atccagctaatccagaaccatttcgtggacgantacgaccccac 284
	225
	226
	227	Score = 63.9 bits (32), Expect = 1e-07
	228	Identities = 83/101 (82%)
	229	Strand = Plus / Plus
	230
	231
	232	Query: 2048 taccggaagcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgcc 2107
	233	\|\|\|\|\| \|\|\|\|\| \|\|\|\|\| \|\| \|\|\|\|\| \|\| \|\| \|\|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|
	234	Sbjct: 298 taccgaaagcaagtggttatcgatggaganacctgcctgctggacatcctggacaccgcc 357
	235
	236
	237	Query: 2108 ggccaggaggagtacagcgccatgcgggaccagtacatgcg 2148
	238	\|\|\|\|\| \|\| \|\| \| \| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\|
	239	Sbjct: 358 ggccaagatgantnctcggccatgcgggatcagtatatgcg 398
	240
	241
	242	>gb\|AW148172.1\|AW148172 da13b03.x1 normalized Xenopus laevis gastrula Xenopus laevis cDNA
	243	clone XENOPUS_SOURCE_ID:xlnga001a06 3' similar to
	244	gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);
	245	gb:X13664 Mouse mRNA for N-ras protein (MOUSE...
	246	Length = 652
	247
	248	Score = 69.9 bits (35), Expect = 2e-09
	249	Identities = 134/167 (80%)
	250	Strand = Plus / Minus
	251
	252
	253	Query: 2055 agcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgccggccagg 2114
	254	\|\|\|\|\|\|\|\|\|\| \|\| \|\| \|\|\|\|\| \|\| \|\|\|\|\| \| \|\|\|\|\| \|\|\|\|\|\|\| \|\| \|\| \|\| \|
	255	Sbjct: 526 agcaggtggtgatagacggggaaacttgcctcctagacatattggatactgcggggcaag 467
	256
	257
	258	Query: 2115 aggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtgtgt 2174
	259	\|\|\|\| \|\|\|\|\| \|\|\|\|\|\| \|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\| \|\| \|\| \|\| \|\| \|\|\|\|\| \|
	260	Sbjct: 466 aggaatacagtgccatgagggatcagtacatgcgcacgggagaagggtttctctgtgtct 407
	261
	262
	263	Query: 2175 ttgccatcaacaacaccaagtcttttgaggacatccaccagtacagg 2221
	264	\|\|\|\| \|\| \|\| \|\|\|\|\| \|\|\|\|\|\|\|\| \|\|\|\|\|\| \|\|\|\| \|\| \|\|\|\|\|\|
	265	Sbjct: 406 ttgctattaataacacaaagtctttcgaggacgtccatcattacagg 360
	266
	267
	268	>dbj\|AU003591\|AU003591 AU003591 Bombyx mori p50(Daizo) Bombyx mori cDNA clone ws00308, mRNA
	269	sequence [Bombyx mori]
	270	Length = 724
	271
	272	Score = 63.9 bits (32), Expect = 1e-07
	273	Identities = 59/68 (86%)
	274	Strand = Plus / Plus
	275
	276
	277	Query: 2375 gagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaacaag 2434
	278	\|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\| \|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\| \|\|\|\|\|\|
	279	Sbjct: 150 gagcagatcaagcgagtgaaggacgcggaagaggtgcccatggtacttgtgggcaacaag 209
	280
	281
	282	Query: 2435 tgtgacct 2442
	283	\|\| \|\|\|\|\|
	284	Sbjct: 210 tgcgacct 217
	285
	286
	287	>dbj\|AU004341\|AU004341 AU004341 Bombyx mori p50(Daizo) Bombyx mori cDNA clone ws20237, mRNA
	288	sequence [Bombyx mori]
	289	Length = 739
	290
	291	Score = 63.9 bits (32), Expect = 1e-07
	292	Identities = 59/68 (86%)
	293	Strand = Plus / Plus
	294
	295
	296	Query: 2375 gagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaacaag 2434
	297	\|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\| \|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\| \|\|\|\|\|\|
	298	Sbjct: 499 gagcagatcaagcgagtgaaggacgcggaagaggtgcccatggtacttgtgggcaacaag 558
	299
	300
	301	Query: 2435 tgtgacct 2442
	302	\|\| \|\|\|\|\|
	303	Sbjct: 559 tgcgacct 566
	304
	305
	306	>gb\|AI575394.1\|AI575394 UI-R-G0-uh-e-12-0-UI.s1 UI-R-G0 Rattus norvegicus cDNA clone
	307	UI-R-G0-uh-e-12-0-UI 3', mRNA sequence
	308	Length = 536
	309
	310	Score = 63.9 bits (32), Expect = 1e-07
	311	Identities = 59/68 (86%)
	312	Strand = Plus / Plus
	313
	314
	315	Query: 2114 gaggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtgtg 2173
	316	\|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|
	317	Sbjct: 387 gaggagtacagtgcaatgagggaccagtacatgagaactggggagggctttctttgtgta 446
	318
	319
	320	Query: 2174 tttgccat 2181
	321	\|\|\|\|\|\|\|\|
	322	Sbjct: 447 tttgccat 454
	323
	324
	325	>gb\|AI259307\|AI259307 LP02693.3prime LP Drosophila melanogaster larval-early pupal pOT2
	326	Drosophila melanogaster cDNA clone LP02693 3prime similar
	327	to M80535: R FBgn0004636 PID:g158198 SWISS-PROT:P08645,
	328	mRNA sequence [Drosophila melanogaster]
	329	Length = 542
	330
	331	Score = 63.9 bits (32), Expect = 1e-07
	332	Identities = 50/56 (89%)
	333	Strand = Plus / Minus
	334
	335
	336	Query: 2387 cgggtgaaggactcggatgacgtgcccatggtgctggtggggaacaagtgtgacct 2442
	337	\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\|\|\| \|\|\|\|\|
	338	Sbjct: 275 cgggtgaaggacacggacgatgtgcccatggtgctcgtgggcaacaagtgcgacct 220
	339
	340
	341	>dbj\|AU004702\|AU004702 AU004702 Bombyx mori p50(Daizo) Bombyx mori cDNA clone ws20754, mRNA
	342	sequence [Bombyx mori]
	343	Length = 747
	344
	345	Score = 63.9 bits (32), Expect = 1e-07
	346	Identities = 59/68 (86%)
	347	Strand = Plus / Plus
	348
	349
	350	Query: 2375 gagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaacaag 2434
	351	\|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\| \|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\| \|\|\|\|\|\|
	352	Sbjct: 485 gagcagatcaagcgagtgaaggacgcggaagaggtgcccatggtacttgtgggcaacaag 544
	353
	354
	355	Query: 2435 tgtgacct 2442
	356	\|\| \|\|\|\|\|
	357	Sbjct: 545 tgcgacct 552
	358
	359
	360	>gb\|AI295927\|AI295927 LP09693.5prime LP Drosophila melanogaster larval-early pupal pOT2
	361	Drosophila melanogaster cDNA clone LP09693 5prime similar
	362	to Y07564: Ric FBgn0017549 PID:e276381 SPTREMBL:P91637,
	363	mRNA sequence [Drosophila melanogaster]
	364	Length = 522
	365
	366	Score = 61.9 bits (31), Expect = 5e-07
	367	Identities = 52/59 (88%)
	368	Strand = Plus / Plus
	369
	370
	371	Query: 2090 gacatcctggataccgccggccaggaggagtacagcgccatgcgggaccagtacatgcg 2148
	372	\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|
	373	Sbjct: 413 gacatccttgacaccgccggccaggtggagttcacggccatgcgggaccaatacatgcg 471
	374
	375
	376	>gb\|AI405934\|AI405934 GH26087.5prime GH Drosophila melanogaster head pOT2 Drosophila
	377	melanogaster cDNA clone GH26087 5prime similar to Y07564:
	378	Ric FBgn0017549 PID:e276381 SPTREMBL:P91637, mRNA
	379	sequence [Drosophila melanogaster]
	380	Length = 602
	381
	382	Score = 61.9 bits (31), Expect = 5e-07
	383	Identities = 52/59 (88%)
	384	Strand = Plus / Plus
	385
	386
	387	Query: 2090 gacatcctggataccgccggccaggaggagtacagcgccatgcgggaccagtacatgcg 2148
	388	\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|
	389	Sbjct: 77 gacatccttgacaccgccggccaggtggagttcacggccatgcgggaccaatacatgcg 135
	390
	391
	392	>gb\|AI238083\|AI238083 GH14071.5prime GH Drosophila melanogaster head pOT2 Drosophila
	393	melanogaster cDNA clone GH14071 5prime similar to Y07564:
	394	Ric FBgn0017549 PID:e276381 SPTREMBL:P91637, mRNA
	395	sequence [Drosophila melanogaster]
	396	Length = 535
	397
	398	Score = 61.9 bits (31), Expect = 5e-07
	399	Identities = 52/59 (88%)
	400	Strand = Plus / Plus
	401
	402
	403	Query: 2090 gacatcctggataccgccggccaggaggagtacagcgccatgcgggaccagtacatgcg 2148
	404	\|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\|
	405	Sbjct: 425 gacatccttgacaccgccggccaggtggagttcacggccatgcgggaccaatacatgcg 483
	406
	407
	408	>gb\|AI626207.1\|AI626207 fc12c05.y1 Zebrafish WashU MPIMG EST Danio rerio cDNA 5' similar to
	409	TR:P97913 P97913 N-RAS {EXONS 1 AND 2 ;, mRNA sequence
	410	Length = 529
	411
	412	Score = 58.0 bits (29), Expect = 7e-06
	413	Identities = 92/113 (81%)
	414	Strand = Plus / Plus
	415
	416
	417	Query: 1663 gatgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgac 1722
	418	\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\|\|\|\| \|\| \|\|\|\|\| \|\| \|\| \|\|\|\| \| \|\|\|\|\| \|\|\| \| \|\|
	419	Sbjct: 298 gatgactgagtataagctggttgttgtgggagcaggaggtggttggaagagcgcgttaac 357
	420
	421
	422	Query: 1723 catccagctgatccagaaccattttgtggacgaatacgaccccactatagagg 1775
	423	\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|\| \|\| \|\|\|\|\|\|\|\| \|\|\|\|\| \|\|\|\|\|\|\|\|\|\|\| \|\|\|\|
	424	Sbjct: 358 aatccagctcatccagaatcactttgtggatgaatatgaccccactattgagg 410
	425
	426
	427	>gb\|AA900048\|AA900048 UI-R-E0-da-e-01-0-UI.s1 UI-R-E0 Rattus norvegicus cDNA clone
	428	UI-R-E0-da-e-01-0-UI 3' similar to >
	429	gi\|191344\|gb\|M84166\|HAMCHARAS Hamster c-Ha-ras protein
	430	gene, complete cds, mRNA sequence [Rattus norvegicus]
	431	Length = 315
	432
	433	Score = 58.0 bits (29), Expect = 7e-06
	434	Identities = 32/33 (96%)
	435	Strand = Plus / Minus
	436
	437
	438	Query: 3319 gctgcatgagctgcaagtgtgtgctctcctgac 3351
	439	\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\| \|\|\|\|\|\|\|
	440	Sbjct: 315 gctgcatgagctgcaagtgtgtgctgtcctgac 283
	441
	442
	443	Database: Non-redundant Database of all other organisms
	444	GenBank+EMBL+DDBJ EST sequences: updated on Sat Dec 4 01:15:02 1999
	445	Posted date: Dec 4, 1999 2:31 AM
	446	Number of letters in database: 338,270,040
	447	Number of sequences in database: 800,021
	448
	449	Lambda K H
	450	1.37 0.711 1.31
	451
	452	Gapped
	453	Lambda K H
	454	1.37 0.711 1.31
	455
	456
	457	Matrix: blastn matrix:1 -3
	458	Gap Penalties: Existence: 5, Extension: 2
	459	Number of Hits to DB: 890609
	460	Number of Sequences: 800021
	461	Number of extensions: 890609
	462	Number of successful extensions: 160
	463	Number of sequences better than 1.0e-04: 19
	464	length of query: 6453
	465	length of database: 338,270,040
	466	effective HSP length: 21
	467	effective length of query: 6432
	468	effective length of database: 321,469,599
	469	effective search space: 2067692460768
	470	effective search space used: 2067692460768
	471	T: 0
	472	A: 0
	473	X1: 6 (11.9 bits)
	474	X2: 25 (49.6 bits)
	475	S1: 12 (24.3 bits)
	476	S2: 28 (56.0 bits)