Codebase list libnhgri-blastall-perl / upstream/0.66
Imported Upstream version 0.66 Jens Preussner 11 years ago
16 changed file(s) with 2665 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 package NHGRI::Blastall;
1 require 5.004;
2
3 ###########################################################################
4 # See the bottom of this file for the POD documentation. Search for the
5 # string '=head'.
6 #
7 # You can run this file through either pod2man or pod2html to produce pretty
8 # documentation in manual or html file format (these utilities are part of the
9 # Perl 5 distribution).
10 #
11 # The most recent version and complete docs are available at:
12 # ftp://ftp.nhgri.nih.gov/pub/software/
13 ###########################################################################
14 # This software is "United States Government Work" under the
15 # terms of the United States Copyright Act. It was written as part of the
16 # authors' official duties for the United States Government and thus
17 # cannot be copyrighted. This software/database is freely available to
18 # the public for use without a copyright notice. Restrictions cannot be
19 # placed on its present or future use.
20 #
21 # Although all reasonable efforts have been taken to ensure the accuracy
22 # and reliability of the software and data, the National Human Genome
23 # Research Institute (NHGRI) and the U.S. Government does not and cannot
24 # warrant the performance or results that may be obtained by using this
25 # software or data. NHGRI and the U.S. Government disclaims all
26 # warranties as to performance, merchantability or fitness for any
27 # particular purpose.
28 #
29 # In any work or product derived from this material, proper attribution
30 # of the authors as the source of the software or data should be made,
31 # using http://genome.nhgri.nih.gov/blastall as the citation.
32 ###########################################################################
33
34 use strict;
35 use vars qw($VERSION $REVISION $BLASTALL $BLASTCL3 $BL2SEQ %WU_BLAST
36 $FORMATDB %DEFAULT_FILTERS);
37 use IO::File;
38 use Carp qw(carp croak);
39
40 $VERSION = '0.67';
41 $REVISION = '# $Id: Blastall.pm,v 1.3 2002/01/15 20:09:08 jpearson Exp $';
42
43 # you can set these variables to the full path of the corresponding binaries
44 $BLASTALL = 'blastall';
45 $BLASTCL3 = 'blastcl3';
46 $BL2SEQ = 'bl2seq';
47 $FORMATDB = 'formatdb';
48
49 # you can adjust the locations of wu-blast programs
50 # for instance 'wu_blastn' => '/usr/local/wu/blastn',
51 # I haven't added support for wu-blastall (hopefully I will get to this)
52 %WU_BLAST = ('wu_blastn' => 'wu-blastn',
53 'wu_blastp' => 'wu-blastp',
54 'wu_blastx' => 'wu-blastx',
55 'wu_tblastn' => 'wu-tblastn',
56 'wu_tblastx' => 'wu-tblastx'
57 );
58
59 ###########################################################################
60 # You probably shouldn't be mucking below here...
61 ###########################################################################
62 %DEFAULT_FILTERS = ('id' => '=~'
63 ,'defline' => '=~'
64 ,'subject_length' => '>'
65 ,'scores' => '>'
66 ,'expects' => '<'
67 ,'identities' => '>'
68 ,'match_lengths' => '>'
69 ,'subject_strands' => 'eq'
70 ,'query_strands' => 'eq'
71 ,'query_frames' => '=='
72 ,'subject_frames' => '=='
73 );
74
75 sub new {
76 my $this = shift;
77 my %args = @_;
78 my $class = ref($this) || $this;
79 my $self = \%args;
80 bless $self, $class;
81 return($self);
82 }
83 sub blastall {
84 my $self = shift;
85 my $rh_opts = _get_opts(@_);
86
87 my ($ra_args,$tmp_file) = _get_args($rh_opts);
88 unshift @{$ra_args}, $BLASTALL;
89
90 my $rc = 0xffff & system(@{$ra_args});
91 if ($rc == 0) {
92 $self->{options} = $rh_opts;
93 push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
94 $self->{report_file} = $self->{options}->{o} || $tmp_file;
95 return(1);
96 } else {
97 foreach (@{$ra_args}) { print STDERR "$_ "; }
98 print STDERR "\n";
99 carp qq!the above system call to $BLASTALL failed.
100 Possible problems include
101 1. Is blastall in your path or is \$BLASTALL the full path to blastall?
102 2. Do you have the DATABASE installed
103 3. Are you pointing to the correct location of the query sequence
104 4. Is your DATABASE corrupted
105 5. Are you running the correct program for the type of database and
106 query sequence you are running. e.g. BLASTP -> PROTEIN
107 6. Perhaps try blastcl3 instead of blastall.
108 !;
109 return undef;
110 }
111 }
112 sub formatdb {
113 my $self = shift;
114 my $rh_opts = _get_opts(@_);
115
116 my $ra_args = _get_formatdb_args($rh_opts);
117 unshift @{$ra_args}, $FORMATDB;
118
119 my $rc = 0xffff & system(@{$ra_args});
120 if ($rc == 0) {
121 $self->{options} = $rh_opts;
122 my $ra_indexes = _get_formatdb_indexes($rh_opts);
123 push @{$self->{indexes}}, @{$ra_indexes};
124 #$self->{indexes} = _get_formatdb_indexes($rh_opts);
125 $self->{tmp_file} = [];
126 return(1);
127 } else {
128 foreach (@{$ra_args}) { print STDERR "$_ "; }
129 print STDERR "\n";
130 carp qq!the above system call to $FORMATDB failed.
131 Possible problems include
132 1. Is formatdb in your path or is \$FORMATDB the full path to formatdb?
133 !;
134 return undef;
135 }
136 }
137 sub remove_formatdb_indexes {
138 my $self = shift;
139 if ($self->{indexes}) {
140 foreach my $i (@{$self->{indexes}}) {
141 unlink $i or carp "could not remove $i:$!";
142 }
143 }
144 }
145 sub blast_one_to_many {
146 my $self = shift;
147 my $rh_opts = _get_opts(@_);
148 my ($ra_args,$tmp_file) = _get_args($rh_opts);
149 if ($rh_opts->{p} eq 'blastp' or $rh_opts->{p} eq 'blastx') {
150 $self->formatdb(p => 'T', i => $rh_opts->{d});
151 } else {
152 $self->formatdb(p => 'F', i => $rh_opts->{d});
153 }
154 $self->blastall($rh_opts);
155 $self->remove_formatdb_indexes();
156 }
157 sub bl2seq {
158 my $self = shift;
159 my $rh_opts = _get_opts(@_);
160
161 my ($ra_args,$tmp_file) = _get_args($rh_opts);
162 unshift @{$ra_args}, $BL2SEQ;
163
164 my $rc = 0xffff & system(@{$ra_args});
165 if ($rc == 0) {
166 $self->{options} = $rh_opts;
167 push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
168 $self->{report_file} = $self->{options}->{o} || $tmp_file;
169 return(1);
170 } else {
171 foreach (@{$ra_args}) { print STDERR "$_ "; }
172 print STDERR "\n";
173 carp qq!the above system call to $BL2SEQ failed.
174 Possible problems include
175 1. Is bl2seq in your path or is \$BL2SEQ the full path to bl2seq?
176 2. Are you running the correct program for the type of sequences you
177 are comparing? e.g. BLASTP -> PROTEIN
178 !;
179 return undef;
180 }
181 }
182 sub wu_blastall {
183 my $self = shift;
184 my $rh_opts = _get_opts(@_);
185
186 my ($ra_args,$tmp_file) = _get_wu_args($rh_opts);
187 my $command = join ' ', @{$ra_args};
188 my $output = $rh_opts->{o} || $tmp_file;
189 $command .= " > $output";
190 #print "$command\n";
191
192 my $rc = 0xffff & system("$command");
193 if ($rc == 0) {
194 $self->{options} = $rh_opts;
195 push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
196 $self->{report_file} = $self->{options}->{o} || $tmp_file;
197 return(1);
198 } else {
199 print STDERR "$command\n";
200 carp qq!the above system call failed.!;
201 return undef;
202 }
203 }
204 sub net_blastall {
205 carp "net_blastall is depricated please use blastcl3 instead\n";
206 blastcl3(@_);
207 }
208 sub blastcl3 {
209 {
210 local $BLASTALL = $BLASTCL3;
211 blastall(@_);
212 }
213 }
214 sub mask {
215 my $self = shift;
216 my $rh_opts;
217 if (@_ > 1) {
218 my %opts = @_;
219 $rh_opts = \%opts;
220 } else {
221 $rh_opts = shift;
222 }
223 my ($rc);
224 # type can be wu_blastall, blastcl3, blastall or net_blastall(depricated)
225 my $type_of_blast = $rh_opts->{type} || 'blastall';
226 delete $rh_opts->{type};
227 my ($ra_args,$tmp_file) = _get_args($rh_opts);
228 unshift @{$ra_args}, $BLASTALL;
229 my $m = new NHGRI::Blastall;
230 $rh_opts->{o} = $rh_opts->{o} || $tmp_file;
231 if ($type_of_blast eq 'net_blastall') {
232 $m->net_blastall($rh_opts);
233 } elsif ($type_of_blast eq 'blastcl3') {
234 $m->blastcl3($rh_opts);
235 } elsif ($type_of_blast eq 'wu_blastall') {
236 $m->wu_blastall($rh_opts);
237 } else {
238 # 2002-01-12 JVP added "return undef unless"
239 return undef unless ($m->blastall($rh_opts));
240 }
241 my $report_file = $m->{report_file};
242 my $fa_file = $m->{options}->{i};
243 my($fa_line,$sequence) = _get_sequence($fa_file);
244 my($ra_mask_positions) = _get_mask_positions($report_file);
245 my $masked_sequence = _get_masked_sequence($sequence,$ra_mask_positions);
246 return ("$fa_line\n$masked_sequence", $ra_mask_positions) if (wantarray);
247 return ("$fa_line\n$masked_sequence");
248 }
249 sub print_report {
250 my $self = shift;
251 my $report_file = $self->{report_file};
252 return undef unless ($report_file);
253 open REPORT, $report_file or return(0);
254 my $report = join ('',<REPORT>);
255 print $report;
256 return;
257 }
258 sub filter {
259 my $self = shift;
260 my $rh_criteria;
261 if (@_ > 1) {
262 my %criteria = @_;
263 $rh_criteria = \%criteria;
264 } else {
265 $rh_criteria = shift;
266 }
267
268 my @filtered_results = ();
269 $self->_parse_report() unless ($self->{been_parsed});
270
271 my ($ra_stripped_values,$ra_cmp_ops) =
272 _get_stripped_values_and_comparison_operators($rh_criteria);
273 foreach my $entry (@{$self->{results}}) {
274 if ($entry = _entry_passes_filter($entry,$ra_stripped_values,
275 $ra_cmp_ops,$rh_criteria)) {
276 push @filtered_results,$entry;
277 }
278 }
279 $self->{results} = \@filtered_results;
280 return @filtered_results;
281 }
282 sub unfilter {
283 my $self = shift;
284 $self->_parse_report();
285 }
286 sub result {
287 my($self,@r) = @_;
288 my $ra_results = [];
289
290 $self->_parse_report() unless ($self->{been_parsed});
291 return $self->_all_results unless @r;
292 if (@r == 1) {
293 $ra_results = $self->_result_for_one(@r);
294 } elsif (@r == 2) {
295 $ra_results = $self->_result_for_two(@r);
296 } else {
297 return undef;
298 }
299 return(@{$ra_results});
300 }
301 sub read_report {
302 my $self = shift;
303 my $file = shift;
304 $self->{report_file} = $file;
305 }
306 sub get_database_description {
307 my $self = shift;
308 $self->_parse_report() unless ($self->{been_parsed});
309 return $self->{database_description};
310 }
311 sub get_database_sequence_count {
312 my $self = shift;
313 $self->_parse_report() unless ($self->{been_parsed});
314 return $self->{database_sequence_count};
315 }
316 sub get_database_letter_count {
317 my $self = shift;
318 $self->_parse_report() unless ($self->{been_parsed});
319 return $self->{database_letter_count};
320 }
321 sub get_blast_program {
322 my $self = shift;
323 $self->_parse_report() unless ($self->{been_parsed});
324 return $self->{program};
325 }
326 sub get_blast_version {
327 my $self = shift;
328 $self->_parse_report() unless ($self->{been_parsed});
329 return $self->{blast_version};
330 }
331 sub get_report {
332 my $self = shift;
333 return $self->{report_file};
334 }
335 sub DESTROY {
336 my $self = shift;
337 foreach (@{$self->{tmp_file}}) {
338 unlink $_ if (-e $_);
339 }
340 }
341
342 # END OF PUBLIC METHODS
343 # BEGIN PRIVATE METHODS
344
345 sub _get_opts {
346 my @args = @_;
347 my $rh_opts;
348
349 if (@args == 1) {
350 $rh_opts = shift;
351 } else {
352 my %opts = @args;
353 $rh_opts = \%opts;
354 }
355 return $rh_opts;
356 }
357 sub _get_formatdb_indexes {
358 my $rh_opts = shift;
359 my @indexes = ();
360 my $x = 'p';
361 my $name = $rh_opts->{i};
362 $name = $rh_opts->{n} if ($rh_opts->{n});
363 $x = 'n' if ($rh_opts->{p} =~ /f/i);
364 push @indexes, "${name}.${x}hr", "${name}.${x}in", "${name}.${x}sq";
365 if ($rh_opts->{o} && $rh_opts->{o} =~ /t/i) {
366 push @indexes, "${name}.${x}nd", "${name}.${x}ni",
367 "${name}.${x}sd", "${name}.${x}si";
368 }
369 return \@indexes;
370 }
371 sub _write_report_to_disk {
372 my $report_file = shift;
373 my $results = shift;
374 open OUT, ">$report_file" or croak "cannot open $report_file:$!";
375 print OUT $results;
376 }
377
378 sub _get_matrix_parameters {
379 return { PAM30 => [9, 1],
380 PAM70 => [10, 1],
381 BLOSUM45 => [14, 2],
382 BLOSUM80 => [10, 1],
383 BLOSUM62 => [11, 1],
384 BLOSUM45 => [14, 2]
385 };
386 }
387 sub _get_genetic_codes {
388 return { 1 => "Standard (1)",
389 2 => "Vertebrate Mitochondrial (2)",
390 3 => "Yeast Mitochondrial (3)",
391 4 => "Mold Mitochondrial; ... (4)",
392 5 => "Invertebrate Mitochondrial (5)",
393 6 => "Ciliate Nuclear; ... (6)",
394 9 => "Echinoderm Mitochondrial (9)",
395 10 => "Euplotid Nuclear (10)",
396 11 => "Bacterial (11)",
397 12 => "lternative Yeast Nuclear (12)",
398 13 => "Ascidian Mitochondrial (13)",
399 14 => "Flatworm Mitochondrial (14)",
400 15 => "Blepharisma Macronuclear (15)"
401 };
402 }
403
404 sub _get_masked_sequence {
405 my $sequence = shift;
406 my $ra_mask_positions = shift;
407 my ($i,$iplus);
408 return $sequence unless ($ra_mask_positions); # nothing masked
409
410 if ($ra_mask_positions->[0] &&
411 $ra_mask_positions->[0] > $ra_mask_positions->[1]) {
412 @{$ra_mask_positions} = reverse @{$ra_mask_positions};
413 }
414
415 for ($i=0; $i < @{$ra_mask_positions}; $i=$i+2) {
416 $iplus = ($i + 1);
417 $sequence = _mask_from_x_to_y($sequence,$ra_mask_positions->[$i],
418 $ra_mask_positions->[$iplus]);
419 }
420 return $sequence;
421 }
422
423 sub _mask_from_x_to_y {
424 my $sequence = shift;
425 my $x = shift;
426 my $y = shift;
427 my $length = ($y - $x);
428 substr($sequence, $x, $length) = ('N' x $length);
429 return($sequence);
430 }
431
432
433 sub _get_mask_positions {
434 my $report_file = shift;
435 my @mask_positions = ();
436
437 open REPORT, $report_file or carp "cannot open $report_file:$!";
438 while (<REPORT>) {
439 if (/^Query:\s+(\d+)\s+.*\s+(\d+)$/) {
440 push @mask_positions,($1,$2);
441 }
442 }
443 return(\@mask_positions);
444 }
445
446
447 sub _get_sequence {
448 my $fasta_file = shift;
449 my @fasta_lines = ();
450
451 open FASTA, $fasta_file or warn "cannot open $fasta_file:$!";
452 chomp(@fasta_lines = <FASTA>);
453 my $first_line = shift @fasta_lines;
454 my $seq = join '',@fasta_lines;
455 return ($first_line,$seq);
456 }
457 sub _result_for_one {
458 my $self = shift;
459 my $attribute = shift;
460 my $count = 0;
461 my $ra_results = [];
462 $attribute = _map_attribute($attribute);
463 foreach (@{$self->{results}}) {
464 if (ref($self->{results}->[$count]->{$attribute}) eq 'ARRAY'){
465 push @{$ra_results},@{$self->{results}->[$count]->{$attribute}};
466 } else {
467 push @{$ra_results},$self->{results}->[$count]->{$attribute};
468 }
469 $count++;
470 }
471 return($ra_results);
472 }
473 sub _result_for_two {
474 my $self = shift;
475 my $attribute = shift;
476 my $id = shift;
477 my $ra_results = [];
478 $attribute = _map_attribute($attribute);
479 for (my $i = 0; $i < @{$self->{results}}; $i++) {
480 next unless ($self->{results}->[$i]->{id} eq $id);
481 if (ref($self->{results}->[$i]->{$attribute}) eq 'ARRAY'){
482 push @{$ra_results},@{$self->{results}->[$i]->{$attribute}};
483 } else {
484 push @{$ra_results},$self->{results}->[$i]->{$attribute};
485 }
486 }
487 return($ra_results);
488 }
489 sub _map_attribute {
490 my $attribute = shift;
491 my %attribute_map = ('ids' => 'id',
492 'score' => 'scores',
493 'def_line' => 'defline',
494 'def_lines' => 'defline',
495 'def-line' => 'defline',
496 'def-lines' => 'defline',
497 'deflines' => 'defline',
498 'expect' => 'expects',
499 'identity' => 'identities',
500 'gap' => 'gaps',
501 'subject_lengths' => 'subject_length',
502 'subject-length' => 'subject_length',
503 'subject-lengths' => 'subject_length',
504 'subjectlength' => 'subject_length',
505 'subjectlengths' => 'subject_length',
506 'length' => 'subject_length',
507 'lengths' => 'subject_length',
508 'match_length' => 'match_lengths',
509 'matchlengths' => 'match_lengths',
510 'matchlength' => 'match_lengths',
511 'match-length' => 'match_lengths',
512 'match-lengths' => 'match_lengths',
513 'query-frame' => 'query_frames',
514 'queryframe' => 'query_frames',
515 'query-frames' => 'query_frames',
516 'queryframes' => 'query_frames',
517 'query_frame' => 'query_frames',
518 'query-frame' => 'query_frames',
519 'subjectframe' => 'subject_frames',
520 'subject-frames' => 'subject_frames',
521 'subjectframes' => 'subject_frames',
522 'subject_frame' => 'subject_frames',
523 );
524
525 $attribute = lc($attribute);
526 $attribute = $attribute_map{$attribute} || $attribute;
527 return($attribute);
528 }
529 sub _all_results {
530 my $self = shift;
531 return () unless defined($self) && $self->{results};
532 return () unless @{$self->{results}};
533 return @{$self->{results}};
534 }
535 sub _entry_passes_filter {
536 my $entry = shift;
537 my $ra_stripped_values = shift;
538 my $ra_cmp_ops = shift;
539 my $rh_criteria = shift;
540 my @fields = keys %$rh_criteria;
541 my ($i,$code,$pass,@codes,$ra_safe_vals);
542
543 for($i=0; $i < @$ra_stripped_values; $i++) {
544 if (ref($entry->{$fields[$i]}) eq "ARRAY") {
545 my $n = 0;
546 my (@sorted,$best_result);
547 foreach (@{$entry->{$fields[$i]}}) {
548 if ($ra_cmp_ops->[$i] =~ /\~/) {
549 s/"/'/g; # cannot nest double quotes in regex
550 $code = '$pass' . " = 1 if (\"$_\" $ra_cmp_ops->[$i] ";
551 $ra_stripped_values->[$n] =~ s/\//\\\//g;
552 $code .= "/$ra_stripped_values->[$n]/i)";
553 } else {
554 $ra_safe_vals = _check_for_unsafe_e($entry->{$fields[$i]});
555 if ($ra_cmp_ops->[$i] =~ />/) {
556 @sorted = sort { $b <=> $a } @{$ra_safe_vals};
557 } else {
558 @sorted = sort { $a <=> $b } @{$ra_safe_vals};
559 }
560
561 $best_result = shift @sorted;
562 $code = '$pass' . qq! = 1 if ("$best_result" !;
563 $code .= qq!$ra_cmp_ops->[$i] '$ra_stripped_values->[$i]')!;
564 }
565 _set_passing_positions_flags ($entry,
566 $ra_safe_vals,
567 $ra_cmp_ops->[$i],
568 $ra_stripped_values->[$i]);
569 push @codes, $code;
570 $n++;
571 }
572 foreach (@codes) {
573 $pass = 0;
574 eval $_;
575 carp "eval on $_ failed: $@" if $@;
576 }
577 return undef unless $pass;
578 } else {
579 if ($ra_cmp_ops->[$i] =~ /\~/) {
580 # cannot nest double quotes in regex
581 $entry->{$fields[$i]} =~ s/"/'/g;
582 $code = '$pass' . qq! = 1 if ("$entry->{$fields[$i]}" !;
583 $code .= "$ra_cmp_ops->[$i] ";
584 $ra_stripped_values =~ s/\//\\\//g;
585 $code .= "/$ra_stripped_values->[$i]/i)\;";
586 } else {
587 $entry->{$fields[$i]} =~ s/^e-(\d*)$/1e-$1/;
588 $code = '$pass' . qq! = 1 if ("$entry->{$fields[$i]}" !;
589 $code .= "$ra_cmp_ops->[$i] ";
590 $code .= qq!'$ra_stripped_values->[$i]')\;!;
591 }
592 $pass = 0;
593
594 eval $code;
595 return undef unless $pass;
596 croak "eval on $code failed: $@" if $@;
597 }
598 }
599 return $entry;
600 }
601
602 sub _set_passing_positions_flags {
603 my $entry = shift;
604 my $ra_safe_vals = shift;
605 my $cmp_op = shift;
606 my $cmp_val = shift;
607 my $count = 0;
608 my $code = '';
609 my @array_positions = ();
610
611 $entry->{passing_positions} = [];
612 foreach (@{$ra_safe_vals}) {
613 $code = qq^if ("$_" $cmp_op "$cmp_val") { ^;
614 $code .= q^ $entry->{passing_positions}->[$count] = "pass"; ^;
615 $code .= q^} else { ^;
616 $code .= q^ $entry->{passing_positions}->[$count] = "fail"; }^;
617 eval $code;
618 carp "eval on $code failed: $@" if $@;
619 $count++;
620 }
621 return;
622 }
623 ###########################################################################
624 # sub _check_for_unsafe_e #
625 ###########################################################################
626 # When sorting BLAST values that have an initial e (e.g. `e-122')
627 # they will not sort. so we need to put a 1 if front (e.g. `1e-122')
628 ###########################################################################
629 sub _check_for_unsafe_e {
630 my $ra_unsafe_vals = shift;
631 my @safe_vals = ();
632
633 foreach (@{$ra_unsafe_vals}) {
634 $_ =~ s/^e-(\d*)$/1e-$1/;
635 push @safe_vals,$_;
636 }
637 return \@safe_vals;
638 }
639
640 sub _get_stripped_values_and_comparison_operators {
641 my $rh_criteria = shift;
642 my ($key,$value,@stripped_values,@cmp_ops);
643 my $count = 0;
644
645 while (($key,$value) = each %$rh_criteria) {
646 $key = lc $key;
647 ($stripped_values[$count],$cmp_ops[$count]) =
648 _check_for_comparison_operators($key,$value);
649 $count++;
650 }
651 return(\@stripped_values,\@cmp_ops);
652 }
653 sub _check_for_comparison_operators {
654 my $key = shift;
655 my $value = shift;
656 my ($cmp_op,$new_value);
657 if ($value =~ /^(<=|>=|=~|~|!~|>|<|=|==)(.*)/) {
658 if ($1 eq '~') {
659 $cmp_op = '=~';
660 } elsif ($1 eq '=') {
661 $cmp_op = '==';
662 } else {
663 $cmp_op = $1;
664 }
665 $new_value = $2;
666 } else {
667 $cmp_op = $DEFAULT_FILTERS{$key};
668 $new_value = $value;
669 }
670 return($new_value,$cmp_op);
671 }
672 sub _parse_report {
673 my $self = shift;
674 my $report_file = $self->{report_file};
675 $self->{been_parsed} = 1;
676 $self->{results} = [];
677 unless ($self->{report_file_handle}) {
678 open REPORT, $report_file or carp "cannot open $report_file:$!";
679 $self->{report_file_handle} = \*REPORT;
680 }
681 my $hit_new_seq = 0;
682 my $hit_a_seq = 0;
683 my $hit_first_seq_line = 0;
684 my $hit_first_hsp = 0;
685 my $hit_summaries = 0;
686 my $hit_query = 0;
687 my $hit_database = 0;
688 my ($id,$defline,$subject_length,@scores,@expects,@identities,
689 @match_lengths,@query_starts,@subject_starts,$strand,$q_strand,
690 $last_end_point,$q_last_end_point,@subject_strands,
691 @query_strands,$query,$query_length,$e,@query_seqs,
692 @subject_seqs,$query_seq,$subject_seq,@query_frames,@subject_frames,
693 @query_ends, @subject_ends);
694 my $id_regex = $self->{-DB_ID_REGEX}
695 || '[^\|]+(?:\|[^\|,\s]*){1,10}';
696 $id_regex = "($id_regex)"; # add capturing parens
697 while (<REPORT>) {
698 $hit_summaries = 1 if (/^Sequences producing significant alignments/);
699 $hit_a_seq = 1 if (/^>/);
700
701 # Gather Query defline and Length
702 if ($hit_query == 0 && /^Query=\s+(.*)/) {
703 $query = $1;
704 $hit_query = 1;
705 } elsif ($hit_query == 1) {
706 if (/\(([0-9,]+) letters\)/) {
707 $query_length = $1;
708 $query_length =~ s/,//g;
709 $hit_query = 0;
710 } else {
711 chomp;
712 $query .= $_;
713 }
714 }
715 # Gather Database information
716 if (/^Database:\s+(.*)/) {
717 $self->{database_description} = $1;
718 $hit_database = 1;
719 } elsif ($hit_database &&
720 /^\s+([\d,]+) sequences; ([\d,]+) total letters/) {
721 $self->{database_sequence_count} = $1;
722 $self->{database_letter_count} = $2;
723 $hit_database = 0;
724 } elsif ($hit_database && /^\s*(.*)$/) {
725 $self->{database_description} .= $1;
726 }
727
728 if (!$self->{program} && $_ =~ /^(T?BLAST[XNP]) ([\d.]+)?/) {
729 $self->{program} = $1;
730 $self->{blast_version} = $2;
731 }
732 $self->{options}->{p} = lc($1) if (/^(T?BLAST[XNP]?)/);
733 next unless ($hit_a_seq);
734 if (/^>$id_regex\s+(.*)/) {
735 if ($id) {
736 # catches the starting point for the previous reverse strand
737 push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
738 push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
739 push @subject_starts,$last_end_point if ($strand eq 'minus');
740 push @subject_ends,$last_end_point if ($strand eq 'plus');
741 push @subject_seqs,$subject_seq if ($subject_seq);
742 push @query_seqs,$query_seq if ($query_seq);
743
744 $self->_push_new_results($id,$defline,$subject_length,
745 $query,$query_length,
746 \@scores,\@expects,\@identities,\@match_lengths,
747 \@query_starts,\@query_ends,\@subject_starts,\@subject_ends,
748 \@subject_strands,\@query_strands,\@subject_seqs,
749 \@query_seqs,\@query_frames,\@subject_frames);
750 }
751 # initialize arrays for new sample
752 @scores = (); @expects = (); @identities = (); @match_lengths = ();
753 @query_starts = (); @query_ends = (); @subject_strands = ();
754 @query_strands = (); @subject_starts = (); @subject_ends = ();
755 $strand = ''; $q_strand = ''; $last_end_point = '';
756 $q_last_end_point = ''; $subject_seq = ''; $query_seq = '';
757 @subject_seqs = (); @query_seqs = (); @query_frames = ();
758 @subject_frames = (); $hit_new_seq = 1;
759 $id = $1;
760 $defline = $2;
761 } elsif (/\s+Length = (\d+)/) {
762 $hit_new_seq = 0;
763 $subject_length = $1;
764 } elsif ($hit_new_seq == 1) {
765 chomp();
766 $defline .= $_;
767 } elsif (/^\s*Score[^=]*=\s*([0-9\.-]*)[^E]*Expect[^=]*=\s*([0-9\.e-]*)/) {
768 # catches the starting point for the previous reverse strand
769 push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
770 push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
771 push @subject_starts,$last_end_point if ($strand eq 'minus');
772 push @subject_ends,$last_end_point if ($strand eq 'plus');
773 push @subject_seqs,$subject_seq if ($subject_seq);
774 push @query_seqs,$query_seq if ($query_seq);
775 $subject_seq = ''; $query_seq = '';
776
777 push @scores,$1;
778 push @expects,$2;
779 $hit_first_seq_line = 0;
780 } elsif (/^\s*Identities[^=]*=\s*(\d+)\/(\d+)/) {
781 my $identity = $1 / $2;
782 push @identities,$identity;
783 push @match_lengths,$2;
784 $hit_first_seq_line = 0;
785 } elsif (/^\s*Frame = ([-,+]\d)(?: \/ ([-,+]\d))?/) {
786 push @query_frames, $1;
787 push @subject_frames, $2 if ($2);
788 } elsif (/Query:\s+(\d+)\s+(\S+)\s+(\d+)/) {
789 if (($1 > $3) && $hit_first_seq_line == 0) {
790 $q_strand = 'minus';
791 push @query_ends,$1;
792 } elsif (($1 < $3) && $hit_first_seq_line == 0) {
793 $q_strand = 'plus';
794 push @query_starts,$1;
795 }
796 push @query_strands,$q_strand if ($hit_first_seq_line == 0);
797 $q_last_end_point = $3;
798 $query_seq .= $2;
799 $hit_first_seq_line++;
800 } elsif (/^Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)/) {
801 if (($1 > $3) && $hit_first_seq_line == 1) {
802 $strand = 'minus';
803 push @subject_ends,$1;
804 } elsif (($1 < $3) && $hit_first_seq_line == 1) {
805 $strand = 'plus';
806 push @subject_starts,$1;
807 }
808 push @subject_strands,$strand if ($hit_first_seq_line == 1);
809 $last_end_point = $3;
810 $subject_seq .= $2;
811 $hit_first_seq_line++;
812 }
813 }
814 # catch the last one.
815 my $ra_safe_expects = _check_for_unsafe_e(\@expects);
816 if ($id) {
817 # catches the starting point for the previous reverse strand
818 push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
819 push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
820 push @subject_starts,$last_end_point if ($strand eq 'minus');
821 push @subject_ends,$last_end_point if ($strand eq 'plus');
822 push @subject_seqs,$subject_seq if ($subject_seq);
823 push @query_seqs,$query_seq if ($query_seq);
824 $self->_push_new_results($id,$defline,$subject_length,
825 $query,$query_length,\@scores,$ra_safe_expects,\@identities,
826 \@match_lengths,\@query_starts,\@query_ends,\@subject_starts,
827 \@subject_ends,\@subject_strands,\@query_strands,
828 \@subject_seqs,\@query_seqs,\@query_frames,\@subject_frames);
829 }
830 }
831
832 sub _push_new_results {
833 my $self = shift;
834 my ($id,$defline,$subject_length,$query,$query_length,$ra_scores,
835 $ra_expects,$ra_identities,$ra_match_lengths,$ra_query_starts,
836 $ra_query_ends,$ra_subject_starts,$ra_subject_ends,$ra_subject_strands,
837 $ra_query_strands,$ra_subject_seqs,$ra_query_seqs,$ra_query_frames,
838 $ra_subject_frames) = @_;
839
840 my $rh_hash = {'id' =>$id
841 , 'defline' =>$defline
842 , 'subject_length' =>$subject_length
843 , 'query' =>$query
844 , 'query_length' =>$query_length
845 , 'scores' => [ @{$ra_scores} ]
846 , 'expects' => [ @{$ra_expects} ]
847 , 'identities' => [ @{$ra_identities} ]
848 , 'match_lengths' => [ @{$ra_match_lengths} ]
849 , 'query_starts' => [ @{$ra_query_starts} ]
850 , 'query_ends' => [ @{$ra_query_ends} ]
851 , 'subject_starts' => [ @{$ra_subject_starts} ]
852 , 'subject_ends' => [ @{$ra_subject_ends} ]
853 , 'subject_strands' => [ @{$ra_subject_strands} ]
854 , 'query_strands' => [ @{$ra_query_strands} ]
855 , 'subject_seqs' => [ @{$ra_subject_seqs} ]
856 , 'query_seqs' => [ @{$ra_query_seqs} ]
857 , 'query_frames' => []
858 , 'subject_frames' => []
859 };
860 # only true in blastx or tblastx
861 if ($ra_query_frames && $self->{options}->{p} =~ /blastx/i) {
862 push @{$rh_hash->{query_frames}}, @{$ra_query_frames};
863 }
864 # only true in tblastn
865 if ($ra_query_frames && $self->{options}->{p} =~ /tblastn/i) {
866 push @{$rh_hash->{subject_frames}}, @{$ra_query_frames};
867 }
868 # only true in tblastx
869 if ($self->{options}->{p} =~ /tblastx/i) {
870 push @{$rh_hash->{subject_frames}}, @{$ra_subject_frames};
871 }
872 push @{$self->{results}}, $rh_hash;
873 }
874 sub _get_args {
875 my $rh_opts = shift;
876 my $tmp_dir = _get_temp_directory();
877 my ($key,$value,$outfile);
878 my @args = ();
879 my $tmp_file = "";
880 while (($key,$value) = each %$rh_opts) {
881 push @args, "-$key$value";
882 $outfile = $value if ($key eq "o");
883 }
884 unless ($outfile) {
885 my $base = sprintf("%s/%s-%d-%d-0", $tmp_dir, ,"Blastall", $$, time());
886 $tmp_file = _generate_temp_file($base);
887 push @args, "-o$tmp_file";
888 }
889 return(\@args,$tmp_file);
890 }
891 sub _get_formatdb_args {
892 my $rh_opts = shift;
893 my @args = ();
894 foreach my $k (keys %{$rh_opts}) {
895 push @args, "-${k}$rh_opts->{$k}";
896 }
897 return \@args;
898 }
899 sub _get_wu_args {
900 my $rh_opts = shift;
901 my ($key,$value);
902 my $tmp_dir = _get_temp_directory();
903 my $base_name = sprintf("%s/%s-%d-%d-0", $tmp_dir, ,"Blastall", $$, time());
904 my $tmp_file = _generate_temp_file($base_name);
905 my $program = $WU_BLAST{$rh_opts->{p}} || $WU_BLAST{wu_blastn};
906 my $database = $rh_opts->{d} || 'nr';
907 my $infile = $rh_opts->{i};
908 my $outfile = $rh_opts->{o};
909 my @args = ($program, $database, $infile);
910 while (($key,$value) = each %${rh_opts}) {
911 next if ($key eq 'p' || $key eq 'd' || $key eq 'i' || $key eq 'o');
912 if ($value eq '!' || $value eq '') {
913 push @args, "-$key";
914 } else {
915 push @args, "-$key=$value";
916 }
917 }
918 # $rh_opts->{o} ? push @args, "> $rh_opts->{o}" : push @args, "> $tmp_file";
919 return(\@args,$tmp_file);
920 }
921 sub _get_temp_directory {
922 return($ENV{TMPDIR}) if ($ENV{TMPDIR});
923 return($ENV{TEMP}) if ($ENV{TEMP});
924 return($ENV{TMP}) if ($ENV{TMP});
925 return('/tmp') if (-d '/tmp' && -w '/tmp');
926 return('/var/tmp') if (-d '/var/tmp' && -w '/var/tmp');
927 return('/usr/tmp') if (-d '/usr/tmp' && -w '/usr/tmp');
928 return('/temp') if (-d '/temp' && -w '/temp');
929 return('.') if (-w '.');
930 $_ = 'Cannot find a place to write tempfiles. ';
931 $_ .= 'Please set the TMPDIR environmental variable. ';
932 return undef;
933 }
934 sub _generate_temp_file {
935 my $base_name = shift;
936 my $count = 0;
937 while ($count < 1000) {
938 return ($base_name) unless (-e $base_name); # don't clobber existing
939 $base_name =~ s/-?(\d+)?$/"-" . (1 + $1)/e;
940 $count++;
941 }
942 return undef;
943 }
944
945 1;
946 __END__
947
948 =head1 NAME
949
950 NHGRI::Blastall - Perl extension for running and parsing NCBI's BLAST 2.x
951
952 =head1 SYNOPSIS
953
954 =over 4
955
956 =head1 DESCRIPTION
957
958 If you have NCBI's BLAST2 or WU-BLAST installed locally and your
959 environment is already setup you can use Perl's object-oriented
960 capabilities to run your BLASTs. Also if you have a blastcl3 binary
961 from the toolkit (or binaries from our FTP site) you can run BLAST over
962 the network. There are also methods to blast single sequences against
963 each other using the bl2seq binaries (also in the toolkit and binaries).
964 You can blast one sequence against a library of sequences using the
965 blast_one_to_many method. You can format databases with formatdb method.
966 You can also have NHGRI::Blastall read existing BLAST
967 reports. If you have a database of repetitive DNA or other DNA you would
968 like to mask out, you can use the mask method to mask the data against
969 these databases. You can then use either the filter or result methods
970 to parse the report and access the various elements of the data.
971
972
973 =item RUNNING NEW BLASTS
974
975 use NHGRI::Blastall;
976 my $b = new NHGRI::Blastall;
977 # If you are running NCBI's Local BLAST
978 $b->blastall( p => 'blastn',
979 d => 'nr',
980 i => 'infile',
981 o => 'outfile'
982 );
983 # If you are running NCBI's blastcl3 network client
984 $b->blastcl3( p => 'blastn',
985 d => 'nr',
986 i => 'infile',
987 o => 'outfile'
988 );
989 # If you are running WU-BLAST locally
990 $b->wu_blastall( p => 'blastn',
991 d => 'nr',
992 nogap => '!', #use ! for arguments w/o parameter
993 i => 'infile',
994 o => 'outfile'
995 );
996
997 See BLASTALL for more info
998
999 =item BLASTING 2 SEQUENCES
1000
1001 use NHGRI::Blastall;
1002 my $b = new NHGRI::Blastall;
1003 $b->bl2seq(i => 'seq1',
1004 j => 'seq2',
1005 p => 'tblastx'
1006 );
1007
1008 See BL2SEQ for more info
1009
1010 =item BLASTING 1 SEQUENCE AGAINST A FASTA LIBRARY OF SEQUENCES
1011
1012 # a library is a FASTA file with multiple FASTA formatted sequences
1013 use NHGRI::Blastall;
1014 my $b = new NHGRI::Blastall;
1015 $b->blast_one_to_many(i => 'seq1',
1016 d => 'seq2.lib',
1017 p => 'tblastx',
1018 );
1019
1020 See BLAST_ONE_TO_MANY for more info
1021
1022 =item INITIALIZING EXISTING BLAST REPORTS
1023
1024 use NHGRI::Blastall;
1025 my $b = new NHGRI::Blastall;
1026 $b->read_report('/path/to/report');
1027
1028 =item MASKING SEQUENCES
1029
1030 use NHGRI::Blastall;
1031 my $b = new NHGRI::Blastall;
1032 $masked_seq = $b->mask( type => 'wu_blastall',
1033 p => 'blastn',
1034 d => 'alu',
1035 i => 'infile'
1036 );
1037
1038 See MASKING for more info
1039
1040 =item CREATING BLAST INDEXES
1041
1042 use NHGRI::Blastall;
1043 my $b = new NHGRI::Blastall;
1044 $b->formatdb( i => 'est',
1045 p => 'F',
1046 o => 'T',
1047 );
1048
1049 See FORMATDB for more info
1050
1051 =item PRINTING REPORTS
1052
1053 $b->print_report();
1054 # this method only opens the report and prints. It does not print
1055 # summary reports
1056
1057 =item FILTERING BLAST RESULTS
1058
1059 @hits = $b->filter( scores => '38.2',
1060 identities => '.98'
1061 );
1062
1063 # returns an array of hash references.
1064 See HASHREF for more info on manipulating the results.
1065 See FILTERING for more info on using the filter method
1066
1067 =item GETTING AT ELEMENTS
1068
1069 @ids = $b->result('id');
1070 @scores = $b->result('scores',$ids[0]); # second param must be an id
1071
1072 See RESULT for more info on using the result method
1073 See ELEMENTS for element names
1074
1075 =item GETTING AT ALL THE DATA
1076
1077 @results = $b->result(); # returns an array of hashes
1078
1079 See HASHREF for information on the array of hashes that is returned.
1080 See DUMP RESULTS to see how to work with the array of hashes
1081
1082 =item ADJUSTING THE DEFLINE REGEX
1083
1084 $b = new NHGRI::Blastall (-DB_ID_REGEX => '[^ ]+');
1085
1086 See DB_ID_REGEX for more info
1087
1088 =back
1089
1090 =head1 BLASTALL
1091
1092 This method provides a simple object oriented frontend to BLAST.
1093 This module works with either NCBI's blastall binary distributed with
1094 BLAST 2.x, WU-BLAST or over the web through NCBI's Web Site.
1095 The blastall function accepts as a parameter an anonymous hash with keys
1096 that are the command line options (See BLASTALL OPTIONS) and values
1097 which are the corresponding values to those options. You may want to
1098 set the BLASTALL variable in Blastall.pm to the full path of your
1099 `blastall' binary, especially if you will be running scripts as cron
1100 jobs or if blastall is not in the system path.
1101
1102 =head1 BLASTALL OPTIONS
1103
1104 For wu_blastall you need to use NCBI type switches for the following
1105 [CB<-i>] for infile
1106 [CB<-o>] for outfile
1107 [CB<-p>] for program
1108 [CB<-d>] for database
1109 the rest of the parameters MUST be the parameters available through WU-BLAST
1110 (e.g. -sump, -nogap -compat1.4, etc.)
1111 use a `!' to specify that an argument has no parameters. See the example
1112 at the top of the manpage.
1113
1114 These are the options that NCBI's blastall and binary accepts and these
1115 are the same options that are accepted by the blastall and blastcl3 methods.
1116 NOTE: You must set the proper environmental variables for the blastall
1117 method to work (BLASTDB,BLASTMAT).
1118
1119 =over 4
1120
1121 =item
1122
1123 B<p> => Program Name
1124
1125 =item
1126
1127 B<d> => Database default=nr
1128
1129 =item
1130
1131 B<i> => QueryFile
1132
1133 =item
1134
1135 B<e> => Expectation vaule (E) default=10.0
1136
1137 =item
1138
1139 B<m> => alignment view default=0
1140 0 = pairwise,
1141 1 = master-slave showing identities,
1142 2 = master-slave no identities,
1143 3 = flat master-slave, show identities,
1144 4 = flat master-slave, no identities,
1145 5 = master-slave no identities and blunt ends,
1146 6 = flat master-slave, no identities and blunt ends
1147
1148 =item
1149
1150 B<o> => BLAST report Output File default=stdout
1151
1152 =item
1153
1154 B<F> => Filter query sequence default=T
1155 (DUST with blastn, SEG with others)
1156
1157 =item
1158
1159 B<G> => Cost to open a gap default=0
1160 (zero invokes default behavior)
1161
1162 =item
1163
1164 B<E> => Cost to extend a gap default=0
1165 (zero invokes default behavior)
1166
1167 =item
1168
1169 B<X> => X dropoff value for gapped alignment (in bits) default=0
1170 (zero invokes default behavior)
1171
1172 =item
1173
1174 B<I> => Show GI's in deflines default=F
1175
1176 =item
1177
1178 B<q> => Penalty for a nucleotide mismatch (blastn only) default=-3
1179
1180 =item
1181
1182 B<r> => Reward for a nucleotide match (blastn only) default=1
1183
1184 =item
1185
1186 B<v> => Number of one-line descriptions (V) default=500
1187
1188 =item
1189
1190 B<b> => Number of alignments to show (B) default=250
1191
1192 =item
1193
1194 B<f> => Threshold for extending hits, default if zero default=0
1195
1196 =item
1197
1198 B<g> => Perfom gapped alignment (NA with tblastx) default=T
1199
1200 =item
1201
1202 B<Q> => Query Genetic code to use default=1
1203
1204 =item
1205
1206 B<D> => DB Genetic code (for tblast[nx] only) default=1
1207
1208 =item
1209
1210 B<a> => Number of processors to use default=1
1211
1212 =item
1213
1214 B<O> => SeqAlign file Optional
1215
1216 =item
1217
1218 B<J> => Believe the query defline default=F
1219
1220 =item
1221
1222 B<M> => Matrix default=BLOSUM62
1223
1224 =item
1225
1226 B<W> => Word size, default if zero default=0
1227
1228 =item
1229
1230 B<z> => Effective length of the database default=0
1231 (use zero for the real size)
1232
1233 =item
1234
1235 B<K> => Number of best hits from a region to keep default=100
1236
1237 =item
1238
1239 B<L> => Length of region used to judge hits default=20
1240
1241 =item
1242
1243 B<Y> => Effective length of the search space default=0
1244 (use zero for the real size)
1245
1246 =item
1247
1248 B<S> => Query strands to search against database default=3
1249 (for blast[nx], and tblastx).
1250 3 is both, 1 is top, 2 is bottom
1251
1252 =item
1253
1254 B<T> => Produce HTML output [T/F] default=F
1255
1256 =item
1257
1258 B<l> => Restrict search of database to list of GI's [String]
1259
1260 =back
1261
1262 NOTE: If you do not supply an `o' option (outfile),
1263 the following environment variables are checked in order:
1264 `TMPDIR', `TEMP', and `TMP'.
1265 If one of them is set, outfiles are created relative to the
1266 directory it specifies. If none of them are set, the first
1267 possible one of the following directories is used:
1268 /var/tmp , /usr/tmp , /temp , /tmp ,
1269 This file is deleted after the NHGRI::Blastall object is destroyed.
1270 It is recommended that you create a tmp directory in your home
1271 directory and set one of the above environmental vars to point
1272 to this directory and then set the permissions on this directory
1273 to 0700. Writing to a "public" tmp directory can have
1274 security ramifications.
1275
1276 =head1 BL2SEQ
1277
1278 This method uses the bl2seq binary (distributed with BLAST executables
1279 and source) to BLAST one sequence against another sequence.
1280 Like the blastall method the bl2seq method accepts the same options
1281 that the bl2seq binary accepts. Run bl2seq without options from the
1282 command line to get a full list of options. An important note about
1283 the options, when running blastx 1st sequence should be nucleotide;
1284 when running tblastn 2nd sequence should be nucleotide.
1285
1286 use NHGRI::Blastall;
1287 my $b = new NHGRI::Blastall;
1288 $b->bl2seq(i => 'seq1.nt',
1289 j => 'seq2.aa',
1290 p => 'blastx'
1291 );
1292
1293
1294 =head1 BLAST_ONE_TO_MANY
1295
1296 This method allows for blasting one sequence against a FASTA library
1297 of sequences. Behind the scenes, BLAST indexes are created (in the
1298 same directory as the FASTA library) using the provided FASTA library
1299 and the one sequence is used to search against this database.
1300 If the program completes successfully, the databases are removed.
1301 To compare two sequences, use the bl2seq method which is faster and
1302 less messy (no tmp indexes). This method accepts the same options
1303 as the blastall binary with the d option corresponding to the
1304 FASTA library.
1305
1306 use NHGRI::Blastall;
1307 my $b = new NHGRI::Blastall;
1308 $b->blast_one_to_many(i => 'seq.aa',
1309 d => 'seq.nt.lib',
1310 e => '0.001',
1311 p => 'tblastn',
1312 );
1313
1314 =head1 MASKING
1315
1316 Screens DNA sequences in fasta format against the database specified
1317 in the blastall 'd' option. The mask method accepts the same parameters
1318 as the blastall method. Any matches to the masking database will be
1319 substituted with "N"s. The mask method returns the masked sequence.
1320 Performs similar function as xblast, an old NCBI program written in C.
1321
1322 Set the type parameter to wu_blastall, blastcl3 or blastall
1323 depending on your configuration.
1324
1325 $masked_seq = $b->mask( type => 'blastcl3', # defaults to blastall
1326 p => 'blastn',
1327 d => 'alu',
1328 i => 'infile'
1329 );
1330
1331 To get the mask coordinates back call the mask method in an array
1332 context.
1333
1334 @mask = $b->mask(p => 'blastn',
1335 d => 'alu',
1336 i => 'infile'
1337 );
1338 $masked_seq = $mask[0]; # same as above masked seq
1339 $ra_masked_coords = $mask[1]; # reference to array of mask coordinates
1340
1341 =head1 FORMATDB
1342
1343 This method creates BLAST indexes using the formatdb binary which
1344 is distributed with BLAST. It accepts the same parameters as formatdb.
1345 The remove_formatdb_indexes method will remove databases created
1346 using the formatdb method (if called by the same object).
1347 formatdb leaves a file called formatdb.log by default in the current
1348 working directory (if it has permission). To change this behavior,
1349 use the l option to direct the sequence to /dev/null or somewhere else.
1350
1351 use NHGRI::Blastall;
1352 my $b = new NHGRI::Blastall;
1353 $b->formatdb( i => 'swissprot',
1354 p => 'T',
1355 l => '/dev/null',
1356 o => 'T',
1357 );
1358
1359
1360 =head1 DB_ID_REGEX
1361
1362 By default Blastall.pm expects FASTA deflines of BLAST databases to be
1363 formatted like Genbank database (gi|GINUMBER|DATABASE|ACCESSION|SUMMARY).
1364 The default regular expression is [^\|]+(?:\|[^\|,\s]*){1,10}
1365 When using non-genbankformatted deflines, it may become necessary to
1366 adjust the regular expression that identifies the unique identifier
1367 in a defline. This can be done with the -DB_ID_REGEX parameter to the
1368 new method. For example
1369
1370 $b = new NHGRI::Blastall( -DB_ID_REGEX => '[^ ]+' );
1371
1372 =head1 FILTERING
1373
1374 The filter method accepts an anonymous hash in which the keys are
1375 elements of the blast report and the values are limits that
1376 are put on the result set.
1377
1378 The following are the Filter elements and their default operations.
1379
1380 id => regular expression match
1381 defline => regular expression match
1382 subject_length => greater than
1383 scores => greater than
1384 expects => less than
1385 identities => greater than
1386 match_length => greater than
1387 subject_strand => equals
1388 query_frames => equals
1389 subject_frames => equals
1390
1391 so if you would like to limit your results to entries that have scores
1392 greater than 38.2 and identities greater than 98% you would say...
1393
1394 @hits = $b->filter( scores => '38.2',
1395 identities => '.98'
1396 );
1397
1398 you can also override the defaults. if you would like only scores
1399 that are less than 38.2 you could say...
1400
1401 @hits = $b->filter( scores => '<38.2' );
1402
1403 or if you wanted only identities that were equal to 1 and you didn't
1404 care about the hits array you could say...
1405
1406 $b->filter( identities => '=1' );
1407
1408 Regular expression matches are case insensitive. If you wanted
1409 only records with the word "human" in the defline you could say...
1410
1411 @hits = $b->filter( defline => 'HuMaN' );
1412
1413 After you run the filter method on an object the object only contains
1414 those results which passed the filter. This will effect additional
1415 calls to the filter method as well as calls to other methods
1416 (e.g. result). To reset the NHGRI::Blastall object you can use
1417 the unfilter method.
1418
1419 $b->unfilter();
1420
1421 See DUMP RESULTS for info on how to manipulate the array of hash refs.
1422
1423
1424 =head1 RESULT
1425
1426
1427 The result method has 3 possible invocations. The first invocation
1428 is when it is called without parameters.
1429
1430 @results = $b->result();
1431
1432 This invocation returns an array of hash references.
1433 See HASHREF for further explanation of this structure.
1434
1435 To get a list of all the ids do...
1436
1437 @ids = $b->result('id');
1438
1439 These ids can be used to get at specific elements. If 2 parameters
1440 are present and the first one is an element (See ELEMENTS for a list
1441 of ELEMENTS) and the second one is an id then the routine will
1442 return a list of elements corresponding to the id.
1443
1444 @scores = $b->result('scores',$ids[0]); # second param must be an id
1445
1446 If more than 2 elements are passed the function will return undef.
1447
1448 =head1 ACCESSOR METHODS
1449
1450 =over 4
1451
1452 =item B<get_report>
1453
1454 returns the filename of the BLAST report.
1455
1456 =item B<get_database_description>
1457
1458 returns description given to the database during formatting of db.
1459 e.g. All non-redundant GenBank CDStranslations+PDB+SwissProt+PIR
1460
1461 =item B<get_database_sequence_count>
1462
1463 returns the number of sequences in the database.
1464
1465 =item B<get_database_letter_count>
1466
1467 returns the number of total letters in the database.
1468
1469 =item B<get_blast_program>
1470
1471 returns the BLAST program name that appears at the top of the report.
1472 either BLASTN, BLASTP, BLASTX, TBLASTN or TBLASTX
1473
1474 =item B<get_blast_version>
1475
1476 returns the version of the BLAST program that was used.
1477
1478 =back
1479
1480 =head1 ELEMENTS
1481
1482 =over 4
1483
1484 =item B<id>
1485
1486 an example of an id is `>gb|U19386|MMU19386' the initial `>'
1487 is just a flag. The next characters up until the first pipe
1488 is the database the subject was taken from. The next characters
1489 up to the next pipe is the Genbank accession number. The last
1490 characters are the locus. This element is used as a unique
1491 identifier by the NHGRI::Blastall module.
1492 (SCALAR)
1493
1494 =item B<defline>
1495
1496 The definition line taken from the subject
1497 (SCALAR)
1498
1499 =item B<subject_length>
1500
1501 This is the length of the full subject, not the
1502 length of the match.
1503 (SCALAR)
1504
1505 =item B<scores>
1506
1507 This is score (in bits) of the match.
1508 (ARRAY)
1509
1510 =item B<expects>
1511
1512 This is the statistical significance (`E value') for the match.
1513 (ARRAY)
1514
1515 =item B<identities>
1516
1517 This is the number of identities divided by the match
1518 length in decimal format. (Listed as a fraction and a percentage
1519 in a BLAST report.)
1520 (ARRAY)
1521
1522 =item B<match_lengths>
1523
1524 this is the number of base pairs that match up.
1525 (ARRAY)
1526
1527 =item B<query_starts>
1528
1529 This is the number of the first base which matched
1530 with the subject.
1531 (ARRAY)
1532
1533 =item B<query_ends>
1534
1535 This is the number of the last base which matched
1536 with the subject.
1537 (ARRAY)
1538
1539 =item B<subject_starts>
1540
1541 This is the number of the first base which matched
1542 with the query.
1543 (ARRAY)
1544
1545 =item B<subject_ends>
1546
1547 This is the number of the last base which matched
1548 with the query.
1549 (ARRAY)
1550
1551 =item B<subject_strands>
1552
1553 This is either plus or minus depending on the orientation
1554 of the subject sequence in the match.
1555 (ARRAY)
1556
1557 =item B<query_strands>
1558
1559 This is either plus or minus depending on the orientation
1560 of the query sequence in the match.
1561 (ARRAY)
1562
1563 =item B<query_frames>
1564
1565 If you are running a blastx or tblastx search in which the
1566 query_sequence is translated this is the frame the query
1567 sequence matched.
1568 (ARRAY)
1569
1570 =item B<subject_frames>
1571
1572 If you are running a tblastn or tblastx search in which the
1573 subject sequence is translated, this is the frame where the
1574 subject sequence matched.
1575 (ARRAY)
1576
1577 =back
1578
1579 =head1 HASHREF
1580
1581 Each hash ref contains an id, defline and subject Length. Because
1582 there can be multiple scores, expect values, Identities, match_lengths,
1583 query_starts, query_strands and subject_starts, these are stored
1584 as array references. The following is an array containing two hash
1585 refs.
1586
1587 @hits = (
1588 {'id' => '>gb|U79716|HSU79716',
1589 'defline' => 'Human reelin (RELN) mRNA, complete cds',
1590 'subject_length' => '11580',
1591 'scores' => [ 684, 123 ],
1592 'expects' => [ 0.0, 3e-26 ],
1593 'identities' => [ .99430199, .89256198 ],
1594 'match_lengths' => [ 351, 121 ],
1595 'query_starts' => [ 3, 404 ],
1596 'query_ends' => [ 303, 704 ],
1597 'subject_starts' => [ 5858, 6259 ],
1598 'subject_ends' => [ 6158, 6559 ],
1599 'subject_strands' => [ 'plus', 'minus' ],
1600 'query_strands' => [ 'plus', 'plus' ],
1601 'query_frames' => [ '+1', '-3' ],
1602 'subject_frames' => [ '+2', '-1' ],
1603 },
1604 {'id' => '>gb|U24703|MMU24703',
1605 'defline' => 'Mus musculus reelin mRNA, complete cds',
1606 'subject_length' => '11673',
1607 'scores' => [ 319, 38.2 ],
1608 'expects' => [ 2e-85, 1.2 ],
1609 'identities' => [ .86455331, 1 ],
1610 'match_lengths' => [ 347, 19 ],
1611 'query_starts' => [ 3, 493 ],
1612 'query_ends' => [ 303, 793 ],
1613 'subject_starts' => [ 5968, 6457 ]
1614 'subject_ends' => [ 6268, 6757 ],
1615 'subject_strands' => [ 'plus', 'minus' ],
1616 'query_strands' => [ 'plus', 'plus' ],
1617 'query_frames' => [ '+3', '-3' ],
1618 'subject_frames' => [ '+1', '-2' ],
1619 }
1620 );
1621
1622 See ELEMENTS for explanation of each element.
1623 See DUMP RESULTS and/or the perlref(1) manpage for clues on working
1624 with this structure.
1625
1626 =head1 DUMP RESULTS
1627
1628 When calling the result function or with no parameters, or calling the
1629 filter function, an array of references to hashes is returned.
1630 Each elment of the array is a reference to a hash containing 1 record.
1631 See HASHREF for details on this structure. The following
1632 routine will go through each element of the array of hashes and
1633 then print out the element and it's corresponding value or values.
1634 See perlref(1) for more info on references.
1635
1636 sub dump_results {
1637 foreach $rh_r (@results) {
1638 while (($key,$value) = each %$rh_r) {
1639 if (ref($value) eq "ARRAY") {
1640 print "$key: ";
1641 foreach $v (@$value) {
1642 print "$v ";
1643 }
1644 print "\n";
1645 } else {
1646 print "$key: $value\n";
1647 }
1648 }
1649 }
1650 }
1651
1652 =head1 AUTHOR
1653
1654 =over 4
1655
1656 =item
1657
1658 Joseph Ryan (jfryan@nhgri.nih.gov)
1659
1660 =back
1661
1662 =head1 CONTACT ADDRESS
1663
1664 If you have problems, questions, comments send to webblaster@nhgri.nih.gov
1665
1666 =head1 COPYRIGHT INFORMATION
1667
1668 This software/database is "United States Government Work" under the
1669 terms of the United States Copyright Act. It was written as part of
1670 the authors' official duties for the United States Government and thus
1671 cannot be copyrighted. This software/database is freely available to
1672 the public for use without a copyright notice. Restrictions cannot
1673 be placed on its present or future use.
1674
1675 Although all reasonable efforts have been taken to ensure the
1676 accuracy and reliability of the software and data, the National
1677 Human Genome Research Institute (NHGRI) and the U.S. Government
1678 does not and cannot warrant the performance or results that may be
1679 obtained by using this software or data. NHGRI and the U.S.
1680 Government disclaims all warranties as to performance,
1681 merchantability or fitness for any particular purpose.
1682
1683 In any work or product derived from this material, proper
1684 attribution of the authors as the source of the software or
1685 data should be made, using http://genome.nhgri.nih.gov/blastall
1686 as the citation.
1687
1688 =head1 ENVIRONMENT VARIABLES
1689
1690 =over 4
1691
1692 =item B<BLASTDB>
1693
1694 location of BLAST formated databases
1695
1696 =item B<BLASTMAT>
1697
1698 location of BLAST matrices
1699
1700 =item B<TMPDIR> B<TEMP> B<TMP>
1701
1702 If the `o' option is not passed to the blastall method than NHGRI::Blastall
1703 looks for one of these vars (in order) to store the BLAST report. This
1704 report is destroyed after the NHGRI::Blastall.pm object is destroyed.
1705
1706 =back
1707
1708 =head1 SEE ALSO
1709
1710 L<perl(1)> L<perlref(1)>
1711
1712 http://www.ncbi.nlm.nih.gov/BLAST/newblast.html
1713
1714 ftp://ncbi.nlm.nih.gov/blast/db/README
1715
1716 http://www.ncbi.nlm.nih.gov/BLAST/tutorial/Altschul-1.html
1717
1718 =cut
0 VERSION 0.66
1 ---------------------------------------------------------------------
2 fixed a section of code in masking logic which would cause a warning
3 if user got no hits to their masking database.
4
5 VERSION 0.65
6 ---------------------------------------------------------------------
7 added the ability to get the coordinates back from a mask.
8 usually mask is called in a scalar context, but if you ask
9 in an array context you will receive a masked sequence and
10 a reference to an array of masked sequences.
11
12 VERSION 0.64 (never released)
13 ---------------------------------------------------------------------
14 Version .63 fix fixed a masking problem but introduced a bug in blasting
15 which would delete all BLAST reports after BLASTing. (version .63 was
16 never released). This version re-fixes the masking problem without
17 breaking anything else.
18
19 VERSION 0.63 (never released)
20 ---------------------------------------------------------------------
21 fixed problem with masking in which masking would fail if you provided -o opt
22 fixed typo in pod documentation.
23
24 VERSION 0.61
25 ---------------------------------------------------------------------
26 fixed pod which caused make to gripe
27 fixed reference in README
28
29 VERSION 0.60
30 ---------------------------------------------------------------------
31 added bl2seq method which BLASTs 2 sequences against each other
32 added blast_one_to_many method which BLASTs 1 sequence against
33 a library of sequences.
34 added formatdb method which creates BLASTable databases
35 added remove_formatdb_indexes method which removes the BLASTable
36 databases made by the formatdb method.
37 cleaned up some of the code
38 fixed a couple methods so they would work with other methods besides blastall
39 updated documentation
40
41 VERSION 0.56
42 ---------------------------------------------------------------------
43 added get_report method which returns the BLAST report file.
44
45 VERSION 0.55
46 ---------------------------------------------------------------------
47 fixes a bug which would turn some 0.0 expect values to 1000
48 if they did not appear in the summaries. This was to correct
49 an old bug in NCBI BLAST but seems to not exist any longer.
50
51 VERSION 0.54
52 ---------------------------------------------------------------------
53 added the blastcl3 method which uses the blastcl3 binary
54 to make network BLASTs. This became necessary when NCBI
55 changed their web interface (QBLAST) and broke my LWP
56 routines.
57
58 VERSION 0.53
59 ---------------------------------------------------------------------
60 fixed bug in wu-blast option processing.
61
62 VERSION 0.52
63 ---------------------------------------------------------------------
64 Added check to make sure temporary directories are writable.
65
66 VERSION 0.51
67 ---------------------------------------------------------------------
68 Added query_ends and subject_ends to the data parsed from the BLAST report
69 instead of relying on query_starts and match_lengths to get the end.
70
71 VERSION 0.50
72 ---------------------------------------------------------------------
73 fixed a number of bugs. Added the DB_ID_REGEX
74 added some accessor methods to get the database description,
75 number of sequences in the database, number of letters in the database,
76 BLAST program and program version.
77
78 VERSION 0.44
79 ---------------------------------------------------------------------
80
81 Blastall now parses and stores "Frames" from blastx, tblastn and tblastx runs.
82 Updated documentation.
83
84 VERSION 0.43
85 ---------------------------------------------------------------------
86
87 fixed some misleading documentation
88
89 VERSION 0.42
90 ----------------------------------------------------------------------
91
92 methods that required a hash_ref (wu-blastall, blastall, net-blastall and mask)
93 will now accept a hash. so you can either say...
94
95 blastall({'p'=>'blastn', 'd'=>'nr', 'i'=>'infile'}); #with ref
96 blastall('p'=>'blastn', 'd'=>'nr', 'i'=>'infile'); #without ref
97
98 Fixed the test suite so it tests for LWP modules before trying to run
99 net-blastall. If it doesn't find LWP modules it looks for $BLASTDB
100 environmental variable. If your BLASTDB environmental variable =~ /wu/
101 than it will run wu-blastall. If the BLASTDB env var does exist but
102 does not have the 'wu' pattern it tries to run NCBI's BLAST via the
103 blastall method. If all else fails it gives you a warning and fails
104 the test.
105
106
107 VERSION 0.41
108 ----------------------------------------------------------------------
109 Filter method used to remove sub entries but now does not.
110 Now if query sequence has one significant hit to a subject sequence
111 all hits to that sequence are shown.
112 There is a bug(feature?).
113 If one of the hits passes on Identity but not E-value
114 and another hit does the opposite all hits are reported.
115 Haven't decided what to do about this.
116
117 NEW IN VERSION 0.40
118 ----------------------------------------------------------------------
119 * support for WU BLAST
120 * support for WWW BLAST through NCBI
121 HOWEVER: NCBI has recently changed there web protocol to QBLAST and this
122 may present future problems.
123 * added sample scripts in scripts directory for WU, WWW and NCBI BLASTs
124 * documentation may be slightly behind. Sorry. Working on this...
125
0 To install this module, cd to the directory that contains this INSTALL
1 file and type the following:
2
3 perl Makefile.PL
4 make
5 make test
6 make install
7
8 ----------------------------------------------------------------------------
9 NHGRI::Blastall.pm expects that you have NCBI's blastall binary,
10 wu-blast binaries, or NCBI's blastcl3 binary installed.
11 If you do not have it you can download the NCBI binary
12 for several platforms from ftp://ftp.ncbi.nlm.nih.gov/blast/executables/
13 If you do not have the blastcl3 binary you can either compile the
14 NCBI toolkit (ftp://ftp.ncbi.nlm.nih.gov/toolbox/) or download the Entrez
15 binaries from (ftp://ncbi.nlm.nih.gov/entrez/CURRENT/).
16 WU-BLAST is available from (http://blast.wustl.edu/).
17
18 ----------------------------------------------------------------------------
19 Certain methods in NHGRI::Blastall expect to find binaries in your path.
20 METHOD EXPECTED BINARIES
21 ---------- ---------------------
22 blastall() blastall
23 blastcl3() blastcl3
24 wu_blastall wu-blastn wu-blastp, wu-blastx, wu-tblastn, wu-tblastx
25
26
27 ----------------------------------------------------------------------------
28 If you are using the blastall or wu_blastall methods
29 you need to make sure the BLASTDB environmental variable is set.
30 If you stored your BLAST formatted databases in /usr/local/ncbi/blast/db
31 the command under bash, sh, or ksh would be...
32
33 BLASTDB=/usr/local/ncbi/blast/db;
34 export BLASTDB;
35
36 under csh or tcsh it would be
37
38 setenv BLASTDB /usr/local/ncbi/blast/db
39
40 Also make sure you have a .ncbirc file in your home directory or in
41 $NCBI/.ncbirc.
42
43 ---------------------------------------------------------------------------
44 If you are having problems using the blastall method you may want to test
45 BLAST from the command line.
46
47 cd NHGRI-Blastall-0.xx
48 blastall -d vector -p blastn -i t/fasta.seq
49
50 if this fails than the problem is probably one of the following
51 a. you don't have the vector database installed
52 b. you don't have the blastall binary in your path
53 c. you don't have NCBI's BLAST2.xx installed on your machine
54 d. the user that you are running the test under does not have
55 the correct environment settings (see above)
56
57 ------------------
58
59 If you have trouble installing NHGRI::Blastall.pm because you have
60 insufficient access privileges to add to the perl library directory,
61 you can still use NHGRI::Blastall.pm. You would want to do something
62 like:
63
64 perl Makefile.PL LIB=~/lib
65
66 See perldoc ExtUtils::MakeMaker for more details on this.
67
68 If you have problems send mail to: webblaster@nhgri.nih.gov
69
70 Read the POD docs in Blastall.pm
71
72 try perldoc NHGRI::Blastall
73
0 Blastall.pm
1 Changes
2 MANIFEST
3 INSTALL
4 Makefile.PL
5 README
6 t/blast.report
7 t/CAB96192.aa
8 t/basic.t
9 scripts/wu/test.nt
10 scripts/wu/basic.plx
11 scripts/net/test.nt
12 scripts/net/basic.plx
13 scripts/ncbi/test.nt
14 scripts/ncbi/basic.plx
15 scripts/id_regex_test.pl
0 use ExtUtils::MakeMaker;
1 # See lib/ExtUtils/MakeMaker.pm for details of how to influence
2 # the contents of the Makefile that is written.
3 WriteMakefile(
4 'NAME' => 'NHGRI::Blastall',
5 'VERSION_FROM' => 'Blastall.pm', # finds $VERSION
6 'dist' => {
7 COMPRESS=> 'gzip -9f', SUFFIX=>'gz',
8 }
9 );
0 Send bug reports to webblaster@nhgri.nih.gov
1
2 See Changes file for revision history.
3
4 ---------------------------------------------------------------------------
5 Warning: The parsing methods have been tested against GenBANK datasets.
6 If you are using private databases that have non-standard
7 deflines, you will want to adjust the defline id regex.
8 See DB_ID_REGEX section of the pod documentation
9 There is also a testing script to help develop a regular expression
10 that fits your databases. Look in the scripts subdirectory of
11 the distribution.
12 ---------------------------------------------------------------------------
13
14 If you have NCBI's BLAST2 or WU-BLAST installed locally and your
15 environment is already setup you can use Perl's object-oriented
16 capabilities to run your BLASTs. Also if you have a blastcl3 binary
17 from the toolkit (or binaries from our FTP site) you can run BLAST over
18 the network. There are also methods to blast single sequences against
19 each other using the bl2seq binaries (also in the toolkit and binaries).
20 You can blast one sequence against a library of sequences using the
21 blast_one_to_many method. You can format databases with formatdb method.
22 You can also have NHGRI::Blastall read existing BLAST
23 reports. If you have a database of repetitive DNA or other DNA you would
24 like to mask out, you can use the mask method to mask the data against
25 these databases. You can then use either the filter or result methods
26 to parse the report and access the various elements of the data.
27
28 If you need help installing NCBI's local blast check...
29
30 http://genome.nhgri.nih.gov/blastall/blast_install/
31
32 ---------------------------------------------------------------------------
33
34 IT IS HIGHLY RECOMMENDED THAT YOU READ THROUGH THE POD DOCUMENTATION
35 embedded in the module to learn how to use NHGRI::Blastall.
36 It won't take you that long. After Installation...
37
38 perldoc NHGRI::Blastall
39
40 the on-line manual page is available at
41
42 http://genome.nhgri.nih.gov/blastall/Blastall.html
43
44 ---------------------------------------------------------------------------
45
46 If you have problems, questions, comments send to webblaster@nhgri.nih.gov
47
48 ---------------------------------------------------------------------------
49
50 PUBLIC DOMAIN NOTICE
51
52 This software/database is ``United States Government Work'' under the
53 terms of the United States Copyright Act. It was written as part of
54 the authors' official duties for the United States Government and thus
55 cannot be copyrighted. This software/database is freely available to
56 the public for use without a copyright notice. Restrictions cannot
57 be placed on its present or future use.
58
59 Although all reasonable efforts have been taken to ensure the accuracy
60 and reliability of the software and data, the National Human Genome
61 Research Institute (NHGRI) and the U.S. Government does not and cannot
62 warrant the performance or results that may be obtained by using this
63 software or data. NHGRI and the U.S. Government disclaims all
64 warranties as to performance, merchantability or fitness for any
65 particular purpose.
66
67 In any work or product derived from this material, proper attribution
68 of the authors as the source of the software or data should be made,
69 using http://genome.nhgri.nih.gov/webblast as the citation.
70
71 ---------------------------------------------------------------------------
72
73 Special thanks to Peter Chines for reporting a bunch of bugs and suggesting
74 some excellent features.
75
76 Thanks to David Lapointe from UMass Medical School for his suggestion
77 to use blastcl3 for network blasting.
78
79
0 # this is a little script to help come up with a -DB_ID_REGEX
1 # if you are using non-Genbank formatted databases than odds are
2 # this regex that matches GENBANK deflines,
3 #
4 # [^\|]+(?:\|[^\|,\s]*){1,10}
5 #
6 # that matches GENBANK formatted deflines
7 #
8 # sp|Q54152|XASA_SHIFL AMINO ACID ANTIPORTER...
9 #
10 # is not going to work for your database. You may want to use something
11 # a bit more generic like,
12 #
13 # [^ ]+
14 #
15 # which will match everything before the first space
16 # which would match H_DJ0592P03.Contig29 in this WashU defline
17 #
18 # H_DJ0592P03.Contig29 : preliminary Length:47736
19 #
20 # This script will let you test your regex.
21 # THE KEY to picking a good regex is to make sure your regex
22 # catches something which is UNIQUE to each sequence in the database.
23 # set $REGEX to the value of the regex you would like to test and
24 # set $BLAST_REPORT to a BLAST report against your formatted databases
25 # After you find a regex you will need to add -DB_ID_REGEX => YOUR_REGEX
26 # to your new declaration.
27 #
28 # $b = new NHGRI::Blastall(-DB_ID_REGEX => '[^ ]+');
29 #
30
31 $REGEX = '[^ ]+';
32 $BLAST_REPORT = '../t/blast.out';
33
34 use NHGRI::Blastall;
35
36 $b = new NHGRI::Blastall( -DB_ID_REGEX => $REGEX );
37 $b->read_report( $BLAST_REPORT );
38 my @results = $b->result();
39
40 foreach $rh_r (@results) {
41 print "id: $rh_r->{'id'}\n";
42 }
43
44
45
0 #!/usr/local/bin/perl -w
1
2 use lib qw(/sysdev/users/jfryan/blastall.pm/NHGRI/Blastall);
3 use strict;
4 use NHGRI::Blastall;
5
6 my $b = new NHGRI::Blastall;
7
8 print "running ncbi blast...\n";
9
10 $b->blastall( p => 'blastn',
11 d => 'est',
12 e => 0.001,
13 i => 'test.nt',
14 o => 'blastn.est.output' } );
15
16 $b->print_report();
17
0 >gi|3278015|gb|AI038821.1|AI038821 ox96d03.x1 Soares_senescent_fibroblasts_NbHSF Homo sapiens cDNA clone IMAGE:1664165 3' similar to gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);contains element MER22 repetitive element ;, mRNA sequence [Homo sapiens]
1 TTTTTGACCATCCAATAATTGGGTGGGATCCCATCTGTGCCCGACAAGGGCCCACAGAGGCCTGGGAGGG
2 GAGCTAAGGGCTGGGGTTCCGGTGGCATTTGGGATGTTCAAGACAGTCTGTGCACAGCCTCCCTGGGAGG
3 GTCTGCAGTCACCTCGGCCCACGGTCCCGGGGTGACTGGGCTCCAGCAGCCCTTCCTTCCTTCCTTGCTT
4 CCGTCCTTCCTTCCTCCTCCTTCCGTCTGCACCTCCTTCCTGCATCCGGCACCTCCATGTCCTGAGCTTG
5 TGCTGCGTCAGGAGAGCACACACTTGCAGCTCATGCAGCCGGGGCCACTCTCATCAGGAGGGTTCAGCTT
6 CCGCAGCTTGTGCTGCCGGATCTCACGCACCAACGTGTAGAAGGCATCCTCCACTCTCTGCCGGGTCTTG
7 GCCGATGTCTCGATGTANGGGATGCCGTAGCTTCGGGCGAGGTCCTGAGCCTGCCGAGATTCCACAGTGC
8 GTGCAGCCAGGTCACACTTTGTCCCCACCAGCACCATGGGCACGTCATCCGAGTCCTTCACCCCGTTGAT
9 CTGCTCCCTGGTACTGGTGATGTCCTCAAAAGACTGGTGTTGTGATGGAAACACCACAGGAAGCCCTCCC
10 TGTCCCATGACTGGTCC
0 #!/usr/local/bin/perl -w
1
2 use strict;
3 use NHGRI::Blastall;
4
5 my $b = new NHGRI::Blastall;
6
7 print "running blastn over the network...\n";
8
9 $b->blastcl3( {'p' => 'blastn',
10 'd' => 'est',
11 'i' => 'test.nt',
12 'o' => 'blastn.est.output' } );
13
14 $b->print_report();
15
16
0 >gi|3278015|gb|AI038821.1|AI038821 ox96d03.x1 Soares_senescent_fibroblasts_NbHSF Homo sapiens cDNA clone IMAGE:1664165 3' similar to gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);contains element MER22 repetitive element ;, mRNA sequence [Homo sapiens]
1 TTTTTGACCATCCAATAATTGGGTGGGATCCCATCTGTGCCCGACAAGGGCCCACAGAGGCCTGGGAGGG
2 GAGCTAAGGGCTGGGGTTCCGGTGGCATTTGGGATGTTCAAGACAGTCTGTGCACAGCCTCCCTGGGAGG
3 GTCTGCAGTCACCTCGGCCCACGGTCCCGGGGTGACTGGGCTCCAGCAGCCCTTCCTTCCTTCCTTGCTT
4 CCGTCCTTCCTTCCTCCTCCTTCCGTCTGCACCTCCTTCCTGCATCCGGCACCTCCATGTCCTGAGCTTG
5 TGCTGCGTCAGGAGAGCACACACTTGCAGCTCATGCAGCCGGGGCCACTCTCATCAGGAGGGTTCAGCTT
6 CCGCAGCTTGTGCTGCCGGATCTCACGCACCAACGTGTAGAAGGCATCCTCCACTCTCTGCCGGGTCTTG
7 GCCGATGTCTCGATGTANGGGATGCCGTAGCTTCGGGCGAGGTCCTGAGCCTGCCGAGATTCCACAGTGC
8 GTGCAGCCAGGTCACACTTTGTCCCCACCAGCACCATGGGCACGTCATCCGAGTCCTTCACCCCGTTGAT
9 CTGCTCCCTGGTACTGGTGATGTCCTCAAAAGACTGGTGTTGTGATGGAAACACCACAGGAAGCCCTCCC
10 TGTCCCATGACTGGTCC
0 #!/usr/local/bin/perl -w
1
2 use lib qw(/sysdev/users/jfryan/blastall.pm/NHGRI/Blastall);
3 use strict;
4 use NHGRI::Blastall;
5
6 my $b = new NHGRI::Blastall;
7
8 print "running wu-blastn...\n";
9
10 $b->wu_blastall( {'p' => 'blastn',
11 'd' => 'est',
12 'i' => 'test.nt',
13 'o' => 'blastn.est.output' } );
14
15 $b->print_report();
16
0 >gi|3278015|gb|AI038821.1|AI038821 ox96d03.x1 Soares_senescent_fibroblasts_NbHSF Homo sapiens cDNA clone IMAGE:1664165 3' similar to gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);contains element MER22 repetitive element ;, mRNA sequence [Homo sapiens]
1 TTTTTGACCATCCAATAATTGGGTGGGATCCCATCTGTGCCCGACAAGGGCCCACAGAGGCCTGGGAGGG
2 GAGCTAAGGGCTGGGGTTCCGGTGGCATTTGGGATGTTCAAGACAGTCTGTGCACAGCCTCCCTGGGAGG
3 GTCTGCAGTCACCTCGGCCCACGGTCCCGGGGTGACTGGGCTCCAGCAGCCCTTCCTTCCTTCCTTGCTT
4 CCGTCCTTCCTTCCTCCTCCTTCCGTCTGCACCTCCTTCCTGCATCCGGCACCTCCATGTCCTGAGCTTG
5 TGCTGCGTCAGGAGAGCACACACTTGCAGCTCATGCAGCCGGGGCCACTCTCATCAGGAGGGTTCAGCTT
6 CCGCAGCTTGTGCTGCCGGATCTCACGCACCAACGTGTAGAAGGCATCCTCCACTCTCTGCCGGGTCTTG
7 GCCGATGTCTCGATGTANGGGATGCCGTAGCTTCGGGCGAGGTCCTGAGCCTGCCGAGATTCCACAGTGC
8 GTGCAGCCAGGTCACACTTTGTCCCCACCAGCACCATGGGCACGTCATCCGAGTCCTTCACCCCGTTGAT
9 CTGCTCCCTGGTACTGGTGATGTCCTCAAAAGACTGGTGTTGTGATGGAAACACCACAGGAAGCCCTCCC
10 TGTCCCATGACTGGTCC
0 >gi|8919844|emb|CAB96192.1| PSI 9 KDa protein [Aloe vera]
1 YLWHETTRSMGLSY
0 BEGIN { $| = 1; print "1..4\n"; }
1 END {print "not ok 1\n" unless $loaded;}
2 use NHGRI::Blastall;
3 $loaded = 1;
4 print "ok 1\n";
5
6 my $t = 't';
7
8 my $b = new NHGRI::Blastall;
9
10 if ($b->read_report("$t/blast.report")) {
11 print "ok 2\n";
12 } else {
13 print "not ok 2\n";
14 }
15
16 if ($b->result('id')) {
17 print "ok 3\n";
18 } else {
19 print "not ok 3\n";
20 }
21
22 if ($b->filter( {'identities' => .70} )) {
23 print "ok 4\n";
24 } else {
25 print "not ok 4\n";
26 }
27
28
29
0 BLASTN 2.0.10 [Aug-26-1999]
1
2
3 Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer,
4 Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997),
5 "Gapped BLAST and PSI-BLAST: a new generation of protein database search
6 programs", Nucleic Acids Res. 25:3389-3402.
7
8 Query= HUMRASH|gi|190890|gb|J00277 Human c-Ha-ras1 proto-oncogene.
9 (6453 letters)
10
11 Database: Non-redundant Database of all other organisms
12 GenBank+EMBL+DDBJ EST sequences: updated on Sat Dec 4 01:15:02 1999
13 800,021 sequences; 338,270,040 total letters
14
15 Searching..................................................done
16
17 Score E
18 Sequences producing significant alignments: (bits) Value
19
20 gb|AI657850.1|AI657850 fc23g10.y1 Zebrafish WashU MPIMG EST Dani... 139 2e-30
21 gb|AA497334|AA497334 fa04e08.r1 Zebrafish ICRFzfls Danio rerio c... 123 1e-25
22 gb|AI877912.1|AI877912 fc55a12.y1 Zebrafish WashU MPIMG EST Dani... 100 2e-18
23 gb|AI959079.1|AI959079 fd24c02.y1 Zebrafish WashU MPIMG EST Dani... 90 2e-15
24 gb|AA141102|AA141102 CK01231.5prime CK Drosophila melanogaster e... 90 2e-15
25 gb|AW148172.1|AW148172 da13b03.x1 normalized Xenopus laevis gast... 70 2e-09
26 dbj|AU003591|AU003591 AU003591 Bombyx mori p50(Daizo) Bombyx mor... 64 1e-07
27 dbj|AU004341|AU004341 AU004341 Bombyx mori p50(Daizo) Bombyx mor... 64 1e-07
28 gb|AI575394.1|AI575394 UI-R-G0-uh-e-12-0-UI.s1 UI-R-G0 Rattus no... 64 1e-07
29 gb|AI259307|AI259307 LP02693.3prime LP Drosophila melanogaster l... 64 1e-07
30 dbj|AU004702|AU004702 AU004702 Bombyx mori p50(Daizo) Bombyx mor... 64 1e-07
31 gb|AI295927|AI295927 LP09693.5prime LP Drosophila melanogaster l... 62 5e-07
32 gb|AI405934|AI405934 GH26087.5prime GH Drosophila melanogaster h... 62 5e-07
33 gb|AI238083|AI238083 GH14071.5prime GH Drosophila melanogaster h... 62 5e-07
34 gb|AI626207.1|AI626207 fc12c05.y1 Zebrafish WashU MPIMG EST Dani... 58 7e-06
35 gb|AA900048|AA900048 UI-R-E0-da-e-01-0-UI.s1 UI-R-E0 Rattus norv... 58 7e-06
36
37 >gb|AI657850.1|AI657850 fc23g10.y1 Zebrafish WashU MPIMG EST Danio rerio cDNA 5' similar to
38 gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);,
39 mRNA sequence
40 Length = 573
41
42 Score = 139 bits (70), Expect = 2e-30
43 Identities = 145/170 (85%)
44 Strand = Plus / Plus
45
46
47 Query: 2052 ggaagcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgccggcc 2111
48 ||||||||||||| ||||| || || ||||| || ||||||||||||| || || || |
49 Sbjct: 194 ggaagcaggtggtgattgacggagaaacgtgtctactggacatcctggacactgcaggtc 253
50
51
52 Query: 2112 aggaggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtg 2171
53 ||||||| ||||| |||||| |||||||||||||| | || || ||||||||||| ||||
54 Sbjct: 254 aggaggaatacagtgccatgagggaccagtacatgaggacaggagagggcttcctctgtg 313
55
56
57 Query: 2172 tgtttgccatcaacaacaccaagtcttttgaggacatccaccagtacagg 2221
58 | ||||||||||| ||||| ||||| ||||||||||| ||||| ||||||
59 Sbjct: 314 tctttgccatcaataacacaaagtcctttgaggacattcaccactacagg 363
60
61
62 Score = 79.8 bits (40), Expect = 2e-12
63 Identities = 94/112 (83%)
64 Strand = Plus / Plus
65
66
67 Query: 1664 atgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgacc 1723
68 ||||| ||||||||||| ||||| ||||| || || || || ||||| || || || |||
69 Sbjct: 73 atgaccgaatataagcttgtggtcgtgggagctggaggcgtaggcaaaagcgctctcacc 132
70
71
72 Query: 1724 atccagctgatccagaaccattttgtggacgaatacgaccccactatagagg 1775
73 ||||| || ||||||||||| |||||||| ||||| ||||| ||||||||||
74 Sbjct: 133 atccaactcatccagaaccactttgtggatgaatatgacccaactatagagg 184
75
76
77 Score = 65.9 bits (33), Expect = 3e-08
78 Identities = 60/69 (86%)
79 Strand = Plus / Plus
80
81
82 Query: 2371 cagggagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaa 2430
83 |||||||||||| || || || |||||||| || ||||| |||||||| |||||||||||
84 Sbjct: 360 cagggagcagataaagcgagtaaaggactccgaggacgtccccatggttctggtggggaa 419
85
86
87 Query: 2431 caagtgtga 2439
88 ||||||||
89 Sbjct: 420 taagtgtga 428
90
91
92 >gb|AA497334|AA497334 fa04e08.r1 Zebrafish ICRFzfls Danio rerio cDNA clone 1O24 5' similar
93 to gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1
94 (HUMAN);
95 Length = 671
96
97 Score = 123 bits (62), Expect = 1e-25
98 Identities = 143/170 (84%)
99 Strand = Plus / Plus
100
101
102 Query: 2052 ggaagcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgccggcc 2111
103 ||||||||||||| ||||| || || ||||| || ||||||||||||| || || || |
104 Sbjct: 452 ggaagcaggtggtgattgacggagaaacgtgtctactggacatcctggacactgcaggtc 511
105
106
107 Query: 2112 aggaggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtg 2171
108 ||||||| ||||| |||||| |||||||||||||| | || || || |||||||| ||||
109 Sbjct: 512 aggaggaatacagtgccatgagggaccagtacatgaggacaggagaaggcttcctctgtg 571
110
111
112 Query: 2172 tgtttgccatcaacaacaccaagtcttttgaggacatccaccagtacagg 2221
113 | ||||||||||| ||||| |||| ||||||||||| ||||| ||||||
114 Sbjct: 572 tctttgccatcaataacacaaagtgctttgaggacattcaccactacagg 621
115
116
117 Score = 71.9 bits (36), Expect = 5e-10
118 Identities = 93/112 (83%)
119 Strand = Plus / Plus
120
121
122 Query: 1664 atgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgacc 1723
123 ||||| ||||||||||| ||||| ||||| || || || || ||||| || || || |||
124 Sbjct: 331 atgaccgaatataagcttgtggtcgtgggagctggaggcgtaggcaaaagcgctctcacc 390
125
126
127 Query: 1724 atccagctgatccagaaccattttgtggacgaatacgaccccactatagagg 1775
128 ||||| || ||||||||||| |||||| | ||||| ||||| ||||||||||
129 Sbjct: 391 atccaactcatccagaaccactttgtgaatgaatatgacccaactatagagg 442
130
131
132 >gb|AI877912.1|AI877912 fc55a12.y1 Zebrafish WashU MPIMG EST Danio rerio cDNA 5' similar to
133 gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);,
134 mRNA sequence
135 Length = 647
136
137 Score = 99.6 bits (50), Expect = 2e-18
138 Identities = 116/138 (84%)
139 Strand = Plus / Plus
140
141
142 Query: 2052 ggaagcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgccggcc 2111
143 ||||||||||||| ||||| || |||||||| ||| ||||||||||||| || || ||||
144 Sbjct: 336 ggaagcaggtggtgattgacggcgagacgtgtctgctggacatcctggacactgcaggcc 395
145
146
147 Query: 2112 aggaggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtg 2171
148 |||| ||||||||||| ||| | |||||||||||| | || || ||||| ||||| || |
149 Sbjct: 396 aggaagagtacagcgcaatgagagaccagtacatgaggacaggagagggtttcctctgcg 455
150
151
152 Query: 2172 tgtttgccatcaacaaca 2189
153 | || || ||||||||||
154 Sbjct: 456 tcttcgctatcaacaaca 473
155
156
157 Score = 73.8 bits (37), Expect = 1e-10
158 Identities = 94/113 (83%)
159 Strand = Plus / Plus
160
161
162 Query: 1663 gatgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgac 1722
163 |||||| || ||||||||||| || ||||| || || ||||| || ||||| ||| | ||
164 Sbjct: 214 gatgactgagtataagctggttgttgtgggagcaggaggtgttgggaagagcgcgttaac 273
165
166
167 Query: 1723 catccagctgatccagaaccattttgtggacgaatacgaccccactatagagg 1775
168 |||||||| |||||||| || |||||||| ||||| ||||||||||| ||||
169 Sbjct: 274 aatccagctcatccagaatcactttgtggatgaatatgaccccactattgagg 326
170
171
172 Score = 67.9 bits (34), Expect = 8e-09
173 Identities = 52/58 (89%)
174 Strand = Plus / Plus
175
176
177 Query: 2375 gagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaaca 2432
178 ||||||||||| || ||||||||||||||||| || |||||||| || ||||||||||
179 Sbjct: 506 gagcagatcaagcgtgtgaaggactcggatgatgttcccatggtcctagtggggaaca 563
180
181
182 >gb|AI959079.1|AI959079 fd24c02.y1 Zebrafish WashU MPIMG EST Danio rerio cDNA 5' similar to
183 gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);,
184 mRNA sequence
185 Length = 647
186
187 Score = 89.7 bits (45), Expect = 2e-15
188 Identities = 129/157 (82%)
189 Strand = Plus / Plus
190
191
192 Query: 2040 aggattcctaccggaagcaggtggtcattgatggggagacgtgcctgttggacatcctgg 2099
193 ||||||| ||| | |||||||| || |||||||| ||||| || || |||| |||||||
194 Sbjct: 397 aggattcatacagaaagcaggttgtgattgatggagagacttgtttgctggatatcctgg 456
195
196
197 Query: 2100 ataccgccggccaggaggagtacagcgccatgcgggaccagtacatgcgcaccggggagg 2159
198 | || || || |||||||||||||| |||||| | |||||||| ||| | || || ||||
199 Sbjct: 457 acactgcaggtcaggaggagtacagtgccatgagagaccagtatatgaggacaggagagg 516
200
201
202 Query: 2160 gcttcctgtgtgtgtttgccatcaacaacaccaagtc 2196
203 | || || |||||||||||||||||||| |||||||
204 Sbjct: 517 gatttctttgtgtgtttgccatcaacaatgccaagtc 553
205
206
207 >gb|AA141102|AA141102 CK01231.5prime CK Drosophila melanogaster embryo BlueScript
208 Drosophila melanogaster cDNA clone CK01231 5prime similar
209 to 0:
210 Length = 551
211
212 Score = 89.7 bits (45), Expect = 2e-15
213 Identities = 89/104 (85%)
214 Strand = Plus / Plus
215
216
217 Query: 1664 atgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgacc 1723
218 ||||||||||| || ||||| || || || ||||| || ||||||||| ||||| |||
219 Sbjct: 181 atgacggaatacaaactggtcgtcgttggagccggaggcgtgggcaagtccgcgctcacc 240
220
221
222 Query: 1724 atccagctgatccagaaccattttgtggacgaatacgaccccac 1767
223 |||||||| |||||||||||||| |||||||| |||||||||||
224 Sbjct: 241 atccagctaatccagaaccatttcgtggacgantacgaccccac 284
225
226
227 Score = 63.9 bits (32), Expect = 1e-07
228 Identities = 83/101 (82%)
229 Strand = Plus / Plus
230
231
232 Query: 2048 taccggaagcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgcc 2107
233 ||||| ||||| ||||| || ||||| || || |||||| ||||||||||||| ||||||
234 Sbjct: 298 taccgaaagcaagtggttatcgatggaganacctgcctgctggacatcctggacaccgcc 357
235
236
237 Query: 2108 ggccaggaggagtacagcgccatgcgggaccagtacatgcg 2148
238 ||||| || || | | ||||||||||| ||||| |||||
239 Sbjct: 358 ggccaagatgantnctcggccatgcgggatcagtatatgcg 398
240
241
242 >gb|AW148172.1|AW148172 da13b03.x1 normalized Xenopus laevis gastrula Xenopus laevis cDNA
243 clone XENOPUS_SOURCE_ID:xlnga001a06 3' similar to
244 gb:V00574_cds1 TRANSFORMING PROTEIN P21/H-RAS-1 (HUMAN);
245 gb:X13664 Mouse mRNA for N-ras protein (MOUSE...
246 Length = 652
247
248 Score = 69.9 bits (35), Expect = 2e-09
249 Identities = 134/167 (80%)
250 Strand = Plus / Minus
251
252
253 Query: 2055 agcaggtggtcattgatggggagacgtgcctgttggacatcctggataccgccggccagg 2114
254 |||||||||| || || ||||| || ||||| | ||||| ||||||| || || || |
255 Sbjct: 526 agcaggtggtgatagacggggaaacttgcctcctagacatattggatactgcggggcaag 467
256
257
258 Query: 2115 aggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtgtgt 2174
259 |||| ||||| |||||| |||| |||||||||||||| || || || || || ||||| |
260 Sbjct: 466 aggaatacagtgccatgagggatcagtacatgcgcacgggagaagggtttctctgtgtct 407
261
262
263 Query: 2175 ttgccatcaacaacaccaagtcttttgaggacatccaccagtacagg 2221
264 |||| || || ||||| |||||||| |||||| |||| || ||||||
265 Sbjct: 406 ttgctattaataacacaaagtctttcgaggacgtccatcattacagg 360
266
267
268 >dbj|AU003591|AU003591 AU003591 Bombyx mori p50(Daizo) Bombyx mori cDNA clone ws00308, mRNA
269 sequence [Bombyx mori]
270 Length = 724
271
272 Score = 63.9 bits (32), Expect = 1e-07
273 Identities = 59/68 (86%)
274 Strand = Plus / Plus
275
276
277 Query: 2375 gagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaacaag 2434
278 ||||||||||| || ||||||||| |||| || ||||||||||| || ||||| ||||||
279 Sbjct: 150 gagcagatcaagcgagtgaaggacgcggaagaggtgcccatggtacttgtgggcaacaag 209
280
281
282 Query: 2435 tgtgacct 2442
283 || |||||
284 Sbjct: 210 tgcgacct 217
285
286
287 >dbj|AU004341|AU004341 AU004341 Bombyx mori p50(Daizo) Bombyx mori cDNA clone ws20237, mRNA
288 sequence [Bombyx mori]
289 Length = 739
290
291 Score = 63.9 bits (32), Expect = 1e-07
292 Identities = 59/68 (86%)
293 Strand = Plus / Plus
294
295
296 Query: 2375 gagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaacaag 2434
297 ||||||||||| || ||||||||| |||| || ||||||||||| || ||||| ||||||
298 Sbjct: 499 gagcagatcaagcgagtgaaggacgcggaagaggtgcccatggtacttgtgggcaacaag 558
299
300
301 Query: 2435 tgtgacct 2442
302 || |||||
303 Sbjct: 559 tgcgacct 566
304
305
306 >gb|AI575394.1|AI575394 UI-R-G0-uh-e-12-0-UI.s1 UI-R-G0 Rattus norvegicus cDNA clone
307 UI-R-G0-uh-e-12-0-UI 3', mRNA sequence
308 Length = 536
309
310 Score = 63.9 bits (32), Expect = 1e-07
311 Identities = 59/68 (86%)
312 Strand = Plus / Plus
313
314
315 Query: 2114 gaggagtacagcgccatgcgggaccagtacatgcgcaccggggagggcttcctgtgtgtg 2173
316 ||||||||||| || ||| |||||||||||||| | || ||||||||||| || |||||
317 Sbjct: 387 gaggagtacagtgcaatgagggaccagtacatgagaactggggagggctttctttgtgta 446
318
319
320 Query: 2174 tttgccat 2181
321 ||||||||
322 Sbjct: 447 tttgccat 454
323
324
325 >gb|AI259307|AI259307 LP02693.3prime LP Drosophila melanogaster larval-early pupal pOT2
326 Drosophila melanogaster cDNA clone LP02693 3prime similar
327 to M80535: R FBgn0004636 PID:g158198 SWISS-PROT:P08645,
328 mRNA sequence [Drosophila melanogaster]
329 Length = 542
330
331 Score = 63.9 bits (32), Expect = 1e-07
332 Identities = 50/56 (89%)
333 Strand = Plus / Minus
334
335
336 Query: 2387 cgggtgaaggactcggatgacgtgcccatggtgctggtggggaacaagtgtgacct 2442
337 |||||||||||| |||| || |||||||||||||| ||||| |||||||| |||||
338 Sbjct: 275 cgggtgaaggacacggacgatgtgcccatggtgctcgtgggcaacaagtgcgacct 220
339
340
341 >dbj|AU004702|AU004702 AU004702 Bombyx mori p50(Daizo) Bombyx mori cDNA clone ws20754, mRNA
342 sequence [Bombyx mori]
343 Length = 747
344
345 Score = 63.9 bits (32), Expect = 1e-07
346 Identities = 59/68 (86%)
347 Strand = Plus / Plus
348
349
350 Query: 2375 gagcagatcaaacgggtgaaggactcggatgacgtgcccatggtgctggtggggaacaag 2434
351 ||||||||||| || ||||||||| |||| || ||||||||||| || ||||| ||||||
352 Sbjct: 485 gagcagatcaagcgagtgaaggacgcggaagaggtgcccatggtacttgtgggcaacaag 544
353
354
355 Query: 2435 tgtgacct 2442
356 || |||||
357 Sbjct: 545 tgcgacct 552
358
359
360 >gb|AI295927|AI295927 LP09693.5prime LP Drosophila melanogaster larval-early pupal pOT2
361 Drosophila melanogaster cDNA clone LP09693 5prime similar
362 to Y07564: Ric FBgn0017549 PID:e276381 SPTREMBL:P91637,
363 mRNA sequence [Drosophila melanogaster]
364 Length = 522
365
366 Score = 61.9 bits (31), Expect = 5e-07
367 Identities = 52/59 (88%)
368 Strand = Plus / Plus
369
370
371 Query: 2090 gacatcctggataccgccggccaggaggagtacagcgccatgcgggaccagtacatgcg 2148
372 |||||||| || ||||||||||||| ||||| || |||||||||||||| ||||||||
373 Sbjct: 413 gacatccttgacaccgccggccaggtggagttcacggccatgcgggaccaatacatgcg 471
374
375
376 >gb|AI405934|AI405934 GH26087.5prime GH Drosophila melanogaster head pOT2 Drosophila
377 melanogaster cDNA clone GH26087 5prime similar to Y07564:
378 Ric FBgn0017549 PID:e276381 SPTREMBL:P91637, mRNA
379 sequence [Drosophila melanogaster]
380 Length = 602
381
382 Score = 61.9 bits (31), Expect = 5e-07
383 Identities = 52/59 (88%)
384 Strand = Plus / Plus
385
386
387 Query: 2090 gacatcctggataccgccggccaggaggagtacagcgccatgcgggaccagtacatgcg 2148
388 |||||||| || ||||||||||||| ||||| || |||||||||||||| ||||||||
389 Sbjct: 77 gacatccttgacaccgccggccaggtggagttcacggccatgcgggaccaatacatgcg 135
390
391
392 >gb|AI238083|AI238083 GH14071.5prime GH Drosophila melanogaster head pOT2 Drosophila
393 melanogaster cDNA clone GH14071 5prime similar to Y07564:
394 Ric FBgn0017549 PID:e276381 SPTREMBL:P91637, mRNA
395 sequence [Drosophila melanogaster]
396 Length = 535
397
398 Score = 61.9 bits (31), Expect = 5e-07
399 Identities = 52/59 (88%)
400 Strand = Plus / Plus
401
402
403 Query: 2090 gacatcctggataccgccggccaggaggagtacagcgccatgcgggaccagtacatgcg 2148
404 |||||||| || ||||||||||||| ||||| || |||||||||||||| ||||||||
405 Sbjct: 425 gacatccttgacaccgccggccaggtggagttcacggccatgcgggaccaatacatgcg 483
406
407
408 >gb|AI626207.1|AI626207 fc12c05.y1 Zebrafish WashU MPIMG EST Danio rerio cDNA 5' similar to
409 TR:P97913 P97913 N-RAS {EXONS 1 AND 2 ;, mRNA sequence
410 Length = 529
411
412 Score = 58.0 bits (29), Expect = 7e-06
413 Identities = 92/113 (81%)
414 Strand = Plus / Plus
415
416
417 Query: 1663 gatgacggaatataagctggtggtggtgggcgccggcggtgtgggcaagagtgcgctgac 1722
418 |||||| || ||||||||||| || ||||| || || |||| | ||||| ||| | ||
419 Sbjct: 298 gatgactgagtataagctggttgttgtgggagcaggaggtggttggaagagcgcgttaac 357
420
421
422 Query: 1723 catccagctgatccagaaccattttgtggacgaatacgaccccactatagagg 1775
423 |||||||| |||||||| || |||||||| ||||| ||||||||||| ||||
424 Sbjct: 358 aatccagctcatccagaatcactttgtggatgaatatgaccccactattgagg 410
425
426
427 >gb|AA900048|AA900048 UI-R-E0-da-e-01-0-UI.s1 UI-R-E0 Rattus norvegicus cDNA clone
428 UI-R-E0-da-e-01-0-UI 3' similar to >
429 gi|191344|gb|M84166|HAMCHARAS Hamster c-Ha-ras protein
430 gene, complete cds, mRNA sequence [Rattus norvegicus]
431 Length = 315
432
433 Score = 58.0 bits (29), Expect = 7e-06
434 Identities = 32/33 (96%)
435 Strand = Plus / Minus
436
437
438 Query: 3319 gctgcatgagctgcaagtgtgtgctctcctgac 3351
439 ||||||||||||||||||||||||| |||||||
440 Sbjct: 315 gctgcatgagctgcaagtgtgtgctgtcctgac 283
441
442
443 Database: Non-redundant Database of all other organisms
444 GenBank+EMBL+DDBJ EST sequences: updated on Sat Dec 4 01:15:02 1999
445 Posted date: Dec 4, 1999 2:31 AM
446 Number of letters in database: 338,270,040
447 Number of sequences in database: 800,021
448
449 Lambda K H
450 1.37 0.711 1.31
451
452 Gapped
453 Lambda K H
454 1.37 0.711 1.31
455
456
457 Matrix: blastn matrix:1 -3
458 Gap Penalties: Existence: 5, Extension: 2
459 Number of Hits to DB: 890609
460 Number of Sequences: 800021
461 Number of extensions: 890609
462 Number of successful extensions: 160
463 Number of sequences better than 1.0e-04: 19
464 length of query: 6453
465 length of database: 338,270,040
466 effective HSP length: 21
467 effective length of query: 6432
468 effective length of database: 321,469,599
469 effective search space: 2067692460768
470 effective search space used: 2067692460768
471 T: 0
472 A: 0
473 X1: 6 (11.9 bits)
474 X2: 25 (49.6 bits)
475 S1: 12 (24.3 bits)
476 S2: 28 (56.0 bits)