|
0 |
package NHGRI::Blastall;
|
|
1 |
require 5.004;
|
|
2 |
|
|
3 |
###########################################################################
|
|
4 |
# See the bottom of this file for the POD documentation. Search for the
|
|
5 |
# string '=head'.
|
|
6 |
#
|
|
7 |
# You can run this file through either pod2man or pod2html to produce pretty
|
|
8 |
# documentation in manual or html file format (these utilities are part of the
|
|
9 |
# Perl 5 distribution).
|
|
10 |
#
|
|
11 |
# The most recent version and complete docs are available at:
|
|
12 |
# ftp://ftp.nhgri.nih.gov/pub/software/
|
|
13 |
###########################################################################
|
|
14 |
# This software is "United States Government Work" under the
|
|
15 |
# terms of the United States Copyright Act. It was written as part of the
|
|
16 |
# authors' official duties for the United States Government and thus
|
|
17 |
# cannot be copyrighted. This software/database is freely available to
|
|
18 |
# the public for use without a copyright notice. Restrictions cannot be
|
|
19 |
# placed on its present or future use.
|
|
20 |
#
|
|
21 |
# Although all reasonable efforts have been taken to ensure the accuracy
|
|
22 |
# and reliability of the software and data, the National Human Genome
|
|
23 |
# Research Institute (NHGRI) and the U.S. Government does not and cannot
|
|
24 |
# warrant the performance or results that may be obtained by using this
|
|
25 |
# software or data. NHGRI and the U.S. Government disclaims all
|
|
26 |
# warranties as to performance, merchantability or fitness for any
|
|
27 |
# particular purpose.
|
|
28 |
#
|
|
29 |
# In any work or product derived from this material, proper attribution
|
|
30 |
# of the authors as the source of the software or data should be made,
|
|
31 |
# using http://genome.nhgri.nih.gov/blastall as the citation.
|
|
32 |
###########################################################################
|
|
33 |
|
|
34 |
use strict;
|
|
35 |
use vars qw($VERSION $REVISION $BLASTALL $BLASTCL3 $BL2SEQ %WU_BLAST
|
|
36 |
$FORMATDB %DEFAULT_FILTERS);
|
|
37 |
use IO::File;
|
|
38 |
use Carp qw(carp croak);
|
|
39 |
|
|
40 |
$VERSION = '0.67';
|
|
41 |
$REVISION = '# $Id: Blastall.pm,v 1.3 2002/01/15 20:09:08 jpearson Exp $';
|
|
42 |
|
|
43 |
# you can set these variables to the full path of the corresponding binaries
|
|
44 |
$BLASTALL = 'blastall';
|
|
45 |
$BLASTCL3 = 'blastcl3';
|
|
46 |
$BL2SEQ = 'bl2seq';
|
|
47 |
$FORMATDB = 'formatdb';
|
|
48 |
|
|
49 |
# you can adjust the locations of wu-blast programs
|
|
50 |
# for instance 'wu_blastn' => '/usr/local/wu/blastn',
|
|
51 |
# I haven't added support for wu-blastall (hopefully I will get to this)
|
|
52 |
%WU_BLAST = ('wu_blastn' => 'wu-blastn',
|
|
53 |
'wu_blastp' => 'wu-blastp',
|
|
54 |
'wu_blastx' => 'wu-blastx',
|
|
55 |
'wu_tblastn' => 'wu-tblastn',
|
|
56 |
'wu_tblastx' => 'wu-tblastx'
|
|
57 |
);
|
|
58 |
|
|
59 |
###########################################################################
|
|
60 |
# You probably shouldn't be mucking below here...
|
|
61 |
###########################################################################
|
|
62 |
%DEFAULT_FILTERS = ('id' => '=~'
|
|
63 |
,'defline' => '=~'
|
|
64 |
,'subject_length' => '>'
|
|
65 |
,'scores' => '>'
|
|
66 |
,'expects' => '<'
|
|
67 |
,'identities' => '>'
|
|
68 |
,'match_lengths' => '>'
|
|
69 |
,'subject_strands' => 'eq'
|
|
70 |
,'query_strands' => 'eq'
|
|
71 |
,'query_frames' => '=='
|
|
72 |
,'subject_frames' => '=='
|
|
73 |
);
|
|
74 |
|
|
75 |
sub new {
|
|
76 |
my $this = shift;
|
|
77 |
my %args = @_;
|
|
78 |
my $class = ref($this) || $this;
|
|
79 |
my $self = \%args;
|
|
80 |
bless $self, $class;
|
|
81 |
return($self);
|
|
82 |
}
|
|
83 |
sub blastall {
|
|
84 |
my $self = shift;
|
|
85 |
my $rh_opts = _get_opts(@_);
|
|
86 |
|
|
87 |
my ($ra_args,$tmp_file) = _get_args($rh_opts);
|
|
88 |
unshift @{$ra_args}, $BLASTALL;
|
|
89 |
|
|
90 |
my $rc = 0xffff & system(@{$ra_args});
|
|
91 |
if ($rc == 0) {
|
|
92 |
$self->{options} = $rh_opts;
|
|
93 |
push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
|
|
94 |
$self->{report_file} = $self->{options}->{o} || $tmp_file;
|
|
95 |
return(1);
|
|
96 |
} else {
|
|
97 |
foreach (@{$ra_args}) { print STDERR "$_ "; }
|
|
98 |
print STDERR "\n";
|
|
99 |
carp qq!the above system call to $BLASTALL failed.
|
|
100 |
Possible problems include
|
|
101 |
1. Is blastall in your path or is \$BLASTALL the full path to blastall?
|
|
102 |
2. Do you have the DATABASE installed
|
|
103 |
3. Are you pointing to the correct location of the query sequence
|
|
104 |
4. Is your DATABASE corrupted
|
|
105 |
5. Are you running the correct program for the type of database and
|
|
106 |
query sequence you are running. e.g. BLASTP -> PROTEIN
|
|
107 |
6. Perhaps try blastcl3 instead of blastall.
|
|
108 |
!;
|
|
109 |
return undef;
|
|
110 |
}
|
|
111 |
}
|
|
112 |
sub formatdb {
|
|
113 |
my $self = shift;
|
|
114 |
my $rh_opts = _get_opts(@_);
|
|
115 |
|
|
116 |
my $ra_args = _get_formatdb_args($rh_opts);
|
|
117 |
unshift @{$ra_args}, $FORMATDB;
|
|
118 |
|
|
119 |
my $rc = 0xffff & system(@{$ra_args});
|
|
120 |
if ($rc == 0) {
|
|
121 |
$self->{options} = $rh_opts;
|
|
122 |
my $ra_indexes = _get_formatdb_indexes($rh_opts);
|
|
123 |
push @{$self->{indexes}}, @{$ra_indexes};
|
|
124 |
#$self->{indexes} = _get_formatdb_indexes($rh_opts);
|
|
125 |
$self->{tmp_file} = [];
|
|
126 |
return(1);
|
|
127 |
} else {
|
|
128 |
foreach (@{$ra_args}) { print STDERR "$_ "; }
|
|
129 |
print STDERR "\n";
|
|
130 |
carp qq!the above system call to $FORMATDB failed.
|
|
131 |
Possible problems include
|
|
132 |
1. Is formatdb in your path or is \$FORMATDB the full path to formatdb?
|
|
133 |
!;
|
|
134 |
return undef;
|
|
135 |
}
|
|
136 |
}
|
|
137 |
sub remove_formatdb_indexes {
|
|
138 |
my $self = shift;
|
|
139 |
if ($self->{indexes}) {
|
|
140 |
foreach my $i (@{$self->{indexes}}) {
|
|
141 |
unlink $i or carp "could not remove $i:$!";
|
|
142 |
}
|
|
143 |
}
|
|
144 |
}
|
|
145 |
sub blast_one_to_many {
|
|
146 |
my $self = shift;
|
|
147 |
my $rh_opts = _get_opts(@_);
|
|
148 |
my ($ra_args,$tmp_file) = _get_args($rh_opts);
|
|
149 |
if ($rh_opts->{p} eq 'blastp' or $rh_opts->{p} eq 'blastx') {
|
|
150 |
$self->formatdb(p => 'T', i => $rh_opts->{d});
|
|
151 |
} else {
|
|
152 |
$self->formatdb(p => 'F', i => $rh_opts->{d});
|
|
153 |
}
|
|
154 |
$self->blastall($rh_opts);
|
|
155 |
$self->remove_formatdb_indexes();
|
|
156 |
}
|
|
157 |
sub bl2seq {
|
|
158 |
my $self = shift;
|
|
159 |
my $rh_opts = _get_opts(@_);
|
|
160 |
|
|
161 |
my ($ra_args,$tmp_file) = _get_args($rh_opts);
|
|
162 |
unshift @{$ra_args}, $BL2SEQ;
|
|
163 |
|
|
164 |
my $rc = 0xffff & system(@{$ra_args});
|
|
165 |
if ($rc == 0) {
|
|
166 |
$self->{options} = $rh_opts;
|
|
167 |
push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
|
|
168 |
$self->{report_file} = $self->{options}->{o} || $tmp_file;
|
|
169 |
return(1);
|
|
170 |
} else {
|
|
171 |
foreach (@{$ra_args}) { print STDERR "$_ "; }
|
|
172 |
print STDERR "\n";
|
|
173 |
carp qq!the above system call to $BL2SEQ failed.
|
|
174 |
Possible problems include
|
|
175 |
1. Is bl2seq in your path or is \$BL2SEQ the full path to bl2seq?
|
|
176 |
2. Are you running the correct program for the type of sequences you
|
|
177 |
are comparing? e.g. BLASTP -> PROTEIN
|
|
178 |
!;
|
|
179 |
return undef;
|
|
180 |
}
|
|
181 |
}
|
|
182 |
sub wu_blastall {
|
|
183 |
my $self = shift;
|
|
184 |
my $rh_opts = _get_opts(@_);
|
|
185 |
|
|
186 |
my ($ra_args,$tmp_file) = _get_wu_args($rh_opts);
|
|
187 |
my $command = join ' ', @{$ra_args};
|
|
188 |
my $output = $rh_opts->{o} || $tmp_file;
|
|
189 |
$command .= " > $output";
|
|
190 |
#print "$command\n";
|
|
191 |
|
|
192 |
my $rc = 0xffff & system("$command");
|
|
193 |
if ($rc == 0) {
|
|
194 |
$self->{options} = $rh_opts;
|
|
195 |
push @{$self->{tmp_file}}, $tmp_file; # record so we can DESTROY later
|
|
196 |
$self->{report_file} = $self->{options}->{o} || $tmp_file;
|
|
197 |
return(1);
|
|
198 |
} else {
|
|
199 |
print STDERR "$command\n";
|
|
200 |
carp qq!the above system call failed.!;
|
|
201 |
return undef;
|
|
202 |
}
|
|
203 |
}
|
|
204 |
sub net_blastall {
|
|
205 |
carp "net_blastall is depricated please use blastcl3 instead\n";
|
|
206 |
blastcl3(@_);
|
|
207 |
}
|
|
208 |
sub blastcl3 {
|
|
209 |
{
|
|
210 |
local $BLASTALL = $BLASTCL3;
|
|
211 |
blastall(@_);
|
|
212 |
}
|
|
213 |
}
|
|
214 |
sub mask {
|
|
215 |
my $self = shift;
|
|
216 |
my $rh_opts;
|
|
217 |
if (@_ > 1) {
|
|
218 |
my %opts = @_;
|
|
219 |
$rh_opts = \%opts;
|
|
220 |
} else {
|
|
221 |
$rh_opts = shift;
|
|
222 |
}
|
|
223 |
my ($rc);
|
|
224 |
# type can be wu_blastall, blastcl3, blastall or net_blastall(depricated)
|
|
225 |
my $type_of_blast = $rh_opts->{type} || 'blastall';
|
|
226 |
delete $rh_opts->{type};
|
|
227 |
my ($ra_args,$tmp_file) = _get_args($rh_opts);
|
|
228 |
unshift @{$ra_args}, $BLASTALL;
|
|
229 |
my $m = new NHGRI::Blastall;
|
|
230 |
$rh_opts->{o} = $rh_opts->{o} || $tmp_file;
|
|
231 |
if ($type_of_blast eq 'net_blastall') {
|
|
232 |
$m->net_blastall($rh_opts);
|
|
233 |
} elsif ($type_of_blast eq 'blastcl3') {
|
|
234 |
$m->blastcl3($rh_opts);
|
|
235 |
} elsif ($type_of_blast eq 'wu_blastall') {
|
|
236 |
$m->wu_blastall($rh_opts);
|
|
237 |
} else {
|
|
238 |
# 2002-01-12 JVP added "return undef unless"
|
|
239 |
return undef unless ($m->blastall($rh_opts));
|
|
240 |
}
|
|
241 |
my $report_file = $m->{report_file};
|
|
242 |
my $fa_file = $m->{options}->{i};
|
|
243 |
my($fa_line,$sequence) = _get_sequence($fa_file);
|
|
244 |
my($ra_mask_positions) = _get_mask_positions($report_file);
|
|
245 |
my $masked_sequence = _get_masked_sequence($sequence,$ra_mask_positions);
|
|
246 |
return ("$fa_line\n$masked_sequence", $ra_mask_positions) if (wantarray);
|
|
247 |
return ("$fa_line\n$masked_sequence");
|
|
248 |
}
|
|
249 |
sub print_report {
|
|
250 |
my $self = shift;
|
|
251 |
my $report_file = $self->{report_file};
|
|
252 |
return undef unless ($report_file);
|
|
253 |
open REPORT, $report_file or return(0);
|
|
254 |
my $report = join ('',<REPORT>);
|
|
255 |
print $report;
|
|
256 |
return;
|
|
257 |
}
|
|
258 |
sub filter {
|
|
259 |
my $self = shift;
|
|
260 |
my $rh_criteria;
|
|
261 |
if (@_ > 1) {
|
|
262 |
my %criteria = @_;
|
|
263 |
$rh_criteria = \%criteria;
|
|
264 |
} else {
|
|
265 |
$rh_criteria = shift;
|
|
266 |
}
|
|
267 |
|
|
268 |
my @filtered_results = ();
|
|
269 |
$self->_parse_report() unless ($self->{been_parsed});
|
|
270 |
|
|
271 |
my ($ra_stripped_values,$ra_cmp_ops) =
|
|
272 |
_get_stripped_values_and_comparison_operators($rh_criteria);
|
|
273 |
foreach my $entry (@{$self->{results}}) {
|
|
274 |
if ($entry = _entry_passes_filter($entry,$ra_stripped_values,
|
|
275 |
$ra_cmp_ops,$rh_criteria)) {
|
|
276 |
push @filtered_results,$entry;
|
|
277 |
}
|
|
278 |
}
|
|
279 |
$self->{results} = \@filtered_results;
|
|
280 |
return @filtered_results;
|
|
281 |
}
|
|
282 |
sub unfilter {
|
|
283 |
my $self = shift;
|
|
284 |
$self->_parse_report();
|
|
285 |
}
|
|
286 |
sub result {
|
|
287 |
my($self,@r) = @_;
|
|
288 |
my $ra_results = [];
|
|
289 |
|
|
290 |
$self->_parse_report() unless ($self->{been_parsed});
|
|
291 |
return $self->_all_results unless @r;
|
|
292 |
if (@r == 1) {
|
|
293 |
$ra_results = $self->_result_for_one(@r);
|
|
294 |
} elsif (@r == 2) {
|
|
295 |
$ra_results = $self->_result_for_two(@r);
|
|
296 |
} else {
|
|
297 |
return undef;
|
|
298 |
}
|
|
299 |
return(@{$ra_results});
|
|
300 |
}
|
|
301 |
sub read_report {
|
|
302 |
my $self = shift;
|
|
303 |
my $file = shift;
|
|
304 |
$self->{report_file} = $file;
|
|
305 |
}
|
|
306 |
sub get_database_description {
|
|
307 |
my $self = shift;
|
|
308 |
$self->_parse_report() unless ($self->{been_parsed});
|
|
309 |
return $self->{database_description};
|
|
310 |
}
|
|
311 |
sub get_database_sequence_count {
|
|
312 |
my $self = shift;
|
|
313 |
$self->_parse_report() unless ($self->{been_parsed});
|
|
314 |
return $self->{database_sequence_count};
|
|
315 |
}
|
|
316 |
sub get_database_letter_count {
|
|
317 |
my $self = shift;
|
|
318 |
$self->_parse_report() unless ($self->{been_parsed});
|
|
319 |
return $self->{database_letter_count};
|
|
320 |
}
|
|
321 |
sub get_blast_program {
|
|
322 |
my $self = shift;
|
|
323 |
$self->_parse_report() unless ($self->{been_parsed});
|
|
324 |
return $self->{program};
|
|
325 |
}
|
|
326 |
sub get_blast_version {
|
|
327 |
my $self = shift;
|
|
328 |
$self->_parse_report() unless ($self->{been_parsed});
|
|
329 |
return $self->{blast_version};
|
|
330 |
}
|
|
331 |
sub get_report {
|
|
332 |
my $self = shift;
|
|
333 |
return $self->{report_file};
|
|
334 |
}
|
|
335 |
sub DESTROY {
|
|
336 |
my $self = shift;
|
|
337 |
foreach (@{$self->{tmp_file}}) {
|
|
338 |
unlink $_ if (-e $_);
|
|
339 |
}
|
|
340 |
}
|
|
341 |
|
|
342 |
# END OF PUBLIC METHODS
|
|
343 |
# BEGIN PRIVATE METHODS
|
|
344 |
|
|
345 |
sub _get_opts {
|
|
346 |
my @args = @_;
|
|
347 |
my $rh_opts;
|
|
348 |
|
|
349 |
if (@args == 1) {
|
|
350 |
$rh_opts = shift;
|
|
351 |
} else {
|
|
352 |
my %opts = @args;
|
|
353 |
$rh_opts = \%opts;
|
|
354 |
}
|
|
355 |
return $rh_opts;
|
|
356 |
}
|
|
357 |
sub _get_formatdb_indexes {
|
|
358 |
my $rh_opts = shift;
|
|
359 |
my @indexes = ();
|
|
360 |
my $x = 'p';
|
|
361 |
my $name = $rh_opts->{i};
|
|
362 |
$name = $rh_opts->{n} if ($rh_opts->{n});
|
|
363 |
$x = 'n' if ($rh_opts->{p} =~ /f/i);
|
|
364 |
push @indexes, "${name}.${x}hr", "${name}.${x}in", "${name}.${x}sq";
|
|
365 |
if ($rh_opts->{o} && $rh_opts->{o} =~ /t/i) {
|
|
366 |
push @indexes, "${name}.${x}nd", "${name}.${x}ni",
|
|
367 |
"${name}.${x}sd", "${name}.${x}si";
|
|
368 |
}
|
|
369 |
return \@indexes;
|
|
370 |
}
|
|
371 |
sub _write_report_to_disk {
|
|
372 |
my $report_file = shift;
|
|
373 |
my $results = shift;
|
|
374 |
open OUT, ">$report_file" or croak "cannot open $report_file:$!";
|
|
375 |
print OUT $results;
|
|
376 |
}
|
|
377 |
|
|
378 |
sub _get_matrix_parameters {
|
|
379 |
return { PAM30 => [9, 1],
|
|
380 |
PAM70 => [10, 1],
|
|
381 |
BLOSUM45 => [14, 2],
|
|
382 |
BLOSUM80 => [10, 1],
|
|
383 |
BLOSUM62 => [11, 1],
|
|
384 |
BLOSUM45 => [14, 2]
|
|
385 |
};
|
|
386 |
}
|
|
387 |
sub _get_genetic_codes {
|
|
388 |
return { 1 => "Standard (1)",
|
|
389 |
2 => "Vertebrate Mitochondrial (2)",
|
|
390 |
3 => "Yeast Mitochondrial (3)",
|
|
391 |
4 => "Mold Mitochondrial; ... (4)",
|
|
392 |
5 => "Invertebrate Mitochondrial (5)",
|
|
393 |
6 => "Ciliate Nuclear; ... (6)",
|
|
394 |
9 => "Echinoderm Mitochondrial (9)",
|
|
395 |
10 => "Euplotid Nuclear (10)",
|
|
396 |
11 => "Bacterial (11)",
|
|
397 |
12 => "lternative Yeast Nuclear (12)",
|
|
398 |
13 => "Ascidian Mitochondrial (13)",
|
|
399 |
14 => "Flatworm Mitochondrial (14)",
|
|
400 |
15 => "Blepharisma Macronuclear (15)"
|
|
401 |
};
|
|
402 |
}
|
|
403 |
|
|
404 |
sub _get_masked_sequence {
|
|
405 |
my $sequence = shift;
|
|
406 |
my $ra_mask_positions = shift;
|
|
407 |
my ($i,$iplus);
|
|
408 |
return $sequence unless ($ra_mask_positions); # nothing masked
|
|
409 |
|
|
410 |
if ($ra_mask_positions->[0] &&
|
|
411 |
$ra_mask_positions->[0] > $ra_mask_positions->[1]) {
|
|
412 |
@{$ra_mask_positions} = reverse @{$ra_mask_positions};
|
|
413 |
}
|
|
414 |
|
|
415 |
for ($i=0; $i < @{$ra_mask_positions}; $i=$i+2) {
|
|
416 |
$iplus = ($i + 1);
|
|
417 |
$sequence = _mask_from_x_to_y($sequence,$ra_mask_positions->[$i],
|
|
418 |
$ra_mask_positions->[$iplus]);
|
|
419 |
}
|
|
420 |
return $sequence;
|
|
421 |
}
|
|
422 |
|
|
423 |
sub _mask_from_x_to_y {
|
|
424 |
my $sequence = shift;
|
|
425 |
my $x = shift;
|
|
426 |
my $y = shift;
|
|
427 |
my $length = ($y - $x);
|
|
428 |
substr($sequence, $x, $length) = ('N' x $length);
|
|
429 |
return($sequence);
|
|
430 |
}
|
|
431 |
|
|
432 |
|
|
433 |
sub _get_mask_positions {
|
|
434 |
my $report_file = shift;
|
|
435 |
my @mask_positions = ();
|
|
436 |
|
|
437 |
open REPORT, $report_file or carp "cannot open $report_file:$!";
|
|
438 |
while (<REPORT>) {
|
|
439 |
if (/^Query:\s+(\d+)\s+.*\s+(\d+)$/) {
|
|
440 |
push @mask_positions,($1,$2);
|
|
441 |
}
|
|
442 |
}
|
|
443 |
return(\@mask_positions);
|
|
444 |
}
|
|
445 |
|
|
446 |
|
|
447 |
sub _get_sequence {
|
|
448 |
my $fasta_file = shift;
|
|
449 |
my @fasta_lines = ();
|
|
450 |
|
|
451 |
open FASTA, $fasta_file or warn "cannot open $fasta_file:$!";
|
|
452 |
chomp(@fasta_lines = <FASTA>);
|
|
453 |
my $first_line = shift @fasta_lines;
|
|
454 |
my $seq = join '',@fasta_lines;
|
|
455 |
return ($first_line,$seq);
|
|
456 |
}
|
|
457 |
sub _result_for_one {
|
|
458 |
my $self = shift;
|
|
459 |
my $attribute = shift;
|
|
460 |
my $count = 0;
|
|
461 |
my $ra_results = [];
|
|
462 |
$attribute = _map_attribute($attribute);
|
|
463 |
foreach (@{$self->{results}}) {
|
|
464 |
if (ref($self->{results}->[$count]->{$attribute}) eq 'ARRAY'){
|
|
465 |
push @{$ra_results},@{$self->{results}->[$count]->{$attribute}};
|
|
466 |
} else {
|
|
467 |
push @{$ra_results},$self->{results}->[$count]->{$attribute};
|
|
468 |
}
|
|
469 |
$count++;
|
|
470 |
}
|
|
471 |
return($ra_results);
|
|
472 |
}
|
|
473 |
sub _result_for_two {
|
|
474 |
my $self = shift;
|
|
475 |
my $attribute = shift;
|
|
476 |
my $id = shift;
|
|
477 |
my $ra_results = [];
|
|
478 |
$attribute = _map_attribute($attribute);
|
|
479 |
for (my $i = 0; $i < @{$self->{results}}; $i++) {
|
|
480 |
next unless ($self->{results}->[$i]->{id} eq $id);
|
|
481 |
if (ref($self->{results}->[$i]->{$attribute}) eq 'ARRAY'){
|
|
482 |
push @{$ra_results},@{$self->{results}->[$i]->{$attribute}};
|
|
483 |
} else {
|
|
484 |
push @{$ra_results},$self->{results}->[$i]->{$attribute};
|
|
485 |
}
|
|
486 |
}
|
|
487 |
return($ra_results);
|
|
488 |
}
|
|
489 |
sub _map_attribute {
|
|
490 |
my $attribute = shift;
|
|
491 |
my %attribute_map = ('ids' => 'id',
|
|
492 |
'score' => 'scores',
|
|
493 |
'def_line' => 'defline',
|
|
494 |
'def_lines' => 'defline',
|
|
495 |
'def-line' => 'defline',
|
|
496 |
'def-lines' => 'defline',
|
|
497 |
'deflines' => 'defline',
|
|
498 |
'expect' => 'expects',
|
|
499 |
'identity' => 'identities',
|
|
500 |
'gap' => 'gaps',
|
|
501 |
'subject_lengths' => 'subject_length',
|
|
502 |
'subject-length' => 'subject_length',
|
|
503 |
'subject-lengths' => 'subject_length',
|
|
504 |
'subjectlength' => 'subject_length',
|
|
505 |
'subjectlengths' => 'subject_length',
|
|
506 |
'length' => 'subject_length',
|
|
507 |
'lengths' => 'subject_length',
|
|
508 |
'match_length' => 'match_lengths',
|
|
509 |
'matchlengths' => 'match_lengths',
|
|
510 |
'matchlength' => 'match_lengths',
|
|
511 |
'match-length' => 'match_lengths',
|
|
512 |
'match-lengths' => 'match_lengths',
|
|
513 |
'query-frame' => 'query_frames',
|
|
514 |
'queryframe' => 'query_frames',
|
|
515 |
'query-frames' => 'query_frames',
|
|
516 |
'queryframes' => 'query_frames',
|
|
517 |
'query_frame' => 'query_frames',
|
|
518 |
'query-frame' => 'query_frames',
|
|
519 |
'subjectframe' => 'subject_frames',
|
|
520 |
'subject-frames' => 'subject_frames',
|
|
521 |
'subjectframes' => 'subject_frames',
|
|
522 |
'subject_frame' => 'subject_frames',
|
|
523 |
);
|
|
524 |
|
|
525 |
$attribute = lc($attribute);
|
|
526 |
$attribute = $attribute_map{$attribute} || $attribute;
|
|
527 |
return($attribute);
|
|
528 |
}
|
|
529 |
sub _all_results {
|
|
530 |
my $self = shift;
|
|
531 |
return () unless defined($self) && $self->{results};
|
|
532 |
return () unless @{$self->{results}};
|
|
533 |
return @{$self->{results}};
|
|
534 |
}
|
|
535 |
sub _entry_passes_filter {
|
|
536 |
my $entry = shift;
|
|
537 |
my $ra_stripped_values = shift;
|
|
538 |
my $ra_cmp_ops = shift;
|
|
539 |
my $rh_criteria = shift;
|
|
540 |
my @fields = keys %$rh_criteria;
|
|
541 |
my ($i,$code,$pass,@codes,$ra_safe_vals);
|
|
542 |
|
|
543 |
for($i=0; $i < @$ra_stripped_values; $i++) {
|
|
544 |
if (ref($entry->{$fields[$i]}) eq "ARRAY") {
|
|
545 |
my $n = 0;
|
|
546 |
my (@sorted,$best_result);
|
|
547 |
foreach (@{$entry->{$fields[$i]}}) {
|
|
548 |
if ($ra_cmp_ops->[$i] =~ /\~/) {
|
|
549 |
s/"/'/g; # cannot nest double quotes in regex
|
|
550 |
$code = '$pass' . " = 1 if (\"$_\" $ra_cmp_ops->[$i] ";
|
|
551 |
$ra_stripped_values->[$n] =~ s/\//\\\//g;
|
|
552 |
$code .= "/$ra_stripped_values->[$n]/i)";
|
|
553 |
} else {
|
|
554 |
$ra_safe_vals = _check_for_unsafe_e($entry->{$fields[$i]});
|
|
555 |
if ($ra_cmp_ops->[$i] =~ />/) {
|
|
556 |
@sorted = sort { $b <=> $a } @{$ra_safe_vals};
|
|
557 |
} else {
|
|
558 |
@sorted = sort { $a <=> $b } @{$ra_safe_vals};
|
|
559 |
}
|
|
560 |
|
|
561 |
$best_result = shift @sorted;
|
|
562 |
$code = '$pass' . qq! = 1 if ("$best_result" !;
|
|
563 |
$code .= qq!$ra_cmp_ops->[$i] '$ra_stripped_values->[$i]')!;
|
|
564 |
}
|
|
565 |
_set_passing_positions_flags ($entry,
|
|
566 |
$ra_safe_vals,
|
|
567 |
$ra_cmp_ops->[$i],
|
|
568 |
$ra_stripped_values->[$i]);
|
|
569 |
push @codes, $code;
|
|
570 |
$n++;
|
|
571 |
}
|
|
572 |
foreach (@codes) {
|
|
573 |
$pass = 0;
|
|
574 |
eval $_;
|
|
575 |
carp "eval on $_ failed: $@" if $@;
|
|
576 |
}
|
|
577 |
return undef unless $pass;
|
|
578 |
} else {
|
|
579 |
if ($ra_cmp_ops->[$i] =~ /\~/) {
|
|
580 |
# cannot nest double quotes in regex
|
|
581 |
$entry->{$fields[$i]} =~ s/"/'/g;
|
|
582 |
$code = '$pass' . qq! = 1 if ("$entry->{$fields[$i]}" !;
|
|
583 |
$code .= "$ra_cmp_ops->[$i] ";
|
|
584 |
$ra_stripped_values =~ s/\//\\\//g;
|
|
585 |
$code .= "/$ra_stripped_values->[$i]/i)\;";
|
|
586 |
} else {
|
|
587 |
$entry->{$fields[$i]} =~ s/^e-(\d*)$/1e-$1/;
|
|
588 |
$code = '$pass' . qq! = 1 if ("$entry->{$fields[$i]}" !;
|
|
589 |
$code .= "$ra_cmp_ops->[$i] ";
|
|
590 |
$code .= qq!'$ra_stripped_values->[$i]')\;!;
|
|
591 |
}
|
|
592 |
$pass = 0;
|
|
593 |
|
|
594 |
eval $code;
|
|
595 |
return undef unless $pass;
|
|
596 |
croak "eval on $code failed: $@" if $@;
|
|
597 |
}
|
|
598 |
}
|
|
599 |
return $entry;
|
|
600 |
}
|
|
601 |
|
|
602 |
sub _set_passing_positions_flags {
|
|
603 |
my $entry = shift;
|
|
604 |
my $ra_safe_vals = shift;
|
|
605 |
my $cmp_op = shift;
|
|
606 |
my $cmp_val = shift;
|
|
607 |
my $count = 0;
|
|
608 |
my $code = '';
|
|
609 |
my @array_positions = ();
|
|
610 |
|
|
611 |
$entry->{passing_positions} = [];
|
|
612 |
foreach (@{$ra_safe_vals}) {
|
|
613 |
$code = qq^if ("$_" $cmp_op "$cmp_val") { ^;
|
|
614 |
$code .= q^ $entry->{passing_positions}->[$count] = "pass"; ^;
|
|
615 |
$code .= q^} else { ^;
|
|
616 |
$code .= q^ $entry->{passing_positions}->[$count] = "fail"; }^;
|
|
617 |
eval $code;
|
|
618 |
carp "eval on $code failed: $@" if $@;
|
|
619 |
$count++;
|
|
620 |
}
|
|
621 |
return;
|
|
622 |
}
|
|
623 |
###########################################################################
|
|
624 |
# sub _check_for_unsafe_e #
|
|
625 |
###########################################################################
|
|
626 |
# When sorting BLAST values that have an initial e (e.g. `e-122')
|
|
627 |
# they will not sort. so we need to put a 1 if front (e.g. `1e-122')
|
|
628 |
###########################################################################
|
|
629 |
sub _check_for_unsafe_e {
|
|
630 |
my $ra_unsafe_vals = shift;
|
|
631 |
my @safe_vals = ();
|
|
632 |
|
|
633 |
foreach (@{$ra_unsafe_vals}) {
|
|
634 |
$_ =~ s/^e-(\d*)$/1e-$1/;
|
|
635 |
push @safe_vals,$_;
|
|
636 |
}
|
|
637 |
return \@safe_vals;
|
|
638 |
}
|
|
639 |
|
|
640 |
sub _get_stripped_values_and_comparison_operators {
|
|
641 |
my $rh_criteria = shift;
|
|
642 |
my ($key,$value,@stripped_values,@cmp_ops);
|
|
643 |
my $count = 0;
|
|
644 |
|
|
645 |
while (($key,$value) = each %$rh_criteria) {
|
|
646 |
$key = lc $key;
|
|
647 |
($stripped_values[$count],$cmp_ops[$count]) =
|
|
648 |
_check_for_comparison_operators($key,$value);
|
|
649 |
$count++;
|
|
650 |
}
|
|
651 |
return(\@stripped_values,\@cmp_ops);
|
|
652 |
}
|
|
653 |
sub _check_for_comparison_operators {
|
|
654 |
my $key = shift;
|
|
655 |
my $value = shift;
|
|
656 |
my ($cmp_op,$new_value);
|
|
657 |
if ($value =~ /^(<=|>=|=~|~|!~|>|<|=|==)(.*)/) {
|
|
658 |
if ($1 eq '~') {
|
|
659 |
$cmp_op = '=~';
|
|
660 |
} elsif ($1 eq '=') {
|
|
661 |
$cmp_op = '==';
|
|
662 |
} else {
|
|
663 |
$cmp_op = $1;
|
|
664 |
}
|
|
665 |
$new_value = $2;
|
|
666 |
} else {
|
|
667 |
$cmp_op = $DEFAULT_FILTERS{$key};
|
|
668 |
$new_value = $value;
|
|
669 |
}
|
|
670 |
return($new_value,$cmp_op);
|
|
671 |
}
|
|
672 |
sub _parse_report {
|
|
673 |
my $self = shift;
|
|
674 |
my $report_file = $self->{report_file};
|
|
675 |
$self->{been_parsed} = 1;
|
|
676 |
$self->{results} = [];
|
|
677 |
unless ($self->{report_file_handle}) {
|
|
678 |
open REPORT, $report_file or carp "cannot open $report_file:$!";
|
|
679 |
$self->{report_file_handle} = \*REPORT;
|
|
680 |
}
|
|
681 |
my $hit_new_seq = 0;
|
|
682 |
my $hit_a_seq = 0;
|
|
683 |
my $hit_first_seq_line = 0;
|
|
684 |
my $hit_first_hsp = 0;
|
|
685 |
my $hit_summaries = 0;
|
|
686 |
my $hit_query = 0;
|
|
687 |
my $hit_database = 0;
|
|
688 |
my ($id,$defline,$subject_length,@scores,@expects,@identities,
|
|
689 |
@match_lengths,@query_starts,@subject_starts,$strand,$q_strand,
|
|
690 |
$last_end_point,$q_last_end_point,@subject_strands,
|
|
691 |
@query_strands,$query,$query_length,$e,@query_seqs,
|
|
692 |
@subject_seqs,$query_seq,$subject_seq,@query_frames,@subject_frames,
|
|
693 |
@query_ends, @subject_ends);
|
|
694 |
my $id_regex = $self->{-DB_ID_REGEX}
|
|
695 |
|| '[^\|]+(?:\|[^\|,\s]*){1,10}';
|
|
696 |
$id_regex = "($id_regex)"; # add capturing parens
|
|
697 |
while (<REPORT>) {
|
|
698 |
$hit_summaries = 1 if (/^Sequences producing significant alignments/);
|
|
699 |
$hit_a_seq = 1 if (/^>/);
|
|
700 |
|
|
701 |
# Gather Query defline and Length
|
|
702 |
if ($hit_query == 0 && /^Query=\s+(.*)/) {
|
|
703 |
$query = $1;
|
|
704 |
$hit_query = 1;
|
|
705 |
} elsif ($hit_query == 1) {
|
|
706 |
if (/\(([0-9,]+) letters\)/) {
|
|
707 |
$query_length = $1;
|
|
708 |
$query_length =~ s/,//g;
|
|
709 |
$hit_query = 0;
|
|
710 |
} else {
|
|
711 |
chomp;
|
|
712 |
$query .= $_;
|
|
713 |
}
|
|
714 |
}
|
|
715 |
# Gather Database information
|
|
716 |
if (/^Database:\s+(.*)/) {
|
|
717 |
$self->{database_description} = $1;
|
|
718 |
$hit_database = 1;
|
|
719 |
} elsif ($hit_database &&
|
|
720 |
/^\s+([\d,]+) sequences; ([\d,]+) total letters/) {
|
|
721 |
$self->{database_sequence_count} = $1;
|
|
722 |
$self->{database_letter_count} = $2;
|
|
723 |
$hit_database = 0;
|
|
724 |
} elsif ($hit_database && /^\s*(.*)$/) {
|
|
725 |
$self->{database_description} .= $1;
|
|
726 |
}
|
|
727 |
|
|
728 |
if (!$self->{program} && $_ =~ /^(T?BLAST[XNP]) ([\d.]+)?/) {
|
|
729 |
$self->{program} = $1;
|
|
730 |
$self->{blast_version} = $2;
|
|
731 |
}
|
|
732 |
$self->{options}->{p} = lc($1) if (/^(T?BLAST[XNP]?)/);
|
|
733 |
next unless ($hit_a_seq);
|
|
734 |
if (/^>$id_regex\s+(.*)/) {
|
|
735 |
if ($id) {
|
|
736 |
# catches the starting point for the previous reverse strand
|
|
737 |
push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
|
|
738 |
push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
|
|
739 |
push @subject_starts,$last_end_point if ($strand eq 'minus');
|
|
740 |
push @subject_ends,$last_end_point if ($strand eq 'plus');
|
|
741 |
push @subject_seqs,$subject_seq if ($subject_seq);
|
|
742 |
push @query_seqs,$query_seq if ($query_seq);
|
|
743 |
|
|
744 |
$self->_push_new_results($id,$defline,$subject_length,
|
|
745 |
$query,$query_length,
|
|
746 |
\@scores,\@expects,\@identities,\@match_lengths,
|
|
747 |
\@query_starts,\@query_ends,\@subject_starts,\@subject_ends,
|
|
748 |
\@subject_strands,\@query_strands,\@subject_seqs,
|
|
749 |
\@query_seqs,\@query_frames,\@subject_frames);
|
|
750 |
}
|
|
751 |
# initialize arrays for new sample
|
|
752 |
@scores = (); @expects = (); @identities = (); @match_lengths = ();
|
|
753 |
@query_starts = (); @query_ends = (); @subject_strands = ();
|
|
754 |
@query_strands = (); @subject_starts = (); @subject_ends = ();
|
|
755 |
$strand = ''; $q_strand = ''; $last_end_point = '';
|
|
756 |
$q_last_end_point = ''; $subject_seq = ''; $query_seq = '';
|
|
757 |
@subject_seqs = (); @query_seqs = (); @query_frames = ();
|
|
758 |
@subject_frames = (); $hit_new_seq = 1;
|
|
759 |
$id = $1;
|
|
760 |
$defline = $2;
|
|
761 |
} elsif (/\s+Length = (\d+)/) {
|
|
762 |
$hit_new_seq = 0;
|
|
763 |
$subject_length = $1;
|
|
764 |
} elsif ($hit_new_seq == 1) {
|
|
765 |
chomp();
|
|
766 |
$defline .= $_;
|
|
767 |
} elsif (/^\s*Score[^=]*=\s*([0-9\.-]*)[^E]*Expect[^=]*=\s*([0-9\.e-]*)/) {
|
|
768 |
# catches the starting point for the previous reverse strand
|
|
769 |
push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
|
|
770 |
push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
|
|
771 |
push @subject_starts,$last_end_point if ($strand eq 'minus');
|
|
772 |
push @subject_ends,$last_end_point if ($strand eq 'plus');
|
|
773 |
push @subject_seqs,$subject_seq if ($subject_seq);
|
|
774 |
push @query_seqs,$query_seq if ($query_seq);
|
|
775 |
$subject_seq = ''; $query_seq = '';
|
|
776 |
|
|
777 |
push @scores,$1;
|
|
778 |
push @expects,$2;
|
|
779 |
$hit_first_seq_line = 0;
|
|
780 |
} elsif (/^\s*Identities[^=]*=\s*(\d+)\/(\d+)/) {
|
|
781 |
my $identity = $1 / $2;
|
|
782 |
push @identities,$identity;
|
|
783 |
push @match_lengths,$2;
|
|
784 |
$hit_first_seq_line = 0;
|
|
785 |
} elsif (/^\s*Frame = ([-,+]\d)(?: \/ ([-,+]\d))?/) {
|
|
786 |
push @query_frames, $1;
|
|
787 |
push @subject_frames, $2 if ($2);
|
|
788 |
} elsif (/Query:\s+(\d+)\s+(\S+)\s+(\d+)/) {
|
|
789 |
if (($1 > $3) && $hit_first_seq_line == 0) {
|
|
790 |
$q_strand = 'minus';
|
|
791 |
push @query_ends,$1;
|
|
792 |
} elsif (($1 < $3) && $hit_first_seq_line == 0) {
|
|
793 |
$q_strand = 'plus';
|
|
794 |
push @query_starts,$1;
|
|
795 |
}
|
|
796 |
push @query_strands,$q_strand if ($hit_first_seq_line == 0);
|
|
797 |
$q_last_end_point = $3;
|
|
798 |
$query_seq .= $2;
|
|
799 |
$hit_first_seq_line++;
|
|
800 |
} elsif (/^Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)/) {
|
|
801 |
if (($1 > $3) && $hit_first_seq_line == 1) {
|
|
802 |
$strand = 'minus';
|
|
803 |
push @subject_ends,$1;
|
|
804 |
} elsif (($1 < $3) && $hit_first_seq_line == 1) {
|
|
805 |
$strand = 'plus';
|
|
806 |
push @subject_starts,$1;
|
|
807 |
}
|
|
808 |
push @subject_strands,$strand if ($hit_first_seq_line == 1);
|
|
809 |
$last_end_point = $3;
|
|
810 |
$subject_seq .= $2;
|
|
811 |
$hit_first_seq_line++;
|
|
812 |
}
|
|
813 |
}
|
|
814 |
# catch the last one.
|
|
815 |
my $ra_safe_expects = _check_for_unsafe_e(\@expects);
|
|
816 |
if ($id) {
|
|
817 |
# catches the starting point for the previous reverse strand
|
|
818 |
push @query_starts,$q_last_end_point if ($q_strand eq 'minus');
|
|
819 |
push @query_ends,$q_last_end_point if ($q_strand eq 'plus');
|
|
820 |
push @subject_starts,$last_end_point if ($strand eq 'minus');
|
|
821 |
push @subject_ends,$last_end_point if ($strand eq 'plus');
|
|
822 |
push @subject_seqs,$subject_seq if ($subject_seq);
|
|
823 |
push @query_seqs,$query_seq if ($query_seq);
|
|
824 |
$self->_push_new_results($id,$defline,$subject_length,
|
|
825 |
$query,$query_length,\@scores,$ra_safe_expects,\@identities,
|
|
826 |
\@match_lengths,\@query_starts,\@query_ends,\@subject_starts,
|
|
827 |
\@subject_ends,\@subject_strands,\@query_strands,
|
|
828 |
\@subject_seqs,\@query_seqs,\@query_frames,\@subject_frames);
|
|
829 |
}
|
|
830 |
}
|
|
831 |
|
|
832 |
sub _push_new_results {
|
|
833 |
my $self = shift;
|
|
834 |
my ($id,$defline,$subject_length,$query,$query_length,$ra_scores,
|
|
835 |
$ra_expects,$ra_identities,$ra_match_lengths,$ra_query_starts,
|
|
836 |
$ra_query_ends,$ra_subject_starts,$ra_subject_ends,$ra_subject_strands,
|
|
837 |
$ra_query_strands,$ra_subject_seqs,$ra_query_seqs,$ra_query_frames,
|
|
838 |
$ra_subject_frames) = @_;
|
|
839 |
|
|
840 |
my $rh_hash = {'id' =>$id
|
|
841 |
, 'defline' =>$defline
|
|
842 |
, 'subject_length' =>$subject_length
|
|
843 |
, 'query' =>$query
|
|
844 |
, 'query_length' =>$query_length
|
|
845 |
, 'scores' => [ @{$ra_scores} ]
|
|
846 |
, 'expects' => [ @{$ra_expects} ]
|
|
847 |
, 'identities' => [ @{$ra_identities} ]
|
|
848 |
, 'match_lengths' => [ @{$ra_match_lengths} ]
|
|
849 |
, 'query_starts' => [ @{$ra_query_starts} ]
|
|
850 |
, 'query_ends' => [ @{$ra_query_ends} ]
|
|
851 |
, 'subject_starts' => [ @{$ra_subject_starts} ]
|
|
852 |
, 'subject_ends' => [ @{$ra_subject_ends} ]
|
|
853 |
, 'subject_strands' => [ @{$ra_subject_strands} ]
|
|
854 |
, 'query_strands' => [ @{$ra_query_strands} ]
|
|
855 |
, 'subject_seqs' => [ @{$ra_subject_seqs} ]
|
|
856 |
, 'query_seqs' => [ @{$ra_query_seqs} ]
|
|
857 |
, 'query_frames' => []
|
|
858 |
, 'subject_frames' => []
|
|
859 |
};
|
|
860 |
# only true in blastx or tblastx
|
|
861 |
if ($ra_query_frames && $self->{options}->{p} =~ /blastx/i) {
|
|
862 |
push @{$rh_hash->{query_frames}}, @{$ra_query_frames};
|
|
863 |
}
|
|
864 |
# only true in tblastn
|
|
865 |
if ($ra_query_frames && $self->{options}->{p} =~ /tblastn/i) {
|
|
866 |
push @{$rh_hash->{subject_frames}}, @{$ra_query_frames};
|
|
867 |
}
|
|
868 |
# only true in tblastx
|
|
869 |
if ($self->{options}->{p} =~ /tblastx/i) {
|
|
870 |
push @{$rh_hash->{subject_frames}}, @{$ra_subject_frames};
|
|
871 |
}
|
|
872 |
push @{$self->{results}}, $rh_hash;
|
|
873 |
}
|
|
874 |
sub _get_args {
|
|
875 |
my $rh_opts = shift;
|
|
876 |
my $tmp_dir = _get_temp_directory();
|
|
877 |
my ($key,$value,$outfile);
|
|
878 |
my @args = ();
|
|
879 |
my $tmp_file = "";
|
|
880 |
while (($key,$value) = each %$rh_opts) {
|
|
881 |
push @args, "-$key$value";
|
|
882 |
$outfile = $value if ($key eq "o");
|
|
883 |
}
|
|
884 |
unless ($outfile) {
|
|
885 |
my $base = sprintf("%s/%s-%d-%d-0", $tmp_dir, ,"Blastall", $$, time());
|
|
886 |
$tmp_file = _generate_temp_file($base);
|
|
887 |
push @args, "-o$tmp_file";
|
|
888 |
}
|
|
889 |
return(\@args,$tmp_file);
|
|
890 |
}
|
|
891 |
sub _get_formatdb_args {
|
|
892 |
my $rh_opts = shift;
|
|
893 |
my @args = ();
|
|
894 |
foreach my $k (keys %{$rh_opts}) {
|
|
895 |
push @args, "-${k}$rh_opts->{$k}";
|
|
896 |
}
|
|
897 |
return \@args;
|
|
898 |
}
|
|
899 |
sub _get_wu_args {
|
|
900 |
my $rh_opts = shift;
|
|
901 |
my ($key,$value);
|
|
902 |
my $tmp_dir = _get_temp_directory();
|
|
903 |
my $base_name = sprintf("%s/%s-%d-%d-0", $tmp_dir, ,"Blastall", $$, time());
|
|
904 |
my $tmp_file = _generate_temp_file($base_name);
|
|
905 |
my $program = $WU_BLAST{$rh_opts->{p}} || $WU_BLAST{wu_blastn};
|
|
906 |
my $database = $rh_opts->{d} || 'nr';
|
|
907 |
my $infile = $rh_opts->{i};
|
|
908 |
my $outfile = $rh_opts->{o};
|
|
909 |
my @args = ($program, $database, $infile);
|
|
910 |
while (($key,$value) = each %${rh_opts}) {
|
|
911 |
next if ($key eq 'p' || $key eq 'd' || $key eq 'i' || $key eq 'o');
|
|
912 |
if ($value eq '!' || $value eq '') {
|
|
913 |
push @args, "-$key";
|
|
914 |
} else {
|
|
915 |
push @args, "-$key=$value";
|
|
916 |
}
|
|
917 |
}
|
|
918 |
# $rh_opts->{o} ? push @args, "> $rh_opts->{o}" : push @args, "> $tmp_file";
|
|
919 |
return(\@args,$tmp_file);
|
|
920 |
}
|
|
921 |
sub _get_temp_directory {
|
|
922 |
return($ENV{TMPDIR}) if ($ENV{TMPDIR});
|
|
923 |
return($ENV{TEMP}) if ($ENV{TEMP});
|
|
924 |
return($ENV{TMP}) if ($ENV{TMP});
|
|
925 |
return('/tmp') if (-d '/tmp' && -w '/tmp');
|
|
926 |
return('/var/tmp') if (-d '/var/tmp' && -w '/var/tmp');
|
|
927 |
return('/usr/tmp') if (-d '/usr/tmp' && -w '/usr/tmp');
|
|
928 |
return('/temp') if (-d '/temp' && -w '/temp');
|
|
929 |
return('.') if (-w '.');
|
|
930 |
$_ = 'Cannot find a place to write tempfiles. ';
|
|
931 |
$_ .= 'Please set the TMPDIR environmental variable. ';
|
|
932 |
return undef;
|
|
933 |
}
|
|
934 |
sub _generate_temp_file {
|
|
935 |
my $base_name = shift;
|
|
936 |
my $count = 0;
|
|
937 |
while ($count < 1000) {
|
|
938 |
return ($base_name) unless (-e $base_name); # don't clobber existing
|
|
939 |
$base_name =~ s/-?(\d+)?$/"-" . (1 + $1)/e;
|
|
940 |
$count++;
|
|
941 |
}
|
|
942 |
return undef;
|
|
943 |
}
|
|
944 |
|
|
945 |
1;
|
|
946 |
__END__
|
|
947 |
|
|
948 |
=head1 NAME
|
|
949 |
|
|
950 |
NHGRI::Blastall - Perl extension for running and parsing NCBI's BLAST 2.x
|
|
951 |
|
|
952 |
=head1 SYNOPSIS
|
|
953 |
|
|
954 |
=over 4
|
|
955 |
|
|
956 |
=head1 DESCRIPTION
|
|
957 |
|
|
958 |
If you have NCBI's BLAST2 or WU-BLAST installed locally and your
|
|
959 |
environment is already setup you can use Perl's object-oriented
|
|
960 |
capabilities to run your BLASTs. Also if you have a blastcl3 binary
|
|
961 |
from the toolkit (or binaries from our FTP site) you can run BLAST over
|
|
962 |
the network. There are also methods to blast single sequences against
|
|
963 |
each other using the bl2seq binaries (also in the toolkit and binaries).
|
|
964 |
You can blast one sequence against a library of sequences using the
|
|
965 |
blast_one_to_many method. You can format databases with formatdb method.
|
|
966 |
You can also have NHGRI::Blastall read existing BLAST
|
|
967 |
reports. If you have a database of repetitive DNA or other DNA you would
|
|
968 |
like to mask out, you can use the mask method to mask the data against
|
|
969 |
these databases. You can then use either the filter or result methods
|
|
970 |
to parse the report and access the various elements of the data.
|
|
971 |
|
|
972 |
|
|
973 |
=item RUNNING NEW BLASTS
|
|
974 |
|
|
975 |
use NHGRI::Blastall;
|
|
976 |
my $b = new NHGRI::Blastall;
|
|
977 |
# If you are running NCBI's Local BLAST
|
|
978 |
$b->blastall( p => 'blastn',
|
|
979 |
d => 'nr',
|
|
980 |
i => 'infile',
|
|
981 |
o => 'outfile'
|
|
982 |
);
|
|
983 |
# If you are running NCBI's blastcl3 network client
|
|
984 |
$b->blastcl3( p => 'blastn',
|
|
985 |
d => 'nr',
|
|
986 |
i => 'infile',
|
|
987 |
o => 'outfile'
|
|
988 |
);
|
|
989 |
# If you are running WU-BLAST locally
|
|
990 |
$b->wu_blastall( p => 'blastn',
|
|
991 |
d => 'nr',
|
|
992 |
nogap => '!', #use ! for arguments w/o parameter
|
|
993 |
i => 'infile',
|
|
994 |
o => 'outfile'
|
|
995 |
);
|
|
996 |
|
|
997 |
See BLASTALL for more info
|
|
998 |
|
|
999 |
=item BLASTING 2 SEQUENCES
|
|
1000 |
|
|
1001 |
use NHGRI::Blastall;
|
|
1002 |
my $b = new NHGRI::Blastall;
|
|
1003 |
$b->bl2seq(i => 'seq1',
|
|
1004 |
j => 'seq2',
|
|
1005 |
p => 'tblastx'
|
|
1006 |
);
|
|
1007 |
|
|
1008 |
See BL2SEQ for more info
|
|
1009 |
|
|
1010 |
=item BLASTING 1 SEQUENCE AGAINST A FASTA LIBRARY OF SEQUENCES
|
|
1011 |
|
|
1012 |
# a library is a FASTA file with multiple FASTA formatted sequences
|
|
1013 |
use NHGRI::Blastall;
|
|
1014 |
my $b = new NHGRI::Blastall;
|
|
1015 |
$b->blast_one_to_many(i => 'seq1',
|
|
1016 |
d => 'seq2.lib',
|
|
1017 |
p => 'tblastx',
|
|
1018 |
);
|
|
1019 |
|
|
1020 |
See BLAST_ONE_TO_MANY for more info
|
|
1021 |
|
|
1022 |
=item INITIALIZING EXISTING BLAST REPORTS
|
|
1023 |
|
|
1024 |
use NHGRI::Blastall;
|
|
1025 |
my $b = new NHGRI::Blastall;
|
|
1026 |
$b->read_report('/path/to/report');
|
|
1027 |
|
|
1028 |
=item MASKING SEQUENCES
|
|
1029 |
|
|
1030 |
use NHGRI::Blastall;
|
|
1031 |
my $b = new NHGRI::Blastall;
|
|
1032 |
$masked_seq = $b->mask( type => 'wu_blastall',
|
|
1033 |
p => 'blastn',
|
|
1034 |
d => 'alu',
|
|
1035 |
i => 'infile'
|
|
1036 |
);
|
|
1037 |
|
|
1038 |
See MASKING for more info
|
|
1039 |
|
|
1040 |
=item CREATING BLAST INDEXES
|
|
1041 |
|
|
1042 |
use NHGRI::Blastall;
|
|
1043 |
my $b = new NHGRI::Blastall;
|
|
1044 |
$b->formatdb( i => 'est',
|
|
1045 |
p => 'F',
|
|
1046 |
o => 'T',
|
|
1047 |
);
|
|
1048 |
|
|
1049 |
See FORMATDB for more info
|
|
1050 |
|
|
1051 |
=item PRINTING REPORTS
|
|
1052 |
|
|
1053 |
$b->print_report();
|
|
1054 |
# this method only opens the report and prints. It does not print
|
|
1055 |
# summary reports
|
|
1056 |
|
|
1057 |
=item FILTERING BLAST RESULTS
|
|
1058 |
|
|
1059 |
@hits = $b->filter( scores => '38.2',
|
|
1060 |
identities => '.98'
|
|
1061 |
);
|
|
1062 |
|
|
1063 |
# returns an array of hash references.
|
|
1064 |
See HASHREF for more info on manipulating the results.
|
|
1065 |
See FILTERING for more info on using the filter method
|
|
1066 |
|
|
1067 |
=item GETTING AT ELEMENTS
|
|
1068 |
|
|
1069 |
@ids = $b->result('id');
|
|
1070 |
@scores = $b->result('scores',$ids[0]); # second param must be an id
|
|
1071 |
|
|
1072 |
See RESULT for more info on using the result method
|
|
1073 |
See ELEMENTS for element names
|
|
1074 |
|
|
1075 |
=item GETTING AT ALL THE DATA
|
|
1076 |
|
|
1077 |
@results = $b->result(); # returns an array of hashes
|
|
1078 |
|
|
1079 |
See HASHREF for information on the array of hashes that is returned.
|
|
1080 |
See DUMP RESULTS to see how to work with the array of hashes
|
|
1081 |
|
|
1082 |
=item ADJUSTING THE DEFLINE REGEX
|
|
1083 |
|
|
1084 |
$b = new NHGRI::Blastall (-DB_ID_REGEX => '[^ ]+');
|
|
1085 |
|
|
1086 |
See DB_ID_REGEX for more info
|
|
1087 |
|
|
1088 |
=back
|
|
1089 |
|
|
1090 |
=head1 BLASTALL
|
|
1091 |
|
|
1092 |
This method provides a simple object oriented frontend to BLAST.
|
|
1093 |
This module works with either NCBI's blastall binary distributed with
|
|
1094 |
BLAST 2.x, WU-BLAST or over the web through NCBI's Web Site.
|
|
1095 |
The blastall function accepts as a parameter an anonymous hash with keys
|
|
1096 |
that are the command line options (See BLASTALL OPTIONS) and values
|
|
1097 |
which are the corresponding values to those options. You may want to
|
|
1098 |
set the BLASTALL variable in Blastall.pm to the full path of your
|
|
1099 |
`blastall' binary, especially if you will be running scripts as cron
|
|
1100 |
jobs or if blastall is not in the system path.
|
|
1101 |
|
|
1102 |
=head1 BLASTALL OPTIONS
|
|
1103 |
|
|
1104 |
For wu_blastall you need to use NCBI type switches for the following
|
|
1105 |
[CB<-i>] for infile
|
|
1106 |
[CB<-o>] for outfile
|
|
1107 |
[CB<-p>] for program
|
|
1108 |
[CB<-d>] for database
|
|
1109 |
the rest of the parameters MUST be the parameters available through WU-BLAST
|
|
1110 |
(e.g. -sump, -nogap -compat1.4, etc.)
|
|
1111 |
use a `!' to specify that an argument has no parameters. See the example
|
|
1112 |
at the top of the manpage.
|
|
1113 |
|
|
1114 |
These are the options that NCBI's blastall and binary accepts and these
|
|
1115 |
are the same options that are accepted by the blastall and blastcl3 methods.
|
|
1116 |
NOTE: You must set the proper environmental variables for the blastall
|
|
1117 |
method to work (BLASTDB,BLASTMAT).
|
|
1118 |
|
|
1119 |
=over 4
|
|
1120 |
|
|
1121 |
=item
|
|
1122 |
|
|
1123 |
B<p> => Program Name
|
|
1124 |
|
|
1125 |
=item
|
|
1126 |
|
|
1127 |
B<d> => Database default=nr
|
|
1128 |
|
|
1129 |
=item
|
|
1130 |
|
|
1131 |
B<i> => QueryFile
|
|
1132 |
|
|
1133 |
=item
|
|
1134 |
|
|
1135 |
B<e> => Expectation vaule (E) default=10.0
|
|
1136 |
|
|
1137 |
=item
|
|
1138 |
|
|
1139 |
B<m> => alignment view default=0
|
|
1140 |
0 = pairwise,
|
|
1141 |
1 = master-slave showing identities,
|
|
1142 |
2 = master-slave no identities,
|
|
1143 |
3 = flat master-slave, show identities,
|
|
1144 |
4 = flat master-slave, no identities,
|
|
1145 |
5 = master-slave no identities and blunt ends,
|
|
1146 |
6 = flat master-slave, no identities and blunt ends
|
|
1147 |
|
|
1148 |
=item
|
|
1149 |
|
|
1150 |
B<o> => BLAST report Output File default=stdout
|
|
1151 |
|
|
1152 |
=item
|
|
1153 |
|
|
1154 |
B<F> => Filter query sequence default=T
|
|
1155 |
(DUST with blastn, SEG with others)
|
|
1156 |
|
|
1157 |
=item
|
|
1158 |
|
|
1159 |
B<G> => Cost to open a gap default=0
|
|
1160 |
(zero invokes default behavior)
|
|
1161 |
|
|
1162 |
=item
|
|
1163 |
|
|
1164 |
B<E> => Cost to extend a gap default=0
|
|
1165 |
(zero invokes default behavior)
|
|
1166 |
|
|
1167 |
=item
|
|
1168 |
|
|
1169 |
B<X> => X dropoff value for gapped alignment (in bits) default=0
|
|
1170 |
(zero invokes default behavior)
|
|
1171 |
|
|
1172 |
=item
|
|
1173 |
|
|
1174 |
B<I> => Show GI's in deflines default=F
|
|
1175 |
|
|
1176 |
=item
|
|
1177 |
|
|
1178 |
B<q> => Penalty for a nucleotide mismatch (blastn only) default=-3
|
|
1179 |
|
|
1180 |
=item
|
|
1181 |
|
|
1182 |
B<r> => Reward for a nucleotide match (blastn only) default=1
|
|
1183 |
|
|
1184 |
=item
|
|
1185 |
|
|
1186 |
B<v> => Number of one-line descriptions (V) default=500
|
|
1187 |
|
|
1188 |
=item
|
|
1189 |
|
|
1190 |
B<b> => Number of alignments to show (B) default=250
|
|
1191 |
|
|
1192 |
=item
|
|
1193 |
|
|
1194 |
B<f> => Threshold for extending hits, default if zero default=0
|
|
1195 |
|
|
1196 |
=item
|
|
1197 |
|
|
1198 |
B<g> => Perfom gapped alignment (NA with tblastx) default=T
|
|
1199 |
|
|
1200 |
=item
|
|
1201 |
|
|
1202 |
B<Q> => Query Genetic code to use default=1
|
|
1203 |
|
|
1204 |
=item
|
|
1205 |
|
|
1206 |
B<D> => DB Genetic code (for tblast[nx] only) default=1
|
|
1207 |
|
|
1208 |
=item
|
|
1209 |
|
|
1210 |
B<a> => Number of processors to use default=1
|
|
1211 |
|
|
1212 |
=item
|
|
1213 |
|
|
1214 |
B<O> => SeqAlign file Optional
|
|
1215 |
|
|
1216 |
=item
|
|
1217 |
|
|
1218 |
B<J> => Believe the query defline default=F
|
|
1219 |
|
|
1220 |
=item
|
|
1221 |
|
|
1222 |
B<M> => Matrix default=BLOSUM62
|
|
1223 |
|
|
1224 |
=item
|
|
1225 |
|
|
1226 |
B<W> => Word size, default if zero default=0
|
|
1227 |
|
|
1228 |
=item
|
|
1229 |
|
|
1230 |
B<z> => Effective length of the database default=0
|
|
1231 |
(use zero for the real size)
|
|
1232 |
|
|
1233 |
=item
|
|
1234 |
|
|
1235 |
B<K> => Number of best hits from a region to keep default=100
|
|
1236 |
|
|
1237 |
=item
|
|
1238 |
|
|
1239 |
B<L> => Length of region used to judge hits default=20
|
|
1240 |
|
|
1241 |
=item
|
|
1242 |
|
|
1243 |
B<Y> => Effective length of the search space default=0
|
|
1244 |
(use zero for the real size)
|
|
1245 |
|
|
1246 |
=item
|
|
1247 |
|
|
1248 |
B<S> => Query strands to search against database default=3
|
|
1249 |
(for blast[nx], and tblastx).
|
|
1250 |
3 is both, 1 is top, 2 is bottom
|
|
1251 |
|
|
1252 |
=item
|
|
1253 |
|
|
1254 |
B<T> => Produce HTML output [T/F] default=F
|
|
1255 |
|
|
1256 |
=item
|
|
1257 |
|
|
1258 |
B<l> => Restrict search of database to list of GI's [String]
|
|
1259 |
|
|
1260 |
=back
|
|
1261 |
|
|
1262 |
NOTE: If you do not supply an `o' option (outfile),
|
|
1263 |
the following environment variables are checked in order:
|
|
1264 |
`TMPDIR', `TEMP', and `TMP'.
|
|
1265 |
If one of them is set, outfiles are created relative to the
|
|
1266 |
directory it specifies. If none of them are set, the first
|
|
1267 |
possible one of the following directories is used:
|
|
1268 |
/var/tmp , /usr/tmp , /temp , /tmp ,
|
|
1269 |
This file is deleted after the NHGRI::Blastall object is destroyed.
|
|
1270 |
It is recommended that you create a tmp directory in your home
|
|
1271 |
directory and set one of the above environmental vars to point
|
|
1272 |
to this directory and then set the permissions on this directory
|
|
1273 |
to 0700. Writing to a "public" tmp directory can have
|
|
1274 |
security ramifications.
|
|
1275 |
|
|
1276 |
=head1 BL2SEQ
|
|
1277 |
|
|
1278 |
This method uses the bl2seq binary (distributed with BLAST executables
|
|
1279 |
and source) to BLAST one sequence against another sequence.
|
|
1280 |
Like the blastall method the bl2seq method accepts the same options
|
|
1281 |
that the bl2seq binary accepts. Run bl2seq without options from the
|
|
1282 |
command line to get a full list of options. An important note about
|
|
1283 |
the options, when running blastx 1st sequence should be nucleotide;
|
|
1284 |
when running tblastn 2nd sequence should be nucleotide.
|
|
1285 |
|
|
1286 |
use NHGRI::Blastall;
|
|
1287 |
my $b = new NHGRI::Blastall;
|
|
1288 |
$b->bl2seq(i => 'seq1.nt',
|
|
1289 |
j => 'seq2.aa',
|
|
1290 |
p => 'blastx'
|
|
1291 |
);
|
|
1292 |
|
|
1293 |
|
|
1294 |
=head1 BLAST_ONE_TO_MANY
|
|
1295 |
|
|
1296 |
This method allows for blasting one sequence against a FASTA library
|
|
1297 |
of sequences. Behind the scenes, BLAST indexes are created (in the
|
|
1298 |
same directory as the FASTA library) using the provided FASTA library
|
|
1299 |
and the one sequence is used to search against this database.
|
|
1300 |
If the program completes successfully, the databases are removed.
|
|
1301 |
To compare two sequences, use the bl2seq method which is faster and
|
|
1302 |
less messy (no tmp indexes). This method accepts the same options
|
|
1303 |
as the blastall binary with the d option corresponding to the
|
|
1304 |
FASTA library.
|
|
1305 |
|
|
1306 |
use NHGRI::Blastall;
|
|
1307 |
my $b = new NHGRI::Blastall;
|
|
1308 |
$b->blast_one_to_many(i => 'seq.aa',
|
|
1309 |
d => 'seq.nt.lib',
|
|
1310 |
e => '0.001',
|
|
1311 |
p => 'tblastn',
|
|
1312 |
);
|
|
1313 |
|
|
1314 |
=head1 MASKING
|
|
1315 |
|
|
1316 |
Screens DNA sequences in fasta format against the database specified
|
|
1317 |
in the blastall 'd' option. The mask method accepts the same parameters
|
|
1318 |
as the blastall method. Any matches to the masking database will be
|
|
1319 |
substituted with "N"s. The mask method returns the masked sequence.
|
|
1320 |
Performs similar function as xblast, an old NCBI program written in C.
|
|
1321 |
|
|
1322 |
Set the type parameter to wu_blastall, blastcl3 or blastall
|
|
1323 |
depending on your configuration.
|
|
1324 |
|
|
1325 |
$masked_seq = $b->mask( type => 'blastcl3', # defaults to blastall
|
|
1326 |
p => 'blastn',
|
|
1327 |
d => 'alu',
|
|
1328 |
i => 'infile'
|
|
1329 |
);
|
|
1330 |
|
|
1331 |
To get the mask coordinates back call the mask method in an array
|
|
1332 |
context.
|
|
1333 |
|
|
1334 |
@mask = $b->mask(p => 'blastn',
|
|
1335 |
d => 'alu',
|
|
1336 |
i => 'infile'
|
|
1337 |
);
|
|
1338 |
$masked_seq = $mask[0]; # same as above masked seq
|
|
1339 |
$ra_masked_coords = $mask[1]; # reference to array of mask coordinates
|
|
1340 |
|
|
1341 |
=head1 FORMATDB
|
|
1342 |
|
|
1343 |
This method creates BLAST indexes using the formatdb binary which
|
|
1344 |
is distributed with BLAST. It accepts the same parameters as formatdb.
|
|
1345 |
The remove_formatdb_indexes method will remove databases created
|
|
1346 |
using the formatdb method (if called by the same object).
|
|
1347 |
formatdb leaves a file called formatdb.log by default in the current
|
|
1348 |
working directory (if it has permission). To change this behavior,
|
|
1349 |
use the l option to direct the sequence to /dev/null or somewhere else.
|
|
1350 |
|
|
1351 |
use NHGRI::Blastall;
|
|
1352 |
my $b = new NHGRI::Blastall;
|
|
1353 |
$b->formatdb( i => 'swissprot',
|
|
1354 |
p => 'T',
|
|
1355 |
l => '/dev/null',
|
|
1356 |
o => 'T',
|
|
1357 |
);
|
|
1358 |
|
|
1359 |
|
|
1360 |
=head1 DB_ID_REGEX
|
|
1361 |
|
|
1362 |
By default Blastall.pm expects FASTA deflines of BLAST databases to be
|
|
1363 |
formatted like Genbank database (gi|GINUMBER|DATABASE|ACCESSION|SUMMARY).
|
|
1364 |
The default regular expression is [^\|]+(?:\|[^\|,\s]*){1,10}
|
|
1365 |
When using non-genbankformatted deflines, it may become necessary to
|
|
1366 |
adjust the regular expression that identifies the unique identifier
|
|
1367 |
in a defline. This can be done with the -DB_ID_REGEX parameter to the
|
|
1368 |
new method. For example
|
|
1369 |
|
|
1370 |
$b = new NHGRI::Blastall( -DB_ID_REGEX => '[^ ]+' );
|
|
1371 |
|
|
1372 |
=head1 FILTERING
|
|
1373 |
|
|
1374 |
The filter method accepts an anonymous hash in which the keys are
|
|
1375 |
elements of the blast report and the values are limits that
|
|
1376 |
are put on the result set.
|
|
1377 |
|
|
1378 |
The following are the Filter elements and their default operations.
|
|
1379 |
|
|
1380 |
id => regular expression match
|
|
1381 |
defline => regular expression match
|
|
1382 |
subject_length => greater than
|
|
1383 |
scores => greater than
|
|
1384 |
expects => less than
|
|
1385 |
identities => greater than
|
|
1386 |
match_length => greater than
|
|
1387 |
subject_strand => equals
|
|
1388 |
query_frames => equals
|
|
1389 |
subject_frames => equals
|
|
1390 |
|
|
1391 |
so if you would like to limit your results to entries that have scores
|
|
1392 |
greater than 38.2 and identities greater than 98% you would say...
|
|
1393 |
|
|
1394 |
@hits = $b->filter( scores => '38.2',
|
|
1395 |
identities => '.98'
|
|
1396 |
);
|
|
1397 |
|
|
1398 |
you can also override the defaults. if you would like only scores
|
|
1399 |
that are less than 38.2 you could say...
|
|
1400 |
|
|
1401 |
@hits = $b->filter( scores => '<38.2' );
|
|
1402 |
|
|
1403 |
or if you wanted only identities that were equal to 1 and you didn't
|
|
1404 |
care about the hits array you could say...
|
|
1405 |
|
|
1406 |
$b->filter( identities => '=1' );
|
|
1407 |
|
|
1408 |
Regular expression matches are case insensitive. If you wanted
|
|
1409 |
only records with the word "human" in the defline you could say...
|
|
1410 |
|
|
1411 |
@hits = $b->filter( defline => 'HuMaN' );
|
|
1412 |
|
|
1413 |
After you run the filter method on an object the object only contains
|
|
1414 |
those results which passed the filter. This will effect additional
|
|
1415 |
calls to the filter method as well as calls to other methods
|
|
1416 |
(e.g. result). To reset the NHGRI::Blastall object you can use
|
|
1417 |
the unfilter method.
|
|
1418 |
|
|
1419 |
$b->unfilter();
|
|
1420 |
|
|
1421 |
See DUMP RESULTS for info on how to manipulate the array of hash refs.
|
|
1422 |
|
|
1423 |
|
|
1424 |
=head1 RESULT
|
|
1425 |
|
|
1426 |
|
|
1427 |
The result method has 3 possible invocations. The first invocation
|
|
1428 |
is when it is called without parameters.
|
|
1429 |
|
|
1430 |
@results = $b->result();
|
|
1431 |
|
|
1432 |
This invocation returns an array of hash references.
|
|
1433 |
See HASHREF for further explanation of this structure.
|
|
1434 |
|
|
1435 |
To get a list of all the ids do...
|
|
1436 |
|
|
1437 |
@ids = $b->result('id');
|
|
1438 |
|
|
1439 |
These ids can be used to get at specific elements. If 2 parameters
|
|
1440 |
are present and the first one is an element (See ELEMENTS for a list
|
|
1441 |
of ELEMENTS) and the second one is an id then the routine will
|
|
1442 |
return a list of elements corresponding to the id.
|
|
1443 |
|
|
1444 |
@scores = $b->result('scores',$ids[0]); # second param must be an id
|
|
1445 |
|
|
1446 |
If more than 2 elements are passed the function will return undef.
|
|
1447 |
|
|
1448 |
=head1 ACCESSOR METHODS
|
|
1449 |
|
|
1450 |
=over 4
|
|
1451 |
|
|
1452 |
=item B<get_report>
|
|
1453 |
|
|
1454 |
returns the filename of the BLAST report.
|
|
1455 |
|
|
1456 |
=item B<get_database_description>
|
|
1457 |
|
|
1458 |
returns description given to the database during formatting of db.
|
|
1459 |
e.g. All non-redundant GenBank CDStranslations+PDB+SwissProt+PIR
|
|
1460 |
|
|
1461 |
=item B<get_database_sequence_count>
|
|
1462 |
|
|
1463 |
returns the number of sequences in the database.
|
|
1464 |
|
|
1465 |
=item B<get_database_letter_count>
|
|
1466 |
|
|
1467 |
returns the number of total letters in the database.
|
|
1468 |
|
|
1469 |
=item B<get_blast_program>
|
|
1470 |
|
|
1471 |
returns the BLAST program name that appears at the top of the report.
|
|
1472 |
either BLASTN, BLASTP, BLASTX, TBLASTN or TBLASTX
|
|
1473 |
|
|
1474 |
=item B<get_blast_version>
|
|
1475 |
|
|
1476 |
returns the version of the BLAST program that was used.
|
|
1477 |
|
|
1478 |
=back
|
|
1479 |
|
|
1480 |
=head1 ELEMENTS
|
|
1481 |
|
|
1482 |
=over 4
|
|
1483 |
|
|
1484 |
=item B<id>
|
|
1485 |
|
|
1486 |
an example of an id is `>gb|U19386|MMU19386' the initial `>'
|
|
1487 |
is just a flag. The next characters up until the first pipe
|
|
1488 |
is the database the subject was taken from. The next characters
|
|
1489 |
up to the next pipe is the Genbank accession number. The last
|
|
1490 |
characters are the locus. This element is used as a unique
|
|
1491 |
identifier by the NHGRI::Blastall module.
|
|
1492 |
(SCALAR)
|
|
1493 |
|
|
1494 |
=item B<defline>
|
|
1495 |
|
|
1496 |
The definition line taken from the subject
|
|
1497 |
(SCALAR)
|
|
1498 |
|
|
1499 |
=item B<subject_length>
|
|
1500 |
|
|
1501 |
This is the length of the full subject, not the
|
|
1502 |
length of the match.
|
|
1503 |
(SCALAR)
|
|
1504 |
|
|
1505 |
=item B<scores>
|
|
1506 |
|
|
1507 |
This is score (in bits) of the match.
|
|
1508 |
(ARRAY)
|
|
1509 |
|
|
1510 |
=item B<expects>
|
|
1511 |
|
|
1512 |
This is the statistical significance (`E value') for the match.
|
|
1513 |
(ARRAY)
|
|
1514 |
|
|
1515 |
=item B<identities>
|
|
1516 |
|
|
1517 |
This is the number of identities divided by the match
|
|
1518 |
length in decimal format. (Listed as a fraction and a percentage
|
|
1519 |
in a BLAST report.)
|
|
1520 |
(ARRAY)
|
|
1521 |
|
|
1522 |
=item B<match_lengths>
|
|
1523 |
|
|
1524 |
this is the number of base pairs that match up.
|
|
1525 |
(ARRAY)
|
|
1526 |
|
|
1527 |
=item B<query_starts>
|
|
1528 |
|
|
1529 |
This is the number of the first base which matched
|
|
1530 |
with the subject.
|
|
1531 |
(ARRAY)
|
|
1532 |
|
|
1533 |
=item B<query_ends>
|
|
1534 |
|
|
1535 |
This is the number of the last base which matched
|
|
1536 |
with the subject.
|
|
1537 |
(ARRAY)
|
|
1538 |
|
|
1539 |
=item B<subject_starts>
|
|
1540 |
|
|
1541 |
This is the number of the first base which matched
|
|
1542 |
with the query.
|
|
1543 |
(ARRAY)
|
|
1544 |
|
|
1545 |
=item B<subject_ends>
|
|
1546 |
|
|
1547 |
This is the number of the last base which matched
|
|
1548 |
with the query.
|
|
1549 |
(ARRAY)
|
|
1550 |
|
|
1551 |
=item B<subject_strands>
|
|
1552 |
|
|
1553 |
This is either plus or minus depending on the orientation
|
|
1554 |
of the subject sequence in the match.
|
|
1555 |
(ARRAY)
|
|
1556 |
|
|
1557 |
=item B<query_strands>
|
|
1558 |
|
|
1559 |
This is either plus or minus depending on the orientation
|
|
1560 |
of the query sequence in the match.
|
|
1561 |
(ARRAY)
|
|
1562 |
|
|
1563 |
=item B<query_frames>
|
|
1564 |
|
|
1565 |
If you are running a blastx or tblastx search in which the
|
|
1566 |
query_sequence is translated this is the frame the query
|
|
1567 |
sequence matched.
|
|
1568 |
(ARRAY)
|
|
1569 |
|
|
1570 |
=item B<subject_frames>
|
|
1571 |
|
|
1572 |
If you are running a tblastn or tblastx search in which the
|
|
1573 |
subject sequence is translated, this is the frame where the
|
|
1574 |
subject sequence matched.
|
|
1575 |
(ARRAY)
|
|
1576 |
|
|
1577 |
=back
|
|
1578 |
|
|
1579 |
=head1 HASHREF
|
|
1580 |
|
|
1581 |
Each hash ref contains an id, defline and subject Length. Because
|
|
1582 |
there can be multiple scores, expect values, Identities, match_lengths,
|
|
1583 |
query_starts, query_strands and subject_starts, these are stored
|
|
1584 |
as array references. The following is an array containing two hash
|
|
1585 |
refs.
|
|
1586 |
|
|
1587 |
@hits = (
|
|
1588 |
{'id' => '>gb|U79716|HSU79716',
|
|
1589 |
'defline' => 'Human reelin (RELN) mRNA, complete cds',
|
|
1590 |
'subject_length' => '11580',
|
|
1591 |
'scores' => [ 684, 123 ],
|
|
1592 |
'expects' => [ 0.0, 3e-26 ],
|
|
1593 |
'identities' => [ .99430199, .89256198 ],
|
|
1594 |
'match_lengths' => [ 351, 121 ],
|
|
1595 |
'query_starts' => [ 3, 404 ],
|
|
1596 |
'query_ends' => [ 303, 704 ],
|
|
1597 |
'subject_starts' => [ 5858, 6259 ],
|
|
1598 |
'subject_ends' => [ 6158, 6559 ],
|
|
1599 |
'subject_strands' => [ 'plus', 'minus' ],
|
|
1600 |
'query_strands' => [ 'plus', 'plus' ],
|
|
1601 |
'query_frames' => [ '+1', '-3' ],
|
|
1602 |
'subject_frames' => [ '+2', '-1' ],
|
|
1603 |
},
|
|
1604 |
{'id' => '>gb|U24703|MMU24703',
|
|
1605 |
'defline' => 'Mus musculus reelin mRNA, complete cds',
|
|
1606 |
'subject_length' => '11673',
|
|
1607 |
'scores' => [ 319, 38.2 ],
|
|
1608 |
'expects' => [ 2e-85, 1.2 ],
|
|
1609 |
'identities' => [ .86455331, 1 ],
|
|
1610 |
'match_lengths' => [ 347, 19 ],
|
|
1611 |
'query_starts' => [ 3, 493 ],
|
|
1612 |
'query_ends' => [ 303, 793 ],
|
|
1613 |
'subject_starts' => [ 5968, 6457 ]
|
|
1614 |
'subject_ends' => [ 6268, 6757 ],
|
|
1615 |
'subject_strands' => [ 'plus', 'minus' ],
|
|
1616 |
'query_strands' => [ 'plus', 'plus' ],
|
|
1617 |
'query_frames' => [ '+3', '-3' ],
|
|
1618 |
'subject_frames' => [ '+1', '-2' ],
|
|
1619 |
}
|
|
1620 |
);
|
|
1621 |
|
|
1622 |
See ELEMENTS for explanation of each element.
|
|
1623 |
See DUMP RESULTS and/or the perlref(1) manpage for clues on working
|
|
1624 |
with this structure.
|
|
1625 |
|
|
1626 |
=head1 DUMP RESULTS
|
|
1627 |
|
|
1628 |
When calling the result function or with no parameters, or calling the
|
|
1629 |
filter function, an array of references to hashes is returned.
|
|
1630 |
Each elment of the array is a reference to a hash containing 1 record.
|
|
1631 |
See HASHREF for details on this structure. The following
|
|
1632 |
routine will go through each element of the array of hashes and
|
|
1633 |
then print out the element and it's corresponding value or values.
|
|
1634 |
See perlref(1) for more info on references.
|
|
1635 |
|
|
1636 |
sub dump_results {
|
|
1637 |
foreach $rh_r (@results) {
|
|
1638 |
while (($key,$value) = each %$rh_r) {
|
|
1639 |
if (ref($value) eq "ARRAY") {
|
|
1640 |
print "$key: ";
|
|
1641 |
foreach $v (@$value) {
|
|
1642 |
print "$v ";
|
|
1643 |
}
|
|
1644 |
print "\n";
|
|
1645 |
} else {
|
|
1646 |
print "$key: $value\n";
|
|
1647 |
}
|
|
1648 |
}
|
|
1649 |
}
|
|
1650 |
}
|
|
1651 |
|
|
1652 |
=head1 AUTHOR
|
|
1653 |
|
|
1654 |
=over 4
|
|
1655 |
|
|
1656 |
=item
|
|
1657 |
|
|
1658 |
Joseph Ryan (jfryan@nhgri.nih.gov)
|
|
1659 |
|
|
1660 |
=back
|
|
1661 |
|
|
1662 |
=head1 CONTACT ADDRESS
|
|
1663 |
|
|
1664 |
If you have problems, questions, comments send to webblaster@nhgri.nih.gov
|
|
1665 |
|
|
1666 |
=head1 COPYRIGHT INFORMATION
|
|
1667 |
|
|
1668 |
This software/database is "United States Government Work" under the
|
|
1669 |
terms of the United States Copyright Act. It was written as part of
|
|
1670 |
the authors' official duties for the United States Government and thus
|
|
1671 |
cannot be copyrighted. This software/database is freely available to
|
|
1672 |
the public for use without a copyright notice. Restrictions cannot
|
|
1673 |
be placed on its present or future use.
|
|
1674 |
|
|
1675 |
Although all reasonable efforts have been taken to ensure the
|
|
1676 |
accuracy and reliability of the software and data, the National
|
|
1677 |
Human Genome Research Institute (NHGRI) and the U.S. Government
|
|
1678 |
does not and cannot warrant the performance or results that may be
|
|
1679 |
obtained by using this software or data. NHGRI and the U.S.
|
|
1680 |
Government disclaims all warranties as to performance,
|
|
1681 |
merchantability or fitness for any particular purpose.
|
|
1682 |
|
|
1683 |
In any work or product derived from this material, proper
|
|
1684 |
attribution of the authors as the source of the software or
|
|
1685 |
data should be made, using http://genome.nhgri.nih.gov/blastall
|
|
1686 |
as the citation.
|
|
1687 |
|
|
1688 |
=head1 ENVIRONMENT VARIABLES
|
|
1689 |
|
|
1690 |
=over 4
|
|
1691 |
|
|
1692 |
=item B<BLASTDB>
|
|
1693 |
|
|
1694 |
location of BLAST formated databases
|
|
1695 |
|
|
1696 |
=item B<BLASTMAT>
|
|
1697 |
|
|
1698 |
location of BLAST matrices
|
|
1699 |
|
|
1700 |
=item B<TMPDIR> B<TEMP> B<TMP>
|
|
1701 |
|
|
1702 |
If the `o' option is not passed to the blastall method than NHGRI::Blastall
|
|
1703 |
looks for one of these vars (in order) to store the BLAST report. This
|
|
1704 |
report is destroyed after the NHGRI::Blastall.pm object is destroyed.
|
|
1705 |
|
|
1706 |
=back
|
|
1707 |
|
|
1708 |
=head1 SEE ALSO
|
|
1709 |
|
|
1710 |
L<perl(1)> L<perlref(1)>
|
|
1711 |
|
|
1712 |
http://www.ncbi.nlm.nih.gov/BLAST/newblast.html
|
|
1713 |
|
|
1714 |
ftp://ncbi.nlm.nih.gov/blast/db/README
|
|
1715 |
|
|
1716 |
http://www.ncbi.nlm.nih.gov/BLAST/tutorial/Altschul-1.html
|
|
1717 |
|
|
1718 |
=cut
|