New upstream version 3.9.1+dfsg
Sascha Steinbiss
6 years ago
0 | 0 | name = Bio-Roary |
1 | version = 3.9.0 | |
1 | version = 3.9.1 | |
2 | 2 | author = Andrew J. Page <ap13@sanger.ac.uk> |
3 | 3 | license = GPL_3 |
4 | 4 | copyright_holder = Wellcome Trust Sanger Institute |
46 | 46 | has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 ); |
47 | 47 | has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 ); |
48 | 48 | has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 ); |
49 | has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); | |
49 | 50 | has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 ); |
50 | 51 | has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); |
51 | 52 | has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); |
70 | 71 | $job_runner, $makeblastdb_exec, $mcxdeblast_exec, $mcl_exec, $blastp_exec, |
71 | 72 | $apply_unknowns_filter, $cpus, $output_multifasta_files, $verbose_stats, $translation_table, |
72 | 73 | $run_qc, $core_definition, $help, $kraken_db, $cmd_version, |
73 | $mafft, $output_directory, $check_dependancies, $inflation_value, | |
74 | $mafft, $output_directory, $check_dependancies, $inflation_value, $allow_paralogs, | |
74 | 75 | ); |
75 | 76 | |
76 | 77 | GetOptionsFromArray( |
97 | 98 | 'cd|core_definition=f' => \$core_definition, |
98 | 99 | 'v|verbose' => \$verbose, |
99 | 100 | 'n|mafft' => \$mafft, |
101 | 'ap|allow_paralogs' => \$allow_paralogs, | |
100 | 102 | 'k|kraken_db=s' => \$kraken_db, |
101 | 103 | 'w|version' => \$cmd_version, |
102 | 104 | 'a|check_dependancies' => \$check_dependancies, |
301 | 303 | core_definition => $self->core_definition, |
302 | 304 | verbose => $self->verbose, |
303 | 305 | mafft => $self->mafft, |
304 | inflation_value => $self->inflation_value, | |
306 | allow_paralogs => $self->allow_paralogs, | |
307 | inflation_value => $self->inflation_value, | |
305 | 308 | ); |
306 | 309 | $pan_genome_obj->run(); |
307 | 310 | |
342 | 345 | -r create R plots, requires R and ggplot2 |
343 | 346 | -s dont split paralogs |
344 | 347 | -t INT translation table [11] |
348 | -ap allow paralogs in core alignment | |
345 | 349 | -z dont delete intermediate files |
346 | 350 | -v verbose output to STDOUT |
347 | 351 | -w print version and exit |
348 | 352 | -y add gene inference information to spreadsheet, doesnt work with -e |
349 | -iv STR Change the MCL inflation value [1.5] | |
353 | -iv STR Change the MCL inflation value [1.5] | |
350 | 354 | -h this help message |
351 | 355 | |
352 | 356 | Example: Quickly generate a core gene alignment using 8 threads |
26 | 26 | has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'core_gene_alignment.aln' ); |
27 | 27 | has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); |
28 | 28 | has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 ); |
29 | has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); | |
29 | 30 | has '_error_message' => ( is => 'rw', isa => 'Str' ); |
30 | 31 | has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); |
31 | 32 | |
32 | 33 | sub BUILD { |
33 | 34 | my ($self) = @_; |
34 | 35 | |
35 | my ( $multifasta_base_directory, $spreadsheet_filename, $output_filename, $core_definition,$verbose, $help, $mafft, $dont_delete_files ); | |
36 | my ( $multifasta_base_directory, $spreadsheet_filename, $output_filename, $core_definition,$verbose, $help, $mafft, $allow_paralogs, $dont_delete_files ); | |
36 | 37 | |
37 | 38 | GetOptionsFromArray( |
38 | 39 | $self->args, |
41 | 42 | 'o|output_filename=s' => \$output_filename, |
42 | 43 | 'cd|core_definition=f' => \$core_definition, |
43 | 44 | 'z|dont_delete_files' => \$dont_delete_files, |
45 | 'p|allow_paralogs' => \$allow_paralogs, | |
44 | 46 | 'v|verbose' => \$verbose, |
45 | 47 | 'h|help' => \$help, |
46 | 48 | ); |
50 | 52 | $self->logger->level(10000); |
51 | 53 | } |
52 | 54 | $self->help($help) if(defined($help)); |
55 | $self->allow_paralogs($allow_paralogs) if(defined($allow_paralogs)); | |
53 | 56 | |
54 | 57 | if ( defined($multifasta_base_directory) && ( -d $multifasta_base_directory ) ) { |
55 | 58 | $self->multifasta_base_directory( abs_path($multifasta_base_directory)); |
94 | 97 | $self->logger->info("Extract core genes from spreadsheet"); |
95 | 98 | my $core_genes_obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( |
96 | 99 | spreadsheet => $self->spreadsheet_filename, |
97 | core_definition => $self->core_definition | |
100 | core_definition => $self->core_definition, | |
101 | allow_paralogs => $self->allow_paralogs | |
98 | 102 | ); |
99 | 103 | |
100 | 104 | $self->logger->info("Looking up genes in files"); |
129 | 133 | -cd FLOAT percentage of isolates a gene must be in to be core [99] |
130 | 134 | -m STR directory containing gene multi-FASTAs [pan_genome_sequences] |
131 | 135 | -s STR gene presence and absence spreadsheet [gene_presence_absence.csv] |
136 | -p allow paralogs | |
132 | 137 | -z dont delete intermediate files |
133 | 138 | -v verbose output to STDOUT |
134 | 139 | -h this help message |
40 | 40 | has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); |
41 | 41 | has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); |
42 | 42 | has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 ); |
43 | has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); | |
43 | 44 | |
44 | 45 | sub BUILD { |
45 | 46 | my ($self) = @_; |
47 | 48 | my ( |
48 | 49 | $output_filename, $dont_create_rplots, $dont_delete_files, $dont_split_groups, $output_pan_geneome_filename, |
49 | 50 | $job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename, $core_definition, |
50 | $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft | |
51 | $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft, $allow_paralogs | |
51 | 52 | ); |
52 | 53 | |
53 | 54 | |
71 | 72 | 'cd|core_definition=f' => \$core_definition, |
72 | 73 | 'v|verbose' => \$verbose, |
73 | 74 | 'n|mafft' => \$mafft, |
75 | 'q|allow_paralogs' => \$allow_paralogs, | |
74 | 76 | 'h|help' => \$help, |
75 | 77 | ); |
76 | 78 | |
92 | 94 | $self->group_limit($group_limit) if ( defined($group_limit) ); |
93 | 95 | $self->core_definition( $core_definition/100 ) if ( defined($core_definition) ); |
94 | 96 | $self->mafft($mafft) if ( defined($mafft) ); |
97 | $self->allow_paralogs($allow_paralogs) if ( defined($allow_paralogs) ); | |
95 | 98 | if ( defined($verbose) ) { |
96 | 99 | $self->verbose($verbose); |
97 | 100 | $self->logger->level(10000); |
157 | 160 | cpus => $self->cpus, |
158 | 161 | verbose => $self->verbose, |
159 | 162 | mafft => $self->mafft, |
163 | allow_paralogs => $self->allow_paralogs, | |
160 | 164 | dont_delete_files => $self->dont_delete_files, |
161 | 165 | num_input_files => $#{$input_files}, |
162 | 166 | ); |
221 | 225 | -n fast core gene alignement with MAFFT instead of PRANK |
222 | 226 | -o STR clusters output filename [clustered_proteins] |
223 | 227 | -p STR output pan genome filename [pan_genome.fa] |
228 | -q allow paralogs in core alignment | |
224 | 229 | -s STR output gene presence and absence filename [gene_presence_absence.csv] |
225 | 230 | -t INT translation table [11] |
226 | 231 | -z INT number of threads [1] |
28 | 28 | has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 ); |
29 | 29 | has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 ); |
30 | 30 | has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 ); |
31 | has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); | |
31 | 32 | has 'num_input_files' => ( is => 'ro', isa => 'Int', required => 1); |
32 | 33 | |
33 | 34 | # Overload Role` |
84 | 85 | my $core_cmd = "pan_genome_core_alignment"; |
85 | 86 | $core_cmd .= " -cd " . ($self->core_definition*100) if ( defined $self->core_definition ); |
86 | 87 | $core_cmd .= " --dont_delete_files " if ( defined $self->dont_delete_files && $self->dont_delete_files == 1 ); |
88 | $core_cmd .= " --allow_paralogs " if ( defined $self->allow_paralogs && $self->allow_paralogs == 1 ); | |
87 | 89 | |
88 | 90 | return $core_cmd; |
89 | 91 | } |
36 | 36 | has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 ); |
37 | 37 | has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); |
38 | 38 | has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 ); |
39 | has 'allow_paralogs' => ( is => 'ro', isa => 'Bool', default => 0 ); | |
39 | 40 | has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } ); |
40 | 41 | has '_gff_fofn' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__gff_fofn' ); |
41 | 42 | has '_fasta_fofn' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__fasta_fofn' ); |
136 | 137 | |
137 | 138 | my $verbose_flag = ''; |
138 | 139 | $verbose_flag = '-v' if ( defined($self->verbose) && $self->verbose == 1 ); |
140 | ||
141 | my $allow_paralogs_flag = ''; | |
142 | $allow_paralogs_flag = '--allow_paralogs' if ( defined($self->allow_paralogs) && $self->allow_paralogs == 1 ); | |
139 | 143 | |
140 | 144 | return join( |
141 | 145 | " ", |
155 | 159 | $verbose_stats_flag, |
156 | 160 | $verbose_flag, |
157 | 161 | $mafft_flag, |
162 | $allow_paralogs_flag, | |
158 | 163 | '-j', $self->job_runner, |
159 | 164 | '--processors', $self->cpus, |
160 | 165 | '--group_limit', $self->group_limit, |
18 | 18 | use Bio::Roary::GroupStatistics; |
19 | 19 | use POSIX; |
20 | 20 | |
21 | has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 ); | |
22 | has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_parser' ); | |
23 | has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' ); | |
24 | has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' ); | |
25 | has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 ); | |
26 | has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} ); | |
27 | has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub {{}} ); | |
21 | has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 ); | |
22 | has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__csv_parser' ); | |
23 | has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' ); | |
24 | has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' ); | |
25 | has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 ); | |
26 | has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); | |
27 | has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); | |
28 | has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); | |
28 | 29 | |
29 | has '_number_of_isolates' => ( is => 'rw', isa => 'Int'); | |
30 | has '_gene_column' => ( is => 'rw', isa => 'Int'); | |
31 | has '_num_isolates_column' => ( is => 'rw', isa => 'Int'); | |
32 | has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int'); | |
33 | has '_genome_fragement_column' => ( is => 'rw', isa => 'Int'); | |
34 | has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int'); | |
35 | has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' ); | |
30 | has '_number_of_isolates' => ( is => 'rw', isa => 'Int' ); | |
31 | has '_gene_column' => ( is => 'rw', isa => 'Int' ); | |
32 | has '_num_isolates_column' => ( is => 'rw', isa => 'Int' ); | |
33 | has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' ); | |
34 | has '_genome_fragement_column' => ( is => 'rw', isa => 'Int' ); | |
35 | has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int' ); | |
36 | has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' ); | |
36 | 37 | |
37 | 38 | sub _build__min_no_isolates_for_core { |
38 | my ($self) = @_; | |
39 | my $threshold = $self->_number_of_isolates * $self->core_definition; | |
39 | my ($self) = @_; | |
40 | my $threshold = $self->_number_of_isolates * $self->core_definition; | |
40 | 41 | |
41 | return $threshold; | |
42 | return $threshold; | |
42 | 43 | } |
43 | 44 | |
44 | sub _build__csv_parser | |
45 | { | |
46 | my ($self) = @_; | |
47 | return Text::CSV->new( { binary => 1, always_quote => 1} ); | |
45 | sub _build__csv_parser { | |
46 | my ($self) = @_; | |
47 | return Text::CSV->new( { binary => 1, always_quote => 1 } ); | |
48 | 48 | } |
49 | 49 | |
50 | 50 | sub _build__input_spreadsheet_fh { |
53 | 53 | return $fh; |
54 | 54 | } |
55 | 55 | |
56 | sub _update_number_of_isolates | |
57 | { | |
58 | my ($self, $header_row) = @_; | |
59 | my $number_of_isolates = @{$header_row} - @{Bio::Roary::GroupStatistics->fixed_headers}; | |
60 | $self->_number_of_isolates($number_of_isolates); | |
56 | sub _update_number_of_isolates { | |
57 | my ( $self, $header_row ) = @_; | |
58 | my $number_of_isolates = @{$header_row} - @{ Bio::Roary::GroupStatistics->fixed_headers }; | |
59 | $self->_number_of_isolates($number_of_isolates); | |
61 | 60 | } |
62 | 61 | |
63 | sub _setup_column_mappings | |
64 | { | |
65 | my ($self, $header_row) = @_; | |
66 | # current ordering | |
67 | my %columns_of_interest_mappings = ( | |
68 | 'Gene' => 0, | |
69 | 'No. isolates' => 3, | |
70 | 'Avg sequences per isolate' => 5, | |
71 | 'Genome Fragment' => 6, | |
72 | 'Order within Fragment' => 7, | |
73 | 'QC' => 10, | |
62 | sub _setup_column_mappings { | |
63 | my ( $self, $header_row ) = @_; | |
64 | ||
65 | # current ordering | |
66 | my %columns_of_interest_mappings = ( | |
67 | 'Gene' => 0, | |
68 | 'No. isolates' => 3, | |
69 | 'Avg sequences per isolate' => 5, | |
70 | 'Genome Fragment' => 6, | |
71 | 'Order within Fragment' => 7, | |
72 | 'QC' => 10, | |
74 | 73 | ); |
75 | ||
76 | # Dynamically overwrite the default ordering | |
77 | for(my $i = 0; $i < @{$header_row}; $i++) | |
78 | { | |
79 | for my $col_name (%columns_of_interest_mappings) | |
80 | { | |
81 | if($header_row->[$i] eq $col_name) | |
82 | { | |
83 | $columns_of_interest_mappings{$col_name} = $i; | |
84 | last; | |
85 | } | |
74 | ||
75 | # Dynamically overwrite the default ordering | |
76 | for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) { | |
77 | for my $col_name (%columns_of_interest_mappings) { | |
78 | if ( $header_row->[$i] eq $col_name ) { | |
79 | $columns_of_interest_mappings{$col_name} = $i; | |
80 | last; | |
81 | } | |
82 | } | |
86 | 83 | } |
87 | } | |
88 | $self->_gene_column($columns_of_interest_mappings{'Gene'}); | |
89 | $self->_num_isolates_column($columns_of_interest_mappings{'No. isolates'}); | |
90 | $self->_avg_sequences_per_isolate_column($columns_of_interest_mappings{'Avg sequences per isolate'}); | |
91 | $self->_genome_fragement_column($columns_of_interest_mappings{'Genome Fragment'}); | |
92 | $self->_order_within_fragement_column($columns_of_interest_mappings{'Order within Fragment'}); | |
93 | $self->_update_number_of_isolates($header_row); | |
94 | ||
95 | # Get the sample_names | |
96 | my @sample_names; | |
97 | for(my $i = $self->_length_of_fixed_headers(); $i < @{$header_row}; $i++) | |
98 | { | |
99 | push(@sample_names,$header_row->[$i]); | |
100 | } | |
101 | $self->sample_names(\@sample_names); | |
84 | $self->_gene_column( $columns_of_interest_mappings{'Gene'} ); | |
85 | $self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} ); | |
86 | $self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} ); | |
87 | $self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} ); | |
88 | $self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} ); | |
89 | $self->_update_number_of_isolates($header_row); | |
90 | ||
91 | # Get the sample_names | |
92 | my @sample_names; | |
93 | for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$header_row} ; $i++ ) { | |
94 | push( @sample_names, $header_row->[$i] ); | |
95 | } | |
96 | $self->sample_names( \@sample_names ); | |
102 | 97 | } |
103 | 98 | |
104 | sub _length_of_fixed_headers | |
105 | { | |
106 | my ($self) = @_; | |
107 | return @{Bio::Roary::GroupStatistics->fixed_headers()}; | |
99 | sub _length_of_fixed_headers { | |
100 | my ($self) = @_; | |
101 | return @{ Bio::Roary::GroupStatistics->fixed_headers() }; | |
108 | 102 | } |
109 | 103 | |
110 | sub _populate_sample_to_gene_lookup_with_row | |
111 | { | |
112 | my ($self, $row) = @_; | |
113 | ||
114 | for(my $i = $self->_length_of_fixed_headers(); $i < @{$row}; $i++ ) | |
115 | { | |
116 | if(defined($row->[$i]) && $row->[$i] ne "" ) | |
117 | { | |
118 | my $sample_name = $self->sample_names->[$i - $self->_length_of_fixed_headers()]; | |
119 | ||
120 | $self->sample_names_to_genes->{$sample_name}->{$row->[$i]} = 1; | |
121 | } | |
122 | } | |
123 | return 1; | |
104 | sub _populate_sample_to_gene_lookup_with_row { | |
105 | my ( $self, $row ) = @_; | |
106 | ||
107 | for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$row} ; $i++ ) { | |
108 | if ( defined( $row->[$i] ) && $row->[$i] ne "" ) { | |
109 | my $sample_name = $self->sample_names->[ $i - $self->_length_of_fixed_headers() ]; | |
110 | ||
111 | $self->sample_names_to_genes->{$sample_name}->{ $row->[$i] } = 1; | |
112 | } | |
113 | } | |
114 | return 1; | |
124 | 115 | } |
125 | 116 | |
117 | sub _ordered_core_genes { | |
118 | my ($self) = @_; | |
119 | my %ordered_genes; | |
120 | while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) { | |
121 | next if ( @{$row} < 12 ); # no genes in group | |
122 | next if ( !defined( $row->[ $self->_gene_column ] ) || $row->[ $self->_gene_column ] eq '' ); # no gene name | |
123 | next | |
124 | if ( !defined( $row->[ $self->_avg_sequences_per_isolate_column ] ) || $row->[ $self->_avg_sequences_per_isolate_column ] eq '' ) | |
125 | ; # no average | |
126 | next | |
127 | if ( !defined( $row->[ $self->_genome_fragement_column ] ) || $row->[ $self->_genome_fragement_column ] eq '' ) | |
128 | ; # fragment not defined | |
126 | 129 | |
127 | sub _ordered_core_genes | |
128 | { | |
129 | my ($self) = @_; | |
130 | my %ordered_genes; | |
131 | while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) | |
132 | { | |
133 | next if(@{$row} < 12); # no genes in group | |
134 | next if(!defined($row->[$self->_gene_column]) || $row->[$self->_gene_column] eq '' ); # no gene name | |
135 | next if(!defined($row->[$self->_avg_sequences_per_isolate_column]) || $row->[$self->_avg_sequences_per_isolate_column] eq '' ); # no average | |
136 | next if(!defined($row->[$self->_genome_fragement_column]) || $row->[$self->_genome_fragement_column] eq '' ); # fragment not defined | |
137 | ||
138 | # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates | |
139 | next if ( $row->[$self->_num_isolates_column] < $self->_min_no_isolates_for_core ); | |
140 | next if($row->[$self->_avg_sequences_per_isolate_column] != 1); | |
141 | $ordered_genes{$row->[$self->_genome_fragement_column]}{$row->[$self->_order_within_fragement_column]} = $row->[$self->_gene_column]; | |
142 | $self->_populate_sample_to_gene_lookup_with_row($row); | |
143 | } | |
144 | ||
145 | my @ordered_core_genes ; | |
146 | for my $fragment_key(sort {$a <=> $b } keys %ordered_genes) | |
147 | { | |
148 | for my $order_within_fragement(sort {$a <=> $b } keys %{$ordered_genes{$fragment_key}}) | |
149 | { | |
150 | push(@ordered_core_genes,$ordered_genes{$fragment_key}{$order_within_fragement}); | |
130 | # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates | |
131 | next if ( $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core ); | |
132 | ||
133 | if ( $self->allow_paralogs ) { | |
134 | # should never happen | |
135 | next if ( $row->[ $self->_avg_sequences_per_isolate_column ] < 1 ); | |
136 | } | |
137 | else { | |
138 | next if ( $row->[ $self->_avg_sequences_per_isolate_column ] != 1 ); | |
139 | } | |
140 | ||
141 | $ordered_genes{ $row->[ $self->_genome_fragement_column ] }{ $row->[ $self->_order_within_fragement_column ] } = | |
142 | $row->[ $self->_gene_column ]; | |
143 | $self->_populate_sample_to_gene_lookup_with_row($row); | |
151 | 144 | } |
152 | } | |
153 | return \@ordered_core_genes; | |
145 | ||
146 | my @ordered_core_genes; | |
147 | for my $fragment_key ( sort { $a <=> $b } keys %ordered_genes ) { | |
148 | for my $order_within_fragement ( sort { $a <=> $b } keys %{ $ordered_genes{$fragment_key} } ) { | |
149 | push( @ordered_core_genes, $ordered_genes{$fragment_key}{$order_within_fragement} ); | |
150 | } | |
151 | } | |
152 | return \@ordered_core_genes; | |
154 | 153 | } |
155 | 154 | |
156 | sub _build_ordered_core_genes | |
157 | { | |
158 | my ($self) = @_; | |
159 | my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); | |
160 | $self->_setup_column_mappings($header_row); | |
155 | sub _build_ordered_core_genes { | |
156 | my ($self) = @_; | |
157 | my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); | |
158 | $self->_setup_column_mappings($header_row); | |
161 | 159 | |
162 | return $self->_ordered_core_genes(); | |
160 | return $self->_ordered_core_genes(); | |
163 | 161 | } |
164 | ||
165 | 162 | |
166 | 163 | no Moose; |
167 | 164 | __PACKAGE__->meta->make_immutable; |
71 | 71 | my ( $self, $sample_name, $gene_file ) = @_; |
72 | 72 | |
73 | 73 | # loop over this to get the geneIDs |
74 | for my $gene_id ( keys %{ $self->_gene_to_sequence->{$gene_file} } ) { | |
74 | for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) { | |
75 | 75 | if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) { |
76 | 76 | return $self->_gene_to_sequence->{$gene_file}->{$gene_id}; |
77 | 77 | } |
47 | 47 | has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); |
48 | 48 | has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 ); |
49 | 49 | has 'inflation_value' => ( is => 'rw', isa => 'Num', default => 1.5 ); |
50 | has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); | |
50 | 51 | |
51 | 52 | has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); |
52 | 53 | |
135 | 136 | core_definition => $self->core_definition, |
136 | 137 | verbose => $self->verbose, |
137 | 138 | mafft => $self->mafft, |
139 | allow_paralogs => $self->allow_paralogs, | |
138 | 140 | ); |
139 | 141 | $post_analysis->run(); |
140 | 142 |
12 | 12 | |
13 | 13 | my $obj; |
14 | 14 | |
15 | ok($obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( | |
16 | spreadsheet => 't/data/core_group_statistics.csv', | |
17 | ),'initalise obj'); | |
18 | is_deeply($obj->ordered_core_genes, ['argF','speH','group_5'], 'Correct ordering'); | |
19 | is_deeply($obj->sample_names_to_genes, { | |
20 | 'query_2' => { | |
21 | '2_3' => 1, | |
22 | '2_7' => 1, | |
23 | '2_2' => 1 | |
24 | }, | |
25 | 'query_1' => { | |
26 | '1_6' => 1, | |
27 | '1_3' => 1, | |
28 | '1_2' => 1 | |
29 | } | |
30 | }, 'Correct of sample names to genes is correct'); | |
15 | ok( | |
16 | $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( | |
17 | spreadsheet => 't/data/core_group_statistics.csv', | |
18 | ), | |
19 | 'initalise obj' | |
20 | ); | |
21 | is_deeply( $obj->ordered_core_genes, [ 'argF', 'speH', 'group_5' ], 'Correct ordering' ); | |
22 | is_deeply( | |
23 | $obj->sample_names_to_genes, | |
24 | { | |
25 | 'query_2' => { | |
26 | '2_3' => 1, | |
27 | '2_7' => 1, | |
28 | '2_2' => 1 | |
29 | }, | |
30 | 'query_1' => { | |
31 | '1_6' => 1, | |
32 | '1_3' => 1, | |
33 | '1_2' => 1 | |
34 | } | |
35 | }, | |
36 | 'Correct of sample names to genes is correct' | |
37 | ); | |
38 | ||
39 | ok( | |
40 | $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( | |
41 | spreadsheet => 't/data/core_group_statistics.csv', | |
42 | allow_paralogs => 1, | |
43 | ), | |
44 | 'initalise obj where paralogs allowed' | |
45 | ); | |
46 | is_deeply( $obj->ordered_core_genes, [ 'argF', 'hly', 'speH', 'group_5' ], 'Correct ordering where paralogs allowed' ); | |
47 | ||
48 | is_deeply( | |
49 | $obj->sample_names_to_genes, | |
50 | { | |
51 | 'query_2' => { | |
52 | '2_3' => 1, | |
53 | '2_7' => 1, | |
54 | '2_1' => 1, | |
55 | '2_2' => 1 | |
56 | }, | |
57 | 'query_1' => { | |
58 | '1_6' => 1, | |
59 | '1_3' => 1, | |
60 | '1_1' => 1, | |
61 | '1_2' => 1 | |
62 | } | |
63 | }, | |
64 | 'Correct of sample names to genes is correct where paralogs allowed' | |
65 | ); | |
31 | 66 | |
32 | 67 | done_testing(); |