From 4603dc5b81af06b7cbffb6ee108b9343cb81350d Mon Sep 17 00:00:00 2001 From: Patrick Meidl <pm2@sanger.ac.uk> Date: Fri, 29 Jun 2007 14:35:43 +0000 Subject: [PATCH] latest state of the code (bugfixes and adaptations due to changes in ConfParser and Logger) --- modules/Bio/EnsEMBL/IdMapping/Cache.pm | 66 +++++---- .../Bio/EnsEMBL/IdMapping/ExonScoreBuilder.pm | 140 +++++++++--------- .../Bio/EnsEMBL/IdMapping/GeneScoreBuilder.pm | 60 ++++---- modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm | 8 +- .../EnsEMBL/IdMapping/ScoredMappingMatrix.pm | 21 ++- .../IdMapping/TranscriptScoreBuilder.pm | 53 +++---- 6 files changed, 188 insertions(+), 160 deletions(-) diff --git a/modules/Bio/EnsEMBL/IdMapping/Cache.pm b/modules/Bio/EnsEMBL/IdMapping/Cache.pm index d4e8234161..2162d943a0 100644 --- a/modules/Bio/EnsEMBL/IdMapping/Cache.pm +++ b/modules/Bio/EnsEMBL/IdMapping/Cache.pm @@ -47,6 +47,10 @@ use Storable qw(nfreeze thaw nstore retrieve); # define available cache names here my @cache_names = qw( exons_by_id + transcripts_by_id + transcripts_by_exon_id + genes_by_id + genes_by_transcript_id ); @@ -101,12 +105,17 @@ sub build_cache { my $i = scalar(@$genes); # find common coord_system - $self->find_common_coord_systems; + my $common_cs_found = $self->find_common_coord_systems; - # find out whether native and common coord_system are identical + # find out whether native coord_system is a common coord_system. + # if so, you don't need to project. + # also don't project if no common coord_system present my $need_project = 1; - my $csid = join(':', $slice->coord_system_name, $slice->coord_system->version); - $need_project = 0 if ($self->is_common_cs($csid)); + my $csid = join(':', $slice->coord_system_name, + $slice->coord_system->version); + if ($self->is_common_cs($csid) or !$self->highest_common_cs) { + $need_project = 0; + } # build cache my $type = "$dbtype.$slice_name"; @@ -163,8 +172,8 @@ sub build_cache_from_genes { ]); # build gene caches - #$self->add($type, 'genes_by_id', $gene->dbID, $lgene); - #$self->add($type, 'genes_by_stable_id', $gene->stable_id, $lgene); + $self->add('genes_by_id', $type, $gene->dbID, $lgene); + #$self->add('genes_by_stable_id', $type, $gene->stable_id, $lgene); # transcripts foreach my $tr (@{ $gene->get_all_Transcripts }) { @@ -180,9 +189,9 @@ sub build_cache_from_genes { #$lgene->add_Transcript($ltr); # build transcript caches - $self->add($type, 'transcripts_by_id', $tr->dbID, $ltr); - #$self->add($type, 'transcripts_by_stable_id', $tr->stable_id, $ltr); - $self->add($type, 'genes_by_transcript_id', $tr->dbID, $lgene); + $self->add('transcripts_by_id', $type, $tr->dbID, $ltr); + #$self->add('transcripts_by_stable_id', $type, $tr->stable_id, $ltr); + $self->add('genes_by_transcript_id', $type, $tr->dbID, $lgene); # translation (if there is one) if (my $tl = $tr->translation) { @@ -193,9 +202,9 @@ sub build_cache_from_genes { #$ltr->add_Translation($ltl); - #$self->add($type, 'translations_by_id', $tl->dbID, $ltl); - #$self->add($type, 'translations_by_stable_id', $tl->stable_id, $ltl); - #$self->add($type, 'translations_by_transcript_id', $tr->dbID, $ltl); + #$self->add('translations_by_id', $type, $tl->dbID, $ltl); + #$self->add('translations_by_stable_id', $type, $tl->stable_id, $ltl); + #$self->add('translations_by_transcript_id', $type, $tr->dbID, $ltl); undef $tl; } @@ -233,8 +242,8 @@ sub build_cache_from_genes { $ltr->add_Exon($lexon); $self->add('exons_by_id', $type, $exon->dbID, $lexon); - #$self->add($type, 'genes_by_exon_id', $exon->dbID, $lgene); - $self->add_list($type, 'transcripts_by_exon_id', $exon->dbID, $ltr); + #$self->add('genes_by_exon_id', $type, $exon->dbID, $lgene); + $self->add_list('transcripts_by_exon_id', $type, $exon->dbID, $ltr); undef $exon; } @@ -295,7 +304,7 @@ sub get_by_key { # transparently load cache from file unless already loaded unless ($self->{'instance'}->{'loaded'}->{"$name:$type"}) { - $self->load_and_merge($name, $type); + $self->read_and_merge($name, $type); } return $self->{'cache'}->{$name}->{$type}->{$key}; @@ -311,7 +320,7 @@ sub get_by_name { # transparently load cache from file unless already loaded unless ($self->{'instance'}->{'loaded'}->{"$name:$type"}) { - $self->load_and_merge($name, $type); + $self->read_and_merge($name, $type); } return $self->{'cache'}->{$name}->{$type} || {}; @@ -328,7 +337,7 @@ sub get_count_by_name { # transparently load cache from file unless already loaded unless ($self->{'instance'}->{'loaded'}->{"$name:$type"}) { - $self->load_and_merge($name, $type); + $self->read_and_merge($name, $type); } return scalar(keys %{ $self->get_by_name($name, $type) }); @@ -382,6 +391,7 @@ sub find_common_coord_systems { } } + return $found_highest; } @@ -427,7 +437,7 @@ sub seq_regions_compatible { if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) { return(1); } else { - $self->logger->log("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n"); + $self->logger->info("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n"); return(0); } @@ -491,10 +501,10 @@ sub cache_file_exists { my $cache_file = $self->cache_file($name, $type); if (-s $cache_file) { - $self->logger->log("Cache file found. Will read from $cache_file.\n", 3); + $self->logger->info("Cache file found. Will read from $cache_file.\n", 3); return 1; } else { - $self->logger->log("No cache file found. Will build cache from db.\n", 3); + $self->logger->info("No cache file found for $name/$type. Will build cache from db.\n", 3); return 0; } } @@ -573,7 +583,7 @@ sub write_to_file { throw("You must provide a cache type.") unless $type; unless ($self->{'cache'}->{$name}->{$type}) { - $self->logger->log_warning("No features found in $name/$type. Won't write cache file.\n"); + $self->logger->warning("No features found in $name/$type. Won't write cache file.\n"); return; } @@ -639,13 +649,13 @@ sub read_from_file { throw("No valid cache file found at $cache_file."); } - #$self->logger->log_stamped("Reading cache from file...\n"); - #$self->logger->log("Cache file $cache_file.\n", 1); + #$self->logger->info("Reading cache from file...\n", 0, 'stamped'); + #$self->logger->info("Cache file $cache_file.\n", 1); eval { $self->{'cache'}->{$name}->{$type} = retrieve($cache_file); }; if ($@) { throw("Unable to retrieve cache: $@"); } - #$self->logger->log_stamped("Done.\n"); + #$self->logger->info("Done.\n", 0, 'stamped'); return $self->{'cache'}->{$name}->{$type}; } @@ -662,7 +672,7 @@ sub merge { foreach my $key (keys %{ $self->{'cache'}->{$name}->{$type} || {} }) { if (defined $self->{'cache'}->{$name}->{$merged_type}->{$key}) { - warning("Duplicate key in cache: $name|$merged_type|$key. Skipping.\n"); + # warning("Duplicate key in cache: $name|$merged_type|$key. Skipping.\n"); } else { $self->{'cache'}->{$name}->{$merged_type}->{$key} = $self->{'cache'}->{$name}->{$type}->{$key}; @@ -715,7 +725,11 @@ sub slice_names { } elsif ($self->conf->param('region')) { # filter by region (specific slice) - my $slice = $sa->fetch_by_name($self->conf->param('region')); + # don't use SliceAdaptor->fetch_by_name() since this will fail if assembly + # versions are different for source and target db + my ($cs, $version, $name, $start, $end, $strand) = + split(/:/, $self->conf->param('region')); + my $slice = $sa->fetch_by_region($cs, $name, $start, $end); push @slice_names, $slice->name; } else { diff --git a/modules/Bio/EnsEMBL/IdMapping/ExonScoreBuilder.pm b/modules/Bio/EnsEMBL/IdMapping/ExonScoreBuilder.pm index 1e3e415167..8eee9a73bb 100644 --- a/modules/Bio/EnsEMBL/IdMapping/ExonScoreBuilder.pm +++ b/modules/Bio/EnsEMBL/IdMapping/ExonScoreBuilder.pm @@ -52,28 +52,28 @@ use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; sub score_exons { my $self = shift; - $self->logger->log_stamped("Starting exon scoring...\n\n"); + $self->logger->info("Starting exon scoring...\n\n", 0, 'stamped'); # score using overlaps, then exonerate my $matrix = $self->overlap_score; my $exonerate_matrix = $self->exonerate_score($matrix); # log stats before matrix merging - $self->logger->log("\nOverlap scoring matrix:\n"); + $self->logger->info("\nOverlap scoring matrix:\n"); $self->log_matrix_stats($matrix); - $self->logger->log("\nExonerate scoring matrix:\n"); + $self->logger->info("\nExonerate scoring matrix:\n"); $self->log_matrix_stats($exonerate_matrix); # merge matrices - $self->logger->log_stamped("\nMerging scoring matrices...\n"); + $self->logger->info("\nMerging scoring matrices...\n", 0, 'stamped'); $matrix->merge($exonerate_matrix); - $self->logger->log_stamped("Done.\n\n"); + $self->logger->info("Done.\n\n", 0, 'stamped'); # log stats of combined matrix - $self->logger->log("Combined scoring matrix:\n"); + $self->logger->info("Combined scoring matrix:\n"); $self->log_matrix_stats($matrix); - $self->logger->log_stamped("\nDone with exon scoring.\n\n"); + $self->logger->info("\nDone with exon scoring.\n\n", 0, 'stamped'); return $matrix; } @@ -95,20 +95,20 @@ sub overlap_score { if (-s $overlap_cache) { # read from file - $self->logger->log_stamped("Reading exon overlap scoring matrix from file...\n"); - $self->logger->log("Cache file $overlap_cache.\n", 1); + $self->logger->info("Reading exon overlap scoring matrix from file...\n", 0, 'stamped'); + $self->logger->info("Cache file $overlap_cache.\n", 1); $matrix->read_from_file; - $self->logger->log_stamped("Done.\n"); + $self->logger->info("Done.\n", 0, 'stamped'); } else { # build scoring matrix - $self->logger->log("No exon overlap scoring matrix found. Will build new one.\n"); + $self->logger->info("No exon overlap scoring matrix found. Will build new one.\n"); if ($self->cache->highest_common_cs) { - $self->logger->log_stamped("Overlap scoring...\n"); + $self->logger->info("Overlap scoring...\n", 0, 'stamped'); $matrix = $self->build_overlap_scores($matrix); - $self->logger->log_stamped("Done.\n"); + $self->logger->info("Done.\n", 0, 'stamped'); } # write scoring matrix to file @@ -142,15 +142,15 @@ sub exonerate_score { if (-s $exonerate_cache) { # read from file - $self->logger->log_stamped("Reading exonerate matrix from file...\n"); - $self->logger->log("Cache file $exonerate_cache.\n", 1); + $self->logger->info("Reading exonerate matrix from file...\n", 0, 'stamped'); + $self->logger->info("Cache file $exonerate_cache.\n", 1); $exonerate_matrix->read_from_file; - $self->logger->log_stamped("Done.\n"); + $self->logger->info("Done.\n", 0, 'stamped'); } else { # build scoring matrix - $self->logger->log("No exonerate matrix found. Will build new one.\n"); + $self->logger->info("No exonerate matrix found. Will build new one.\n"); # dump exons to fasta files my $dump_count = $self->dump_filtered_exons($matrix); @@ -164,7 +164,7 @@ sub exonerate_score { } else { - $self->logger->log("No source and/or target exons dumped, so don't need to run exonerate.\n"); + $self->logger->info("No source and/or target exons dumped, so don't need to run exonerate.\n"); } @@ -193,7 +193,7 @@ sub build_overlap_scores { } # get sorted list of exon containers - $self->logger->log_stamped("Reading sorted exons from cache...\n", 1); + $self->logger->info("Reading sorted exons from cache...\n", 1, 'stamped'); my @source_exons = $self->sort_exons( [values %{ $self->cache->get_by_name('exons_by_id', 'source') }] @@ -202,7 +202,7 @@ sub build_overlap_scores { [values %{ $self->cache->get_by_name('exons_by_id', 'target') }] ); - $self->logger->log_stamped("Done.\n", 1); + $self->logger->info("Done.\n", 1, 'stamped'); # get first source and target exon container my $source_ec = shift(@source_exons); @@ -211,7 +211,7 @@ sub build_overlap_scores { my %source_overlap = (); my %target_overlap = (); - $self->logger->log_stamped("Scoring...\n", 1); + $self->logger->info("Scoring...\n", 1, 'stamped'); while ($source_ec or $target_ec) { @@ -243,7 +243,8 @@ sub build_overlap_scores { next if (defined($matrix->get_score( $source_ec->[0]->id, $target_exon->id))); - $self->overlap_score($source_ec->[0], $target_exon, $matrix); + $self->calc_overlap_score($source_ec->[0], $target_exon, + $matrix); } } @@ -265,7 +266,7 @@ sub build_overlap_scores { next if (defined($matrix->get_score( $source_exon->id, $target_ec->[0]->id))); - $self->overlap_score($source_exon, $target_ec->[0], $matrix); + $self->calc_overlap_score($source_exon, $target_ec->[0], $matrix); } } @@ -274,7 +275,7 @@ sub build_overlap_scores { } } - $self->logger->log_stamped("Done.\n", 1); + $self->logger->info("Done.\n", 1, 'stamped'); return $matrix; } @@ -312,7 +313,7 @@ sub compare_exon_containers { # region by exons sizes. 1.0 is full overlap on both exons. Score of at least # 0,5 are added to the exon scoring matrix. # -sub overlap_score { +sub calc_overlap_score { my $self = shift; my $source_exon = shift; my $target_exon = shift; @@ -361,8 +362,8 @@ sub run_exonerate { my $source_file = $self->exon_fasta_file('source'); my $target_file = $self->exon_fasta_file('target'); - my $source_size = -s $source_size; - my $target_size = -s $target_size; + my $source_size = -s $source_file; + my $target_size = -s $target_file; # check if fasta files exist and are not empty unless ($source_size and $target_size) { @@ -373,21 +374,21 @@ sub run_exonerate { my $logpath = ($self->conf->param('logpath')||$self->cache->dump_path). '/lsf_exonerate'; system("rm -rf $logpath") == 0 or - $self->logger->log_error("Unable to delete lsf log dir $logpath: $!\n"); + $self->logger->error("Unable to delete lsf log dir $logpath: $!\n"); system("mkdir -p $logpath") == 0 or - $self->logger->log_error("Can't create lsf log dir $logpath: $!\n"); + $self->logger->error("Can't create lsf log dir $logpath: $!\n"); # delete exonerate output from previous runs my $dumppath = $self->cache->dump_path; opendir(DUMPDIR, $dumppath) or - $self->logger->log_error("Can't open $dumppath for reading: $!"); + $self->logger->error("Can't open $dumppath for reading: $!"); while (defined(my $file = readdir(DUMPDIR))) { next unless /exonerate_map\.\d+/; unlink("$dumppath/$file") or - $self->logger->log_error("Can't delete $dumppath/$file: $!"); + $self->logger->error("Can't delete $dumppath/$file: $!"); } closedir(DUMPDIR); @@ -404,29 +405,37 @@ sub run_exonerate { # # run exonerate jobs using lsf # - my $exonerate_job = "bsub -J $lsf_name[1-$num_jobs] " . - " -o $logpath/exonerate.%I.out -e $logpath/exonerate.%I.err" . - " -q normal -m bc_hosts " . - " $exonerate_path $source_file $target_file" . - " --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs" . - " --model affine:local -M 900 --showalignment FALSE --subopt no" . - " --percent $percent --ryo \"myinfo: \%qi \%ti \%et \%ql \%tl\\n\"" . - " | grep '^myinfo:' > $dumppath/exonerate_map.\$LSB_JOBINDEX"; - - $self->logger->log("Submitting $num_jobs exonerate jobs to lsf:\n\n"); - $self->logger->log("$exonerate_job\n\n"); + local *BSUB; + open BSUB, "|bsub -J$lsf_name\[1-$num_jobs\] -o $logpath/exonerate.\%I.out" + or $self->logger->error("Could not open open pipe to bsub: $!\n"); + + my $exonerate_job = qq{$exonerate_path } . + qq{--query $source_file --target $target_file } . + q{--querychunkid $LSB_JOBINDEX } . + qq{--querychunktotal $num_jobs } . + q{--model affine:local -M 900 --showalignment FALSE --subopt no } . + qq{--percent $percent } . + q{--ryo 'myinfo: %qi %ti %et %ql %tl\n' } . + qq{| grep '^myinfo:' > $dumppath/exonerate_map.\$LSB_JOBINDEX} . "\n"; + + $self->logger->info("Submitting $num_jobs exonerate jobs to lsf:\n\n"); + $self->logger->info("$exonerate_job\n\n"); - system("$exonerate_job") == 0 - or $self->logger->log_error("Error submitting exonerate jobs: $!"); + print BSUB $exonerate_job; + $self->logger->error("Error submitting exonerate jobs: $!\n") + unless ($? == 0); + close BSUB; # submit depended job to monitor finishing of exonerate jobs - $self->logger->log_stamped("Waiting for exonerate jobs to finish...\n"); + $self->logger->info("Waiting for exonerate jobs to finish...\n", 0, 'stamped'); - my $depended_job = "bsub -K -w ended($lsf_name) -q small " . - " -o $logpath/exonerate_depend.out -e $logpath/exonerate_depend.err" . - " /bin/true"; + my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } . + qq{-o $logpath/exonerate_depend.out /bin/true}; - $self->logger->log_stamepd("All exonerate jobs finished.\n"); + system($dependent_job) == 0 or + $self->logger->error("Error submitting dependent job: $!\n"); + + $self->logger->info("All exonerate jobs finished.\n", 0, 'stamped'); # # check results @@ -446,18 +455,18 @@ sub run_exonerate { } if (@missing) { - $self->logger->log("Couldn't find all exonerate output files. These are missing:\n"); + $self->logger->info("Couldn't find all exonerate output files. These are missing:\n"); foreach (@missing) { - $self->logger->log("$_\n", 1); + $self->logger->info("$_\n", 1); } exit(1); } if (@error) { - $self->logger->log("One or more exonerate jobs failed. Check these error files:\n"); + $self->logger->info("One or more exonerate jobs failed. Check these error files:\n"); foreach (@error) { - $self->logger->log("$_\n", 1); + $self->logger->info("$_\n", 1); } exit(1); @@ -506,7 +515,7 @@ sub write_filtered_exons { throw('You must provide a ScoredMappingMatrix.'); } - $self->logger->log_stamped("\nDumping $type exons to fasta file...\n"); + $self->logger->info("\nDumping $type exons to fasta file...\n", 0, 'stamped'); # don't dump exons shorter than this my $min_exon_length = $self->conf->param('min_exon_length') || 15; @@ -552,11 +561,11 @@ sub write_filtered_exons { # log my $fmt = "%-30s%10s\n"; - my $size = -e $file; - $self->logger->log(sprintf($fmt, 'Total exons:', $total_exons), 1); - $self->logger->log(sprintf($fmt, 'Dumped exons:', $dumped_exons), 1); - $self->logger->log(sprintf($fmt, 'Dump file size:', parse_bytes($size)), 1); - $self->logger->log_stamped("Done.\n\n"); + my $size = -s $file; + $self->logger->info(sprintf($fmt, 'Total exons:', $total_exons), 1); + $self->logger->info(sprintf($fmt, 'Dumped exons:', $dumped_exons), 1); + $self->logger->info(sprintf($fmt, 'Dump file size:', parse_bytes($size)), 1); + $self->logger->info("Done.\n\n", 0, 'stamped'); return $dumped_exons; } @@ -571,21 +580,20 @@ sub parse_exonerate_results { throw('You must provide a ScoredMappingMatrix.'); } - $self->logger->log_stamped("Parsing exonerate results...\n"); + $self->logger->info("Parsing exonerate results...\n", 0, 'stamped'); # loop over all result files my $dumppath = $self->cache->dump_path; my $num_files = 0; + my $num_lines = 0; opendir(DUMPDIR, $dumppath) or - $self->logger->log_error("Can't open $dumppath for reading: $!"); + $self->logger->error("Can't open $dumppath for reading: $!"); while (defined(my $file = readdir(DUMPDIR))) { - next unless /exonerate_map\.\d+/; + next unless $file =~ /exonerate_map\.\d+/; - # counters $num_files++; - my $num_lines = 0; open(F, '<', "$dumppath/$file"); @@ -601,7 +609,7 @@ sub parse_exonerate_results { my $score = 0; if ($source_length == 0 or $target_length == 0) { - $self->logger->log_warning("Alignment length is 0 for $source_id/$target_id.\n"); + $self->logger->warning("Alignment length is 0 for $source_id/$target_id.\n"); } else { $score = 2 * $match_length / ($source_length + $target_length); } @@ -615,7 +623,7 @@ sub parse_exonerate_results { closedir(DUMPDIR); - $self->logger->log_stamped("Done parsing $num_lines lines from $num_files result files.\n"); + $self->logger->info("Done parsing $num_lines lines from $num_files result files.\n", 0, 'stamped'); return $exonerate_matrix; } diff --git a/modules/Bio/EnsEMBL/IdMapping/GeneScoreBuilder.pm b/modules/Bio/EnsEMBL/IdMapping/GeneScoreBuilder.pm index dec16faba6..f06213822d 100644 --- a/modules/Bio/EnsEMBL/IdMapping/GeneScoreBuilder.pm +++ b/modules/Bio/EnsEMBL/IdMapping/GeneScoreBuilder.pm @@ -52,7 +52,7 @@ sub score_genes { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } - $self->logger->log_stamped("Starting gene scoring...\n\n"); + $self->logger->info("Starting gene scoring...\n\n", 0, 'stamped'); # build scores based on transcript scores my $matrix = $self->scores_from_transcript_scores($transcript_matrix); @@ -60,23 +60,23 @@ sub score_genes { # log stats of combined matrix my $fmt = "%-40s%10.0f\n"; - $self->logger->log("Scoring matrix:\n"); + $self->logger->info("Scoring matrix:\n"); - $self->logger->log(sprintf($fmt, "Total source genes:", - $self->cache->get_count_by_name('genes_by_id', 'source'), 1); + $self->logger->info(sprintf($fmt, "Total source genes:", + $self->cache->get_count_by_name('genes_by_id', 'source')), 1); - $self->logger->log(sprintf($fmt, "Scored source genes:", + $self->logger->info(sprintf($fmt, "Scored source genes:", $matrix->get_source_count), 1); - $self->logger->log(sprintf($fmt, "Total target genes:", - $self->cache->get_count_by_name('genes_by_id', 'target'), 1); + $self->logger->info(sprintf($fmt, "Total target genes:", + $self->cache->get_count_by_name('genes_by_id', 'target')), 1); - $self->logger->log(sprintf($fmt, "Scored target genes:", + $self->logger->info(sprintf($fmt, "Scored target genes:", $matrix->get_target_count), 1); $self->log_matrix_stats($matrix); - $self->logger->log("\nDone with transcript scoring.\n\n"); + $self->logger->info("\nDone with transcript scoring.\n\n"); return $matrix; } @@ -101,19 +101,19 @@ sub scores_from_transcript_scores { if (-s $gene_cache) { # read from file - $self->logger->log_stamped("Reading gene scoring matrix from file...\n"); - $self->logger->log("Cache file $gene_cache.\n", 1); + $self->logger->info("Reading gene scoring matrix from file...\n", 0, 'stamped'); + $self->logger->info("Cache file $gene_cache.\n", 1); $matrix->read_from_file; - $self->logger->log_stamped("Done.\n"); + $self->logger->info("Done.\n", 0, 'stamped'); } else { # build scoring matrix - $self->logger->log("No gene scoring matrix found. Will build new one.\n"); + $self->logger->info("No gene scoring matrix found. Will build new one.\n"); - $self->logger->log_stamped("Transcript scoring...\n"); + $self->logger->info("Gene scoring...\n", 0, 'stamped'); $matrix = $self->build_scores($matrix, $transcript_matrix); - $self->logger->log_stamped("Done.\n"); + $self->logger->info("Done.\n", 0, 'stamped'); # write scoring matrix to file $matrix->write_to_file; @@ -144,7 +144,7 @@ sub build_scores { $self->flag_matrix_from_transcript_scores($matrix, $transcript_matrix); # now calculate the actual scores for the genes in the flag matrix - $final_matrix = $self->score_matrix_from_flag_matrix($matrix, + my $final_matrix = $self->score_matrix_from_flag_matrix($matrix, $transcript_matrix); return $final_matrix; @@ -161,7 +161,7 @@ sub flag_matrix_from_transcript_scores { my $num_genes = scalar(keys %{ $self->cache->get_by_name('genes_by_id', 'source') }); - $self->logger->log("Creating flag matrix...\n", 1); + $self->logger->info("Creating flag matrix...\n", 1); # for every transcript scoring matrix entry, make an entry in the gene flag # matrix. @@ -178,7 +178,7 @@ sub flag_matrix_from_transcript_scores { $matrix->add_score($source_gene->id, $target_gene->id, 1); } - $self->logger->log("\n\n"); + $self->logger->info("\n\n"); return $matrix; } @@ -201,7 +201,7 @@ sub score_matrix_from_flag_matrix { my $num_genes = scalar(keys %{ $self->cache->get_by_name('genes_by_id', 'source') }); - $self->logger->log("Creating score matrix from flag matrix...\n", 1); + $self->logger->info("Creating score matrix from flag matrix...\n", 1); # loop over flag matrix and do proper scoring for each entry foreach my $entry (@{ $flag_matrix->get_all_Entries }) { @@ -230,7 +230,7 @@ sub score_matrix_from_flag_matrix { $matrix->add($entry->source, $entry->target, $score); } - $self->logger->log("\n\n"); + $self->logger->info("\n\n"); return $matrix; } @@ -254,7 +254,7 @@ sub complex_gene_gene_score { @{ $target_gene->get_all_Transcripts }; # loop over source transcripts - foreach my $source_transcript (@{ $source_gene->get_all_Transcripts ) { + foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) { # now loop over target transcripts and find the highest scoring target # transcript belonging to the target gene @@ -262,7 +262,7 @@ sub complex_gene_gene_score { foreach my $target_transcript_id (@{ $transcript_matrix->get_targets_for_source($source_transcript->id) }) { - next unless (%target_transcripts{$target_transcript_id}); + next unless ($target_transcripts{$target_transcript_id}); my $score = $transcript_matrix->get_score( $source_transcript->id, $target_transcript_id); @@ -281,7 +281,7 @@ sub complex_gene_gene_score { @{ $source_gene->get_all_Transcripts }; # loop over target transcripts - foreach my $target_transcript (@{ $target_gene->get_all_Transcripts ) { + foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) { # now loop over source transcripts and find the highest scoring source # transcript belonging to the source gene @@ -289,7 +289,7 @@ sub complex_gene_gene_score { foreach my $source_transcript_id (@{ $transcript_matrix->get_sources_for_target($target_transcript->id) }) { - next unless (%source_transcripts{$source_transcript_id}); + next unless ($source_transcripts{$source_transcript_id}); my $score = $transcript_matrix->get_score( $source_transcript_id, $target_transcript->id); @@ -306,14 +306,14 @@ sub complex_gene_gene_score { # calculate overall score for this gene my $gene_score = 0; - if (($source_gene_length + $target_gene_length) > 0) { + if (($source_gene->length + $target_gene->length) > 0) { $gene_score = ($source_gene_score + $target_gene_score) / - ($source_gene_length + $target_gene_length); + ($source_gene->length + $target_gene->length); } else { - $self->logger->log_warning("Combined length of source (".$source_gene->id.") and target (".$target_gene->id.") gene is zero!\n", 1); + $self->logger->warning("Combined length of source (".$source_gene->id.") and target (".$target_gene->id.") gene is zero!\n", 1); } @@ -326,7 +326,7 @@ sub complex_gene_gene_score { # This is used when the more elaborate gene representing score does not # distinguish very well. # -sub complex_gene_gene_score { +sub simple_gene_gene_score { my $self = shift; my $source_gene = shift; my $target_gene = shift; @@ -334,8 +334,8 @@ sub complex_gene_gene_score { my $gene_score = 0; - foreach my $source_transcript (@{ $source_gene->get_all_Transcripts ) { - foreach my $target_transcript (@{ $target_gene->get_all_Transcripts ) { + foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) { + foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) { my $score = $transcript_matrix->get_score($source_transcript->id, $target_transcript->id); diff --git a/modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm b/modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm index 15aaf86661..4f6403e68e 100644 --- a/modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm +++ b/modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm @@ -111,15 +111,15 @@ sub log_matrix_stats { my $fmt1 = "%-40s%10.0f\n"; my $fmt2 = "%-40s%10.2f\n"; - $self->logger->log(sprintf($fmt1, "Scoring matrix entries:", + $self->logger->info(sprintf($fmt1, "Scoring matrix entries:", $matrix->get_entry_count), 1); - $self->logger->log(sprintf($fmt2, "Average score:", + $self->logger->info(sprintf($fmt2, "Average score:", $matrix->get_average_score), 1); my ($min, $max) = @{ $matrix->get_min_max_scores }; - $self->logger->log(sprintf($fmt2, "Min. score:", $min), 1); - $self->logger->log(sprintf($fmt2, "Max. score:", $max), 1); + $self->logger->info(sprintf($fmt2, "Min. score:", $min), 1); + $self->logger->info(sprintf($fmt2, "Max. score:", $max), 1); } diff --git a/modules/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm b/modules/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm index e06371e7aa..4ff78b6ad3 100644 --- a/modules/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm +++ b/modules/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm @@ -156,27 +156,32 @@ sub get_all_Entries { sub get_all_sources { - return [keys %{ $_->{'source_list'} }]; + my $self = shift; + return [keys %{ $self->{'cache'}->{'source_list'} }]; } sub get_all_targets { - return [keys %{ $_->{'target_list'} }]; + my $self = shift; + return [keys %{ $self->{'cache'}->{'target_list'} }]; } sub get_entry_count { - return scalar(keys %{ $_->{'matrix'} }); + my $self = shift; + return scalar(keys %{ $self->{'cache'}->{'matrix'} }); } sub get_source_count { - return scalar(keys %{ $_->{'source_list'} }); + my $self = shift; + return scalar(keys %{ $self->{'cache'}->{'source_list'} }); } sub get_target_count { - return scalar(keys %{ $_->{'target_list'} }); + my $self = shift; + return scalar(keys %{ $self->{'cache'}->{'target_list'} }); } @@ -228,10 +233,10 @@ sub merge { my $c = 0; - foreach my $key (keys %{ $matrix->{'matrix'} }) { + foreach my $key (keys %{ $matrix->{'cache'}->{'matrix'} }) { if (!defined($self->{'cache'}->{'matrix'}->{$key}) or - $self->{'cache'}->{'matrix'}->{$key} < $matrix->{'matrix'}->{$key}) { - $self->{'cache'}->{'matrix'}->{$key} = $matrix->{'matrix'}->{$key}; + $self->{'cache'}->{'matrix'}->{$key} < $matrix->{'cache'}->{'matrix'}->{$key}) { + $self->{'cache'}->{'matrix'}->{$key} = $matrix->{'cache'}->{'matrix'}->{$key}; $c++; } } diff --git a/modules/Bio/EnsEMBL/IdMapping/TranscriptScoreBuilder.pm b/modules/Bio/EnsEMBL/IdMapping/TranscriptScoreBuilder.pm index 815a8c64b1..354c07a988 100644 --- a/modules/Bio/EnsEMBL/IdMapping/TranscriptScoreBuilder.pm +++ b/modules/Bio/EnsEMBL/IdMapping/TranscriptScoreBuilder.pm @@ -52,7 +52,7 @@ sub score_transcripts { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } - $self->logger->log_stamped("Starting transcript scoring...\n\n"); + $self->logger->info("Starting transcript scoring...\n\n", 0, 'stamped'); # build scores based on exon scores my $matrix = $self->scores_from_exon_scores($exon_matrix); @@ -60,23 +60,23 @@ sub score_transcripts { # log stats of combined matrix my $fmt = "%-40s%10.0f\n"; - $self->logger->log("Scoring matrix:\n"); + $self->logger->info("Scoring matrix:\n"); - $self->logger->log(sprintf($fmt, "Total source transcripts:", - $self->cache->get_count_by_name('transcripts_by_id', 'source'), 1); + $self->logger->info(sprintf($fmt, "Total source transcripts:", + $self->cache->get_count_by_name('transcripts_by_id', 'source')), 1); - $self->logger->log(sprintf($fmt, "Scored source transcripts:", + $self->logger->info(sprintf($fmt, "Scored source transcripts:", $matrix->get_source_count), 1); - $self->logger->log(sprintf($fmt, "Total target transcripts:", - $self->cache->get_count_by_name('transcripts_by_id', 'target'), 1); + $self->logger->info(sprintf($fmt, "Total target transcripts:", + $self->cache->get_count_by_name('transcripts_by_id', 'target')), 1); - $self->logger->log(sprintf($fmt, "Scored target transcripts:", + $self->logger->info(sprintf($fmt, "Scored target transcripts:", $matrix->get_target_count), 1); $self->log_matrix_stats($matrix); - $self->logger->log("\nDone with transcript scoring.\n\n"); + $self->logger->info("\nDone with transcript scoring.\n\n"); return $matrix; } @@ -101,19 +101,19 @@ sub scores_from_exon_scores { if (-s $transcript_cache) { # read from file - $self->logger->log_stamped("Reading transcript scoring matrix from file...\n"); - $self->logger->log("Cache file $transcript_cache.\n", 1); + $self->logger->info("Reading transcript scoring matrix from file...\n", 0, 'stamped'); + $self->logger->info("Cache file $transcript_cache.\n", 1); $matrix->read_from_file; - $self->logger->log_stamped("Done.\n"); + $self->logger->info("Done.\n", 0, 'stamped'); } else { # build scoring matrix - $self->logger->log("No transcript scoring matrix found. Will build new one.\n"); + $self->logger->info("No transcript scoring matrix found. Will build new one.\n"); - $self->logger->log_stamped("Transcript scoring...\n"); + $self->logger->info("Transcript scoring...\n", 0, 'stamped'); $matrix = $self->build_scores($matrix, $exon_matrix); - $self->logger->log_stamped("Done.\n"); + $self->logger->info("Done.\n", 0, 'stamped'); # write scoring matrix to file $matrix->write_to_file; @@ -144,7 +144,8 @@ sub build_scores { $self->flag_matrix_from_exon_scores($matrix, $exon_matrix); # now calculate the actual scores for the transcripts in the flag matrix - $final_matrix = $self->score_matrix_from_flag_matrix($matrix, $exon_matrix); + my $final_matrix = + $self->score_matrix_from_flag_matrix($matrix, $exon_matrix); return $final_matrix; } @@ -160,7 +161,7 @@ sub flag_matrix_from_exon_scores { my $num_transcripts = scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') }); - $self->logger->log("Creating flag matrix...\n", 1); + $self->logger->info("Creating flag matrix...\n", 1); # loop over source transcripts foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) { @@ -169,7 +170,7 @@ sub flag_matrix_from_exon_scores { $self->logger->log_progress($num_transcripts, ++$i, 20, 1, 0); # get all exons for the source transcript - foreach my $source_exon (@{ $souce_transcript->get_all_Exons }) { + foreach my $source_exon (@{ $source_transcript->get_all_Exons }) { # get target exons for this source exon from scoring matrix foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) { @@ -178,14 +179,14 @@ sub flag_matrix_from_exon_scores { foreach my $target_transcript (@{ $self->cache->get_by_key('transcripts_by_exon_id', 'target', $target_exon_id) }) { # add scoring flag for these two transcripts - $matrix->add_score($source_transcript->id, $target_transcript->id, 1)Ã; + $matrix->add_score($source_transcript->id, $target_transcript->id, 1); } } } } - $self->logger->log("\n\n"); + $self->logger->info("\n\n"); return $matrix; } @@ -210,7 +211,7 @@ sub score_matrix_from_flag_matrix { my $num_transcripts = scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') }); - $self->logger->log("Creating score matrix from flag matrix...\n", 1); + $self->logger->info("Creating score matrix from flag matrix...\n", 1); # loop over source transcripts foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) { @@ -247,7 +248,7 @@ sub score_matrix_from_flag_matrix { foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) { - next unless (%target_exon{$target_exon_id}); + next unless ($target_exons{$target_exon_id}); my $score = $exon_matrix->get_score( $source_exon->id, $target_exon_id); @@ -266,7 +267,7 @@ sub score_matrix_from_flag_matrix { foreach my $source_exon_id (@{ $exon_matrix->get_sources_for_target($target_exon->id) }) { - next unless (%source_exon{$source_exon_id}); + next unless ($source_exons{$source_exon_id}); my $score = $exon_matrix->get_score( $source_exon_id, $target_exon->id); @@ -287,7 +288,7 @@ sub score_matrix_from_flag_matrix { if (($source_transcript_score > $source_transcript_length) or ($target_transcript_score > $target_transcript_length)) { - $self->logger->log_warning("Score > length for source ($source_target_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n", 1); + $self->logger->warning("Score > length for source ($source_transcript_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n", 1); } else { @@ -305,14 +306,14 @@ sub score_matrix_from_flag_matrix { } else { - $self->logger->log_warning("Combined length of source (".$source_transcript->id.") and target (".$target_transcript->id.") transcript is zero!\n", 1); + $self->logger->warning("Combined length of source (".$source_transcript->id.") and target (".$target_transcript->id.") transcript is zero!\n", 1); } } } - $self->logger->log("\n\n"); + $self->logger->info("\n\n"); return $matrix; -- GitLab