From 418419dba9e6e628ba6a27c11fc83dcfc0413680 Mon Sep 17 00:00:00 2001 From: Graham McVicker <mcvicker@sanger.ac.uk> Date: Tue, 7 Jan 2003 11:02:59 +0000 Subject: [PATCH] removed old modules now present in pipeline --- modules/Bio/EnsEMBL/Utils/GeneCluster.pm | 655 ------ modules/Bio/EnsEMBL/Utils/GeneComparison.pm | 1874 ----------------- .../Bio/EnsEMBL/Utils/GeneComparison_conf.pl | 50 - modules/Bio/EnsEMBL/Utils/Report.pm | 183 -- .../Bio/EnsEMBL/Utils/TranscriptCluster.pm | 560 ----- 5 files changed, 3322 deletions(-) delete mode 100644 modules/Bio/EnsEMBL/Utils/GeneCluster.pm delete mode 100644 modules/Bio/EnsEMBL/Utils/GeneComparison.pm delete mode 100644 modules/Bio/EnsEMBL/Utils/GeneComparison_conf.pl delete mode 100644 modules/Bio/EnsEMBL/Utils/Report.pm delete mode 100644 modules/Bio/EnsEMBL/Utils/TranscriptCluster.pm diff --git a/modules/Bio/EnsEMBL/Utils/GeneCluster.pm b/modules/Bio/EnsEMBL/Utils/GeneCluster.pm deleted file mode 100644 index 6b6d678ffa..0000000000 --- a/modules/Bio/EnsEMBL/Utils/GeneCluster.pm +++ /dev/null @@ -1,655 +0,0 @@ -=head1 NAME - -GeneCluster - -=head1 SYNOPSIS - - -=head1 DESCRIPTION - -This object holds one or more genes which has been clustered according to -comparison criteria external to this class (for instance, in the -methods compare and _compare_Genes methods of the class GeneComparison). -Each GeneCluster object holds the IDs of the genes clustered and the beginning and end coordinates -of each one (taken from the start and end coordinates of the first and last exon in the correspondig -get_all_Exons array) - -=head1 CONTACT - -eae@sanger.ac.uk - -=cut - -# Let the code begin ... - -package Bio::EnsEMBL::Utils::GeneCluster; - -use vars qw(@ISA); -use strict; - -use Bio::EnsEMBL::Gene; -use Bio::EnsEMBL::Root; - -@ISA = qw(Bio::EnsEMBL::Root); - -=head1 METHODS - -=cut - -######################################################################### - - -=head2 new() - -new() initializes the attributes: - -$self->{'_benchmark_types'} -$self->{'_prediction_types'} -$self->{'_benchmark_genes'} -$self->{'_prediction_genes'} - -=cut - -sub new { - my ($class,$whatever)=@_; - - if (ref($class)){ - $class = ref($class); - } - my $self = {}; - bless($self,$class); - -if ($whatever){ - $self->throw( "Can't pass an object to new() method. Use put_Genes() to include Bio::EnsEMBL::Gene in cluster"); - } - - $self->{'_ys_gene'}= {}; - %{$self->{'_statistics'}}=(); - - return $self; -} - -######################################################################### - -=head2 put_Genes() - - function to include one or more genes in the cluster. - Useful when creating a cluster. It takes as argument an array of genes, it returns nothing. - -=cut - -sub put_Genes { - my ($self, @new_genes)= @_; - if ( !defined( $self->{'_benchmark_types'} ) || !defined( $self->{'_prediction_types'} ) ){ - $self->throw( "Cluster lacks references to gene-types, unable to put the gene"); - } - - GENE: - foreach my $gene (@new_genes){ - foreach my $type ( @{ $self->{'_benchmark_types'} } ){ - if ($gene->type eq $type){ - push ( @{ $self->{'_benchmark_genes'} }, $gene ); - next GENE; - } - } - foreach my $type ( @{ $self->{'_prediction_types'} } ){ - if ($gene->type eq $type){ - push ( @{ $self->{'_prediction_genes'} }, $gene ); - next GENE; - } - } - } -} - -######################################################################### - -=head2 get_Genes() - - it returns the array of genes in the GeneCluster object - -=cut - -sub get_Genes { - my $self = shift @_; - my @genes; - if ( !defined( $self->{'_benchmark_genes'} ) && !defined( $self->{'_prediction_genes'} ) ){ - $self->warn("The gene array you try to retrieve is empty"); - @genes = (); - } - if ( $self->{'_benchmark_genes'} ){ - push( @genes, @{ $self->{'_benchmark_genes'} } ); - } - if ( $self->{'_prediction_genes'} ){ - push( @genes, @{ $self->{'_prediction_genes'} } ); - } - return @genes; -} - -######################################################################### - -=head2 get_separated_Genes() - - Handy method to get the genes in the genes in the cluster separated by type. - It returns two arrayrefs. - -=cut - - -sub get_separated_Genes { - my ($self) = @_; - return ( $self->{'_benchmark_genes'}, $self->{'_prediction_genes'} ); -} - -######################################################################### - -=head2 get_Gene_Count() - - it returns the number of genes in the GeneCluster object - -=cut - -sub get_Gene_Count { - my $self = shift @_; - my $count =0; - if ( $self->{'_benchmark_genes'} ){ - $count += scalar( @{ $self->{'_benchmark_genes'} } ); - } - if ( $self->{'_prediction_genes'} ){ - $count += scalar( @{ $self->{'_prediction_genes'} } ); - } - #print STDERR "In GeneCluster.get_Gene_Count(), Count = ".$count."\n"; - return $count; -} - -######################################################################### - -=head2 gene_Types() - - It accepts two array references to set the types. One array holds the gene-types for the - benchmark genes and the other on for the predicted genes. - It can also be used to get the two type-arrays: ($types1, $types2) = $cluster->gene_Types; - The conventions throughout are (first entry: benchmark, second entry: prediction) - -=cut - -sub gene_Types { - my ($self, $benchmark_types, $prediction_types) = @_; - if ( $benchmark_types && $prediction_types ) { - $self->{'_benchmark_types'} = $benchmark_types; - $self->{'_prediction_types'} = $prediction_types; - } - return ($self->{'_benchmark_types'},$self->{'_prediction_types'}); -} - -######################################################################### - -=head2 get_Genes_by_Type() - - We can get the genes in each cluster of a given type. - We pass an arrayref containing the types we want to retrieve. - -=cut - - sub get_Genes_by_Type() { - my ($self,$types) = @_; - unless ($types){ - $self->throw( "must provide a type"); - } - my @genes = $self->get_Genes; # this should give them in order, but we check anyway - my @selected_genes; - foreach my $type ( @{ $types } ){ - push ( @selected_genes, grep { $_->type eq $type } @genes ); - } - return @selected_genes; - } - -######################################################################### - -=head2 pair_Transcripts() - - Title : pair_Transcripts() - Usage : my @array_of_pairs = $gene_cluster->pair_Transcripts - Function: This method make pairs of transcripts according to maximum reciprocal exon overlap. - Transcripts can eventually be repeated because the maximum exon_overlap may coincide - in two different pairs. - Example : look for instance in the method find_missing_Exons - Returns : three arrayrefs = - 1.- a list of Bio::EnsEMBL::Utils::Transcripts, each holding a pair of transcripts, - 2.- a list with the unpaired transcripts, and - 3.- a list those transcripts which have been paired up twice or more - Args : nothing - -=cut - -sub pair_Transcripts { - my ($self) = @_; - - # get the genes separated by type list 'benchmark'-like or 'prediction'-like - my ( $genes2, $genes1 ) = $self->get_separated_Genes; - my (@transcripts1,@transcripts2); - my (@trans1,@trans2); - foreach my $gene ( @$genes1 ){ - push( @trans1, $gene->each_Transcript ); - } - foreach my $gene ( @$genes2 ){ - push( @trans2, $gene->each_Transcript ); - } - - # first sort the transcripts by their start position coordinate - my %start_table1; - my %start_table2; - my $i=0; - foreach my $tran ( @trans1 ) { - $start_table1{$i} = $tran->start_exon->start; - $i++; - } - my $j=0; - foreach my $tra ( @trans2 ) { - $start_table2{$j} = $tra->start_exon->start; - $j++; - } - foreach my $pos ( sort { $start_table1{$a} <=> $start_table1{$b} } keys %start_table1 ){ - push (@transcripts1, $trans1[$pos]); - } - foreach my $pos ( sort { $start_table2{$a} <=> $start_table2{$b} } keys %start_table2 ){ - push (@transcripts2, $trans2[$pos]); - } - - # pair the transcripts, but first, some variable definition - - my %seen1; # these keep track of those transcript linked and with how much overlap - my %seen2; # ditto, for @transcripts2 - my @pairs; # list of (Bio::EnsEMBL::Utils::TranscriptCluster) transcript-pairs being created - my @unpaired; # list of Bio::EnsEMBL::Transcript which are left unpaired - my @doubled; # those which have been paired up twice - my $overlap_matrix; # matrix holding the number of exon overaps for each pair of transcripts - my $link; # matrix with 1 for the pairs linked and undef otherwise - my @overlap_pairs; # each entry holds an array with the overlap and the two transcripts being compared - my %repeated; # to keep track of repeated transcripts - - # first calculate all possible overlaps - foreach my $tran1 ( @transcripts1 ){ - foreach my $tran2 ( @transcripts2 ){ - $$overlap_matrix{ $tran1 }{ $tran2 } = _compare_Transcripts( $tran1, $tran2 ); - my @list = ( $$overlap_matrix{ $tran1 }{ $tran2 }, $tran1, $tran2 ); - push ( @overlap_pairs, \@list ); - #print STDERR "Overlap( ".$tran1->stable_id.",".$tran2->stable_id." ) = " - #.$$overlap_matrix{ $tran1 }{ $tran2 }."\n"; - } - } - # sort the list of @overlap_pairs on the overlap - my @sorted_pairs = sort { $$b[0] <=> $$a[0] } @overlap_pairs; - - # take the first pair of the list - my $first = shift @sorted_pairs; - my ($max_overlap,$tran1,$tran2) = @$first; - $seen1{ $tran1 } = $max_overlap; - $seen2{ $tran2 } = $max_overlap; - $$link{ $tran1 }{ $tran2 } = 1; - - # scan through each overlap - PAIR: - foreach my $list ( @sorted_pairs ){ - # each list contains @$list = ( overlap, transcript1, transcript2 ) - - # first of all, if the overlap is zero, ditch it - if ( $$list[0] == 0 ){ - next PAIR; - } - - # if we've seen both transcripts already reject them - if ( $$link{ $$list[1] }{ $$list[2] } && defined( $seen1{ $$list[1] } ) && defined( $seen2{ $$list[2] } ) ){ - next PAIR; - } - - # if the same score... - if ( $$list[0] == $max_overlap ) { - - # if we've seen both transcripts already, check they have the highest score - if ( defined( $seen1{ $$list[1] } ) && defined( $seen2{ $$list[2] } ) ){ - if ( $$list[0] == $seen1{ $$list[1] } && $$list[0] == $seen2{ $$list[2] } ){ - $$link{ $$list[1] }{ $$list[2] } = 1; - } - next PAIR; - } - - # if the pair is entirely new, we accept it - if ( !defined( $seen1{ $$list[1] } ) && !defined( $seen2{ $$list[2] } ) ){ - $$link{ $$list[1] }{ $$list[2] } = 1; - $seen1{ $$list[1] } = $$list[0]; - $seen2{ $$list[2] } = $$list[0]; - next PAIR; - } - - # we accept repeats only if this is their maximum overlap as well - if ( !defined( $seen2{$$list[2]} ) && defined( $seen1{$$list[1]} ) && $$list[0] == $seen1{$$list[1]} ){ - $$link{ $$list[1] }{ $$list[2] } = 1; - $seen2{ $$list[2] } = $$list[0]; - if ( !defined( $repeated{ $$list[1] } ) ){ - push( @doubled, $$list[1] ); - $repeated{ $$list[1] } = 1; - } - next PAIR; - } - if ( !defined( $seen1{$$list[1]} ) && defined( $seen2{$$list[2]} ) && ($$list[0] == $seen2{$$list[2]}) ){ - $$link{ $$list[1] }{ $$list[2] } = 1; - $seen1{ $$list[1] } = $$list[0]; - if ( !defined( $repeated{ $$list[2] } ) ){ - push( @doubled, $$list[2] ); - $repeated{ $$list[2] } = 1; - } - next PAIR; - } - } - - # if the score is lower, only accept if the pair is completely new - if ( $$list[0] < $max_overlap ){ - if ( !defined( $seen1{ $$list[1] } ) && !defined( $seen2{ $$list[2] } ) ){ - $$link{ $$list[1] }{ $$list[2] } = 1; - $seen1{ $$list[1] } = $$list[0]; - $seen2{ $$list[2] } = $$list[0]; - $max_overlap = $$list[0]; - next PAIR; - } - } - } - - # create a new cluster for each pair linked - foreach my $tran1 ( @transcripts1 ){ - foreach my $tran2 ( @transcripts2 ){ - if ( $$link{ $tran1 }{ $tran2} && $$link{ $tran1 }{ $tran2 } == 1 ){ - my $pair = Bio::EnsEMBL::Utils::TranscriptCluster->new(); - $pair->put_Transcripts( $tran1, $tran2 ); - push( @pairs, $pair ); - } - } - } - - # finally, check for the unseen ones - foreach my $tran1 ( @transcripts1 ){ - if ( !defined( $seen1{ $tran1 } ) ){ - push( @unpaired, $tran1 ); - } - } - foreach my $tran2 ( @transcripts2 ){ - if ( !defined( $seen2{ $tran2 } ) ){ - push( @unpaired, $tran2 ); - } - } - print STDERR scalar(@pairs)." transcript pairs created\n"; - #my $count2=1; - #foreach my $pair ( @pairs ){ - # print STDERR "pair $count2:\n".$pair->to_String; - # $count2++; - #} - #$count2=1; - #print STDERR scalar(@unpaired)." unpaired transcripts\n"; - #foreach my $unpaired ( @unpaired ){ - # print STDERR "unpaired $count2: ".$unpaired->stable_id."\n"; - #} - # return the pairs, the unpaired transcripts, and those transcript which have been taken twice or more - return (\@pairs,\@unpaired,\@doubled); -} - -######################################################################### - - -=head2 _compare_Transcripts() - - Title: _compare_Transcripts() - Usage: this internal function compares the exons of two transcripts according to overlap - and returns the number of overlaps -=cut - -sub _compare_Transcripts { - my ($transcript1,$transcript2) = @_; - my @exons1 = $transcript1->get_all_Exons; - my @exons2 = $transcript2->get_all_Exons; - my $overlaps = 0; - - foreach my $exon1 (@exons1){ - - foreach my $exon2 (@exons2){ - - if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){ - $overlaps++; - } - } - } - return $overlaps; # we keep track of the number of overlaps to be able to choose the best match -} - - - -######################################################################### - -=head2 get_first_Gene() - - it returns the first gene in the cluster, which usually would be the benchmark gene - -=cut - -sub get_first_Gene { - my $self = shift @_; - return @{$self->{'_benchmark_genes'}}[0]; -} - -######################################################################### - -=head2 to_String() - - it returns a string containing the information about the genes contained in the - GeneCluster object - -=cut - -sub to_String { - my $self = shift @_; - my $data=''; - foreach my $gene ( $self->get_Genes ){ - my @exons = $gene->get_all_Exons; - - $data .= sprintf "Id: %-16s" , $gene->stable_id; - $data .= sprintf "Contig: %-20s" , $exons[0]->contig->id; - $data .= sprintf "Exons: %-3d" , scalar(@exons); - $data .= sprintf "Start: %-9d" , $self->_get_start($gene); - $data .= sprintf "End: %-9d" , $self->_get_end ($gene); - $data .= sprintf "Strand: %-2d\n" , $exons[0]->strand; - } - return $data; -} - -######################################################################### - -=head2 _get_start() - - function to get the start position of a gene - it reads the gene object and it returns - the start position of the first exon - -=cut - -sub _get_start { - my ($self,$gene) = @_; - my @exons = $gene->get_all_Exons; - my $st; - - if ($exons[0]->strand == 1) { - @exons = sort {$a->start <=> $b->start} @exons; - $st = $exons[0]->start; - } else { - @exons = sort {$b->start <=> $a->start} @exons; # they're read in opposite direction (from right to left) - $st = $exons[0]->end; # the start is the end coordinate of the right-most exon - } # which is here the first of the list of sorted @exons - return $st; -} - -######################################################################### - -=head2 _get_end() - - function to get the end position of a gene - it reads the gene object and it returns - the end position of the last exon - -=cut - -sub _get_end { - my ($self,$gene) = @_; - my @exons = $gene-get_all_Exons; - my $end; - - if ($exons[0]->strand == 1) { - @exons = sort {$a->start <=> $b->start} @exons; - $end = $exons[$#exons]->end; - } else { - @exons = sort {$b->start <=> $a->start} @exons; # they're read in opposite direction (from right to left) - $end = $exons[$#exons]->start; # the end is the start coordinate of the left-most exon - } # which is here the last of the list @exons - return $end; -} - -######################################################################### -#Adding new methods to calculate the prediction accuracies of the genes in this cluster -######################################################################### - -=head2 nucleotide_level_accuracy() - - function that calculates the difference between the annotated and predicted genes at a nucleotide level - returns the average sensitivity and specificity of the predictions - -=cut - -sub nucleotide_level_accuracy { - my ($self) = @_; - my @genes = $self->get_Genes; - my %statistics; - shift @genes; # the first gene in the array should be the yardstick gene - - GENE: foreach my $gene (@genes){ - my $count =0; - my ($sum_sn,$sum_sp) = (0,0); - my ($gene_sn,$gene_sp) ; - TRANS: foreach my $trans ($gene->each_Transcript) { - - my @results=$self->evaluate_Transcripts($trans); - next TRANS unless @results; - $count ++; #provides a running counter of transcripts which overlap. - my ($trans_sn, $trans_sp) = @results; - $sum_sn += $trans_sn; - $sum_sp += $trans_sp; - - - } - $gene_sn = $sum_sn/$count; - $gene_sp = $sum_sp/$count; - my @gene_stats = ($gene_sn, $gene_sp); - $statistics{$gene->stable_id} = [@gene_stats]; - } - $self->statistics(%statistics); -} - - -######################################################################### - -=head2 statistics() - - returns a hash containing the statistics of each gene in this cluster - -=cut - -sub statistics { - my ($self,%stats) = @_; - if (%stats){ - %{$self->{'_statistics'}}=%stats; - } - return %{$self->{'_statistics'}}; -} - - -######################################################################### - -=head2 evaluate_Transcripts() - - function that compares a transcript with each transcript of the annotated gene in this cluster - -=cut - -sub evaluate_Transcripts { - - my ($self,$trans) = @_; - - my $yardstick = $self->get_first_Gene(); - - my $count=0; - my ($sum_sn, $sum_sp,$sn,$sp) = (0,0); # The sums and average sensitivity and specificity of this particular transcript WRT - # all transcripts in the yardstick gene in this cluster. - - TRANS: foreach my $ys_trans($yardstick->each_Transcript) { - my $trans_tp = 0; - my @ys_exons = $ys_trans->translateable_exons; - - foreach my $ys_exon (@ys_exons){ - - my @exons = $trans->translateable_exons; - EXON: while (@exons){ - - my $exon = shift @exons; - - next EXON unless ($exon->overlaps($ys_exon) && ($exon->strand eq $ys_exon->strand)); - my $overlap; - my ($exon_start,$exon_end) = ($exon->start,$exon->end); - my ($ys_start,$ys_end) = ($ys_exon->start,$ys_exon->end); - my ($start,$end); - - if ($exon_start > $ys_start) { $start = $exon_start;} - else {$start = $ys_start;} - if ($exon_end < $ys_end) { $end = $exon_end;} - else {$end = $ys_end;} - - $overlap = $end - $start; - - $trans_tp += $overlap; - } - - } - next TRANS unless ($trans_tp ne 0); - - $count ++; #provides a running counter of transcripts which overlap. - my $trans_ap = _translateable_exon_length($ys_trans); - my $trans_pp = _translateable_exon_length($trans); - - $sum_sn += sprintf("%.2f",($trans_tp)/($trans_ap)*100); - $sum_sp += sprintf("%.2f",($trans_tp)/($trans_pp)*100); - #$sum_sn += $trans_tp/$trans_ap; # sensitivity - #$sum_sp += $trans_tp/$trans_pp; #specificity - } - if ($count eq 0){ - print STDERR "Count eq 0. is there an error?\n"; - return 0 ; - } - $sn = $sum_sn/$count; - $sp = $sum_sp/$count; - - my @statistics = ( $sn,$sp); - return @statistics; -} - -######################################################################### - -=head2 _translateable_exon_length() - - internal function that returns the length of the translateable exons - -=cut - -sub _translateable_exon_length { - my ($trans)= @_; - my @exons = $trans->translateable_exons; - my $length = 0; - foreach my $ex (@exons) { - $length += $ex->length; - } - return $length; - -} - -1; diff --git a/modules/Bio/EnsEMBL/Utils/GeneComparison.pm b/modules/Bio/EnsEMBL/Utils/GeneComparison.pm deleted file mode 100644 index e46c72855b..0000000000 --- a/modules/Bio/EnsEMBL/Utils/GeneComparison.pm +++ /dev/null @@ -1,1874 +0,0 @@ -=head1 NAME - Bio::EnsEMBL::Utils::GeneComparison - -=head1 DESCRIPTION - -Perl Class for comparison of two sets of genes. -It can read two references to two arrays of genes, e.g. EnsEMBL built genes and human annotated genes, -and it compares them using different methods (see Synopsis). - -The object must be created passing two arrayref with the list of genes to be comapred. - -Each GeneComparison object can contain data fields specifying the arrays of genes to be compared. -There are also data fields in the form of two arrays, which contain -the gene types present in those both gene arrays. - -=head1 SYNOPSIS - - my $gene_comparison = Bio::EnsEMBL::Utils::GeneComparison->new(\@genes1,\@genes2); - - my @clusters = $gene_comparison->cluster_Genes; - -get the list of unmatched genes, this returns two array references of GeneCluster objects as well, -but only containing the unmatched ones: - - my ($ens_unmatched,$hum_unmatched) = $gene_comparison->get_unmatched_Genes; - -get the list of fragmented genes: - - my @fragmented = $gene_comparison->get_fragmented_Genes (@clusters); - -cluster the transcripts using the gene clusters obtained above -(first cluster all genes and then cluster the transcripts within each gene): - - my @transcript_clusters = $gene_comparison->cluster_Transcripts_by_Gene(@clusters); - -Also, one can cluster the transcripts of the genes in _gene_array1 and gene_array2 directly -(cluster all transcripts without going through gene-clustering) - - my @same_transcript_clusters = $gene_comparison->cluster_Transcripts; - -One can get the number of exons per percentage overlap using whole exons - - my %statistics = $gene_comparison->get_Exon_Statistics; - -Or only coding exons - - my %coding_statistics = $gene_comparison->get_Coding_Exon_Statistics; - -The hashes hold the number of occurences as values and integer percentage overlap as keys -and can be used to produce a histogram: - - for (my $i=1; $i<= 100; $i++){ - if ( $statistics{$i} ){ - print $i." :\t".$statistics{$i}."\n"; - } - else{ - print $i." :\n"; - } - } - -for more info about how to use it look in the example script -...ensembl/misc-scripts/utilities/gene_comparison_script.pl - -=head1 CONTACT - -eae@sanger.ac.uk - -=cut - -# Let the code begin ... - -package Bio::EnsEMBL::Utils::GeneComparison; - -use vars qw(@ISA); -use strict; - -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::Utils::GeneCluster; -use Bio::EnsEMBL::Utils::TranscriptCluster; -use Bio::EnsEMBL::Root; - -do "Bio/EnsEMBL/Utils/GeneComparison_conf.pl"; - -@ISA = qw(Bio::EnsEMBL::Root); - -#################################################################################### - -=head2 new() - -the new() method accepts two array references - -=cut - -sub new { - - my ($class,$gene_array1,$gene_array2) = @_; - # as convention, we put first the annotated (or benchmark) genes and second the predicted genes - # Anyway, the comparisons are made from the gene_array2 with respect to the gene_array1 - # so 'missing_exons' means when gene_array2 misses exons with respect to gene_array1 and - # overpredicted exons means when gene_array2 have exons in excess with respect to the gene_array1 - - if (ref($class)){ - $class = ref($class); - } - my $self = {}; - bless($self,$class); - $self->{'_unclustered_genes'}= []; - $self->{'_gene_clusters'}= []; - - if ( $gene_array1 && $gene_array2 ){ - $self->{'_gene_array1'}= $gene_array1; - $self->{'_gene_array2'}= $gene_array2; - - # we keep track of the gene types present in both arrays - - my %types1; - foreach my $gene ( @{ $gene_array1 } ){ - $types1{$gene->type}=1; - } - foreach my $k ( keys(%types1) ){ # put in the array type_array1 - push( @{ $self->{'_type_array1'} }, $k); # the gene types present in gene_array1 - } - - my %types2; - foreach my $gene ( @{ $gene_array2 } ){ - $types2{$gene->type}=1; - } - foreach my $k ( keys(%types2) ){ # put in the array _type_array2 - push( @{ $self->{'_type_array2'} }, $k); # the gene types present in gene_array2 - } - } - else{ - $self->throw( "Can't create a Bio::EnsEMBL::Utils::GeneComparison object without passing in two gene arrayref"); - } - if ( scalar( @{ $gene_array1 } ) == 0 || scalar( @{ $gene_array2 } ) == 0 ){ - $self->throw( "At least one of the lists of genes to compare is empty. Cannot create a GeneComparison object"); - } - return $self; -} - -###################################################################################### - -=head2 gene_Types() - -this function sets or returns two arrayref with the types of the genes to be compared. -All the comparisons are made from the gene_array1 with respect to the gene_array2. - -=cut - -sub gene_Types { - my ($self,$type1,$type2) = @_; - if ( $type1 && $type2 ){ - $self->{'_type_array1'} = $type1; - $self->{'_type_array2'} = $type2; - } - return ( $self->{'_type_array1'}, $self->{'_type_array2'} ); -} - -###################################################################################### - -=head2 cluster_Genes - - This method takes an array of genes and cluster them - according to their exon overlap. As a default it takes the genes stored in the GeneComparison object - as data fields (or attributes) '_gene_array1' and '_gene_array2'. It can also accept instead as argument - an array of genes to be clustered, but then information about their gene-type is lost (to be solved). - This method returns an array of GeneCluster objects. - -=cut - -sub cluster_Genes { - - my ($self) = @_; - my @genes = ( @{ $self->{'_gene_array1'} }, @{ $self->{'_gene_array2'} } ); - - #### first sort the genes by the left-most position coordinate #### - my %start_table; - my $i=0; - foreach my $gene (@genes){ - $start_table{$i}=_get_start_of_Gene($gene); - $i++; - } - my @sorted_genes=(); - foreach my $k ( sort { $start_table{$a} <=> $start_table{$b} } keys %start_table ){ - push (@sorted_genes, $genes[$k]); - } - - print "Clustering ".scalar( @sorted_genes )." genes...\n"; - #my $label=1; - #foreach my $gene (@sorted_genes){ - # print $label." gene ".$gene->stable_id."\t\t"._get_start_of_Gene($gene)." "._get_strand_of_Gene($gene)."\n"; - # $label++; - #} - my $found; - my $time1=time(); - -##### old clustering algorithm ########################################### - -# my @clusters=(); # this will hold an array of GeneCluster objects -# my $lookups=0; -# my $count = 1; -# my $new_cluster_count=1; -# my $jumpy=0; - -# foreach my $gene (@sorted_genes){ -# print STDERR $count." gene ".$gene->stable_id." being located..."; -# $count++; -# $found=0; -# my $cluster_count=1; -# LOOP: -# foreach my $cluster (@clusters){ -# foreach my $gene_in_cluster ( $cluster->get_Genes ){ # read all genes from GeneCluster object -# $lookups++; -# if ( _compare_Genes($gene,$gene_in_cluster) ){ -# print STDERR "put in cluster ".$cluster_count."\n"; -# if ( $cluster_count != ($new_cluster_count-1) ){ -# print STDERR "\nONE JUMPING AROUND!!\n\n"; -# $jumpy++; -# } - -# $cluster->put_Genes($gene); # put gene into GeneCluster object -# $found=1; -# last LOOP; -# } -# } -# $cluster_count++; -# } -# if ($found==0){ -# my $new_cluster=Bio::EnsEMBL::Utils::GeneCluster->new(); # create a GeneCluser object -# print STDERR "put in new cluster [".$new_cluster_count."]\n"; -# $new_cluster_count++; -# $new_cluster->gene_Types($self->gene_Types); -# $new_cluster->put_Genes($gene); -# push(@clusters,$new_cluster); -# } -# } -# my $time2 = time(); -# print STDERR "\ntime for clustering: ".($time2-$time1)."\n"; -# print STDERR "number of lookups: ".$lookups."\n"; -# print STDERR "number of jumpies: ".$jumpy."\n\n"; -# return @clusters; -# # put all unclustered genes (annotated and predicted) into one separate array -# $self->flush_gene_Clusters; -# foreach my $cl (@clusters){ -# if ( $cl->get_Gene_Count == 1 ){ -# $self->unclustered_Genes($cl); # this push the cluster into array @{ $self->{'_unclustered_genes'} } -# } -# else{ -# $self->clusters( $cl ); -# } -# } -# return $self->clusters; - -######################################################################### - - #### new clustering algorithm, faster than the old one #### - #### however, the order of the genes does not guarantee that genes are not skipped, since this algorithm - #### only checks the current cluster and sometimes - - # create a new cluster - my $cluster=Bio::EnsEMBL::Utils::GeneCluster->new(); - my $cluster_count = 1; - my @clusters; - - # pass in the types we're using - my ($types1,$types2) = $self->gene_Types; - $cluster->gene_Types($types1,$types2); - - # put the first gene into these cluster - $cluster->put_Genes( $sorted_genes[0] ); - push (@clusters, $cluster); - $self->gene_Clusters($cluster); - - # loop over the rest of the genes - LOOP1: - for (my $c=1; $c<=$#sorted_genes; $c++){ - $found=0; - LOOP: - foreach my $gene_in_cluster ( $cluster->get_Genes ){ - if ( _compare_Genes( $sorted_genes[$c], $gene_in_cluster ) ){ - $cluster->put_Genes( $sorted_genes[$c] ); - $found=1; - next LOOP1; - } - } - # if not in this cluster compare to the ($limit) previous clusters - - my $limit = 14; - if ( $found == 0 && $cluster_count > 1 ){ - my $lookup = 1; - while ( !($cluster_count <= $lookup) && !($lookup > $limit) ){ - #print STDERR "cluster_count: $cluster_count, looking at ".($cluster_count - $lookup)."\n"; - my $previous_cluster = $clusters[ $cluster_count - 1 - $lookup ]; - foreach my $gene_in_cluster ( $previous_cluster->get_Genes ){ - if ( _compare_Genes( $sorted_genes[$c], $gene_in_cluster ) ){ - $previous_cluster->put_Genes( $sorted_genes[$c] ); - $found=1; - next LOOP1; - } - } - $lookup++; - } - } - if ($found==0){ # if not-clustered create a new GeneCluser - $cluster = new Bio::EnsEMBL::Utils::GeneCluster; - $cluster->gene_Types($types1,$types2); - $cluster->put_Genes( $sorted_genes[$c] ); - $cluster_count++; - push( @clusters, $cluster ); - $self->gene_Clusters( $cluster ); - } - } - # put all unclustered genes (annotated and predicted) into one separate array - my $time2 = time(); - print STDERR "time for clustering: ".($time2-$time1)."\n"; - my @new_clusters; - foreach my $cl ($self->gene_Clusters){ - if ( $cl->get_Gene_Count == 1 ){ - $self->unclustered_Genes($cl); # this push the cluster into array @{ $self->{'_unclustered_genes'} } - } - else{ - push( @new_clusters, $cl ); - } - } - $self->flush_gene_Clusters; - $self->gene_Clusters(@new_clusters); - my @unclustered = $self->unclustered_Genes; - return @new_clusters; -} - -###################################################################################### - -=head2 pair_Genes - - This method crates one GeneCluster object per benchmark gene and then PAIR them with predicted genes - according to their exon overlap. As a default it takes the genes stored in the GeneComparison object - as data fields (or attributes) '_gene_array1' and '_gene_array2'. It can also accept instead as argument - an array of genes to be paired, but then information about their gene-type is lost (to be solved). - The result is put into $self->{'_gene_clusters'} and $self->{'_unclustered_genes'} - -=cut - -sub pair_Genes { - - my ($self) = @_; - my (@genes1,@genes2); - - @genes1 = @{ $self->{'_gene_array1'} }; - @genes2 = @{ $self->{'_gene_array2'} }; - - #### first sort the genes by the left-most position coordinate #### - my %start_table; - my $i=0; - foreach my $gene (@genes1){ - $start_table{$i}=_get_start_of_Gene($gene); - $i++; - } - my @sorted_genes=(); - foreach my $k ( sort { $start_table{$a} <=> $start_table{$b} } keys %start_table ){ - push (@sorted_genes, $genes1[$k]); - } - @genes1 = @sorted_genes; - - %start_table = (); - $i=0; - foreach my $gene (@genes2){ - $start_table{$i}=_get_start_of_Gene($gene); - $i++; - } - @sorted_genes=(); - foreach my $k ( sort { $start_table{$a} <=> $start_table{$b} } keys %start_table ){ - push (@sorted_genes, $genes2[$k]); - } - @genes2 = @sorted_genes; - - #### PAIR the genes #### - #### creating a new cluster for each gene in the benchmark set ### - - foreach my $gene (@genes2){ - # create a GeneCluser object - my $cluster=Bio::EnsEMBL::Utils::GeneCluster->new(); - my ($type1,$type2) = $self->gene_Types; - $cluster->gene_Types($type1,$type2); - $cluster->put($gene); - $self->gene_Clusters($cluster); - } - - GENE: while (@genes1){ - my $gene = shift @genes1; - - foreach my $cluster ($self->gene_Clusters){ - my $gene_in_cluster = $cluster->get_first_Gene ; # read first gene from GeneCluster object - - if ( _compare_Genes($gene,$gene_in_cluster) ){ - $cluster->put_Genes($gene); # put gene into GeneCluster object - next GENE; - } - } - #### an arbitary cluster containing the set of predicted genes that do not overlap with any from the - #### benchmark set - my $unclustered = new Bio::EnsEMBL::Utils::GeneCluster; - $unclustered->gene_Types($self->gene_Types); - $unclustered->put_Genes($gene); # NOTE: this holds unclustered predicted genes, but does not - # know about unpaired benchmark genes - - $self->unclustered_Genes($unclustered); # push the cluster into an array - } - return 1; -} -######################################################################### - -=head2 _get_start_of_Gene() - - function to get the left-most coordinate of the exons of the gene (start position of a gene). - For genes in the strand 1 it reads the gene object and it returns the start position of the - first exon. For genes in the strand -1 it picks the last exon and reads the start position; this - is not the proper start of the gene but it is the left-most coordinate. - -=cut - -sub _get_start_of_Gene { - my $gene = shift @_; - my @exons = $gene->get_all_Exons; - my $st; - if ($exons[0]->strand == 1) { - @exons = sort {$a->start <=> $b->start} @exons; - $st = $exons[0]->start; - } else { - @exons = sort {$b->start <=> $a->start} @exons; - $st = $exons[$#exons]->start; - } - return $st; -} - -########################################################################## - -=head2 _get_strand_of_Gene() - -=cut - -sub _get_strand_of_Gene { - my $gene = shift @_; - my @exons = $gene->get_all_Exons; - - if ($exons[0]->strand == 1) { - return 1; - } - else{ - return -1; - } - return 0; -} - - - -#################################################################################### - -=head2 cluster_Transcripts_by_Gene() - -it first clusters all genes using cluster_Genes and then cluster the transcripts within each gene -cluster. If one has already made a gene-clustering, one can pass the array of clusters -to cluster_Transcripts_by_Gene() to avoid repetition of work - -See cluster_Transcripts for a different way of clustering transcripts. - -=cut - -sub cluster_Transcripts_by_Gene { - my ($self,$array) = @_; - my @gene_clusters; - my @transcript_clusters; - if ($array){ - push(@gene_clusters,@$array); - } - else{ - @gene_clusters = $self->cluster_Genes; - } - foreach my $cluster (@gene_clusters){ - - my @transcripts; - foreach my $gene ( $cluster->get_Genes ){ - push ( @transcripts, $gene->each_Transcript ); - } - push ( @transcript_clusters, $self->cluster_Transcripts(@transcripts) ); - # we pass the array of transcripts to be clustered to the method cluster_Transcripts - } - return @transcript_clusters; -} - -#################################################################################### - -=head2 cluster_Transcripts() - - This method cluster all the transcripts in the gene arrays passed to the GeneComparison constructor new(). - It also accepts an array of transcripts to be clustered. The clustering is done according to exon - overlaps, which is implemented in the function _compare_Transripts. - This method returns an array of TranscriptCluster objects. - -=cut - -sub cluster_Transcripts { - my ($self,@transcripts) = @_; - - if ( !defined(@transcripts) ){ - my @genes = ( @{ $self->{'_gene_array1'} }, @{ $self->{'_gene_array2'} } ); - foreach my $gene (@genes){ - my @more_transcripts = $gene->each_Transcript; - push ( @transcripts, @more_transcripts ); - } - } - # we do the clustering with the array @transcripts like we do with genes - - # first sort the transcripts by their start position coordinate - my %start_table; - my $i=0; - foreach my $transcript (@transcripts){ - my $start; - my $seqname; - my @exons = $transcript->get_all_Exons; - @exons = sort { $a->start <=> $b->start } @exons; - if ( $exons[0]->start > $exons[0]->end){ - $start = $exons[0]->end; - } - else{ - $start = $exons[0]->start; - } - $start = $transcript->start_exon->start; - $start_table{$i} = $start; - $i++; - - # if some exons in the transcript fall outside the vc, they will not be converted to vc coordinates - foreach my $exon (@exons){ - unless ( $seqname ){ - $seqname = $exon->seqname; - } - unless ($seqname eq $exon->seqname){ - $self->warn("transcript ".$transcript->stable_id." (".$transcript->dbID.") is partly". - " outside the contig"); - last; - } - } - } - my @sorted_transcripts=(); - foreach my $pos ( sort { $start_table{$a} <=> $start_table{$b} } keys %start_table ){ - push (@sorted_transcripts, $transcripts[$pos]); - } - @transcripts = @sorted_transcripts; -# # test -# foreach my $tran (@transcripts){ -# print STDERR "\ntranscript: ".$tran->stable_id."internal_id: ".$tran->dbID."\n"; -# foreach my $exon ($tran->get_all_Exons){ -# print STDERR $exon->seqname." ".$exon->start.":".$exon->end."\n"; -# } -# print STDERR "\n"; -# } - my $time1 = time(); - my @clusters; - - # create a new cluster - my $cluster=Bio::EnsEMBL::Utils::TranscriptCluster->new(); - my $cluster_count = 1; - - # put the first transcript into these cluster - $cluster->put_Transcripts( $sorted_transcripts[0] ); - push( @clusters, $cluster ); - # $self->transcript_Clusters($cluster); - - # loop over the rest of the genes - LOOP1: - for (my $c=1; $c<=$#sorted_transcripts; $c++){ - my $found=0; - LOOP: - foreach my $t_in_cluster ( $cluster->get_Transcripts){ - if ( _compare_Transcripts( $sorted_transcripts[$c], $t_in_cluster ) ){ - $cluster->put_Transcripts( $sorted_transcripts[$c] ); - $found=1; - next LOOP1; - } - } - # if not in this cluster compare to the previous clusters: - - # to set a limit in the number of previous cluster to look up - # set for example my $limit = 6; and then add to the while condition : - # while ( !(...) && !($lookup > $limit) ){ - - my $limit = 14; - if ( $found == 0 && $cluster_count > 1 ){ - my $lookup = 1; - while ( !($cluster_count <= $lookup ) && !($lookup > $limit) ){ - #print STDERR "cluster_count: $cluster_count, looking at ".($cluster_count - $lookup)."\n"; - my $previous_cluster = $clusters[ $cluster_count - 1 - $lookup ]; - foreach my $t_in_cluster ( $previous_cluster->get_Transcripts ){ - if ( _compare_Transcripts( $sorted_transcripts[$c], $t_in_cluster ) ){ - $previous_cluster->put_Transcripts( $sorted_transcripts[$c] ); - $found=1; - next LOOP1; - } - } - $lookup++; - } - } - # if not-clustered create a new TranscriptCluser - if ($found==0){ - $cluster = new Bio::EnsEMBL::Utils::TranscriptCluster; - $cluster->put_Transcripts( $sorted_transcripts[$c] ); - $cluster_count++; - push( @clusters, $cluster ); - # $self->transcript_Clusters( $cluster ); - } - } - my $time2 = time(); - my $time = $time2-$time1; - print STDERR "clustering time: ".$time."\n"; - return @clusters; -} - - -#################################################################################### - -sub compare_CDS{ - my ($self,$clusters) = @_; - my @total_ann_unpaired; - my @total_pred_unpaired; - my @total_ann_doubled; - my @total_pred_doubled; - my $pairs_count = 0; - - GENE: - foreach my $gene_cluster (@$clusters){ - my ( $pairs, $ann_unpaired, $pred_unpaired, $ann_doubled, $pred_doubled) = $gene_cluster->pair_Transcripts; - my @pairs = @{ $pairs }; - my @ann_unpaired; - my @pred_unpaired; - my @ann_doubled; - my @pred_doubled; - push ( @ann_unpaired , @{ $ann_unpaired } ); - push ( @pred_unpaired, @{ $pred_unpaired } ); - push ( @ann_doubled , @{ $ann_doubled } ); - push ( @pred_doubled , @{ $pred_doubled } ); - - push ( @total_ann_doubled , @ann_doubled ); - push ( @total_pred_doubled , @pred_doubled ); - push ( @total_ann_unpaired , @ann_unpaired ); - push ( @total_pred_unpaired, @pred_unpaired ); - - # for each pair we keep track of the exon comparison - - PAIR: - foreach my $pair ( @pairs ){ - $pairs_count++; - my ($prediction,$annotation) = $pair->get_Transcripts; - } - } -} - -#################################################################################### - -=head2 compare_Exons() - - Title : compare_Exons() - Usage : my %stats = $gene_comparison->compare_Exons(\@gene_clusters); - Function: This method takes an array of GeneCluster objects, pairs up all the transcripts in each - cluster and then go through each transcript pair trying to match the exons. - It keeps track of the exons present in the gene_array2 passed to new() that are missing in - the corresponding transcript of gene_array1, also of those exons that are overpredicted in - in gene_array1 with respect to gene_array2. It also gives a generic exon_mismatch result. - Setting the flag 'coding' in the arguments we can compare the CDSs - - Example : look in ...ensembl/misc-scripts/utilities/run_GeneComparison for a pipeline-like example - Returns : a hash with the arrays of transcript pairs (each pair being a Bio::EnsEMBL::Utils::TranscriptCluster) - as values, and the number of missing exons as keys, useful to make a histogram - Args : an arrayref of Bio::EnsEMBL::Utils::GeneCluster objects - -=cut - -sub compare_Exons{ - my ($self,$clusters,$coding) = @_; - - my @pairs_missing; # holds the transcript pairs that have one or more exons missing - my $pairs_count; # this will count the total number of pairs compared - my $total_missing_exon_count = 0; # counts the number of overpredicted exons - my $total_over_exon_count = 0; # same for overpredictions - my $total_exon_mismatches = 0; # includes the above two cases - - my %over_exon_position; # keeps track of the positions of the exons - my %missing_exon_position; # keeps track of the positions of the exons - - my $total_prediction_matched = 0; # numerator in the computation of the sensitivity/specificity - # ( = TruePositive ) - - my $total_annotation_length = 0; # denominator in the computation of the sensitivity - # ( = TruePositive + FalseNegative ) - - my $total_prediction_length = 0; # denominator in the computation of the specificity - # ( = TruePositive + FalsePositive ) - - my $sum_sensitivity = 0; - my $sum_specificity = 0; - my $average_sensitivity = 0; - my $average_specificity = 0; - my $total_sensitivity = 0; - my $total_specificity = 0; - - if ( !defined( $clusters ) ){ - $self->throw( "Must pass an arrayref of Bio::EnsEMBL::Utils::GeneCluster objects"); - } - if ( !$$clusters[0]->isa( 'Bio::EnsEMBL::Utils::GeneCluster' ) ){ - $self->throw( "Can't process a [$$clusters[0]], you must pass a Bio::EnsEMBL::Utils::GeneCluster" ); - } - - # warn about the comparison that is about to happen - my ($annotation_types,$prediction_types) = $$clusters[0]->gene_Types; - my ($message1,$message2) = ( '',''); - my $messageA = "Comparing exons in gene-types: "; - foreach my $type ( @$annotation_types ){ - $message1 .= $type.", "; - } - my $messageB = " with exons in gene-types: "; - foreach my $type ( @$prediction_types ){ - $message2 .= $type.", "; - } - print STDERR $messageA.$message1.$messageB.$message2."...\n"; - - my %missing; # this hash holds the transcript pairs with one, two, etc... missing exons - my %overpredicted; # similarly for overpredicted exons - my %mismatches; # includes all of the above - - my @total_ann_unpaired; - my @total_pred_unpaired; - my @total_ann_doubled; - my @total_pred_doubled; - my $exact_matches = 0; # counts the number of exact matching exons - my $exon_pair_count = 0; # counts the number of exons - - my $cluster_count = 1; - - # we check for missing exons in genes of type $type2 in each gene cluster - GENE: - foreach my $gene_cluster (@$clusters){ - print STDERR "\nIn gene-cluster $cluster_count\n"; - $cluster_count++; - - # pair_Transcripts returns (\@pairs,\@ann_unpaired,\@pred_unpaired,\@ann_doubled,\@pred_doubled) - print STDERR "pairing up transcripts ...\n"; - my ( $pairs, $ann_unpaired, $pred_unpaired, $ann_doubled, $pred_doubled) = $gene_cluster->pair_Transcripts; - my @pairs = @{ $pairs }; - my @ann_unpaired; - my @pred_unpaired; - my @ann_doubled; - my @pred_doubled; - push ( @ann_unpaired , @{ $ann_unpaired } ); - push ( @pred_unpaired, @{ $pred_unpaired } ); - push ( @ann_doubled , @{ $ann_doubled } ); - push ( @pred_doubled , @{ $pred_doubled } ); - - push ( @total_ann_doubled , @ann_doubled ); - push ( @total_pred_doubled , @pred_doubled ); - push ( @total_ann_unpaired , @ann_unpaired ); - push ( @total_pred_unpaired, @pred_unpaired ); - - # for each pair we keep track of the exon comparison - - PAIR: - foreach my $pair ( @pairs ){ - $pairs_count++; - - # we match the exons in the pair - my ($printout, $missing_stats, $over_stats, $mismatch_stats, $match_stats) = - $self->_match_Exons($pair, $coding); - - # where - # printout{ pair }{ exon_number } = [ exon/no link, exon/no link, extra comments ]; - # $missing_stats = [ $missing_exon_count , \%missing_exon_position ]; - # $over_stats = [ $over_exon_count , \%over_exon_position ]; - # $mismatch_stats = [ $exon_mismatch_count ]; - # $match_stats = [ $exon_pair_count , $thispair_exact_matches ]; - - - my $missing_exon_count = $$missing_stats[0]; - my %thispair_missing_exon_position = %{ $$missing_stats[1] }; - - my $over_exon_count = $$over_stats[0]; - my %thispair_over_exon_position = %{ $$over_stats[1] }; - - my $exon_mismatch_count = $$mismatch_stats[0]; - - my $thispair_exon_pair_count = $$match_stats[0]; - my $thispair_exact_matches = $$match_stats[1]; - - #$exon_mismatch_count = $missing_exon_count + $over_exon_count; - push ( @{ $missing{ $missing_exon_count } } , $pair ); - push ( @{ $overpredicted{ $over_exon_count } } , $pair ); - push ( @{ $mismatches{ $exon_mismatch_count } }, $pair ); - - $total_missing_exon_count += $missing_exon_count; - $total_over_exon_count += $over_exon_count; - $total_exon_mismatches += $exon_mismatch_count; - $exact_matches += $thispair_exact_matches; - $exon_pair_count += $thispair_exon_pair_count; - - foreach my $key ( keys( %thispair_missing_exon_position ) ){ - $missing_exon_position{ $key } += $thispair_missing_exon_position{ $key }; - } - - foreach my $key ( keys( %thispair_over_exon_position ) ){ - $over_exon_position{ $key } += $thispair_over_exon_position{ $key }; - } - - # print out info about this pair - $self->_to_String( $pair, $printout ); - - } # end of PAIR loop - } # end of GENE loop - - - # we recover info from this transcript pair comparison - - # print out the results - print STDERR "Total number of transcript pairs: ".$pairs_count."\n"; - print STDERR "Transcripts unpaired: ".scalar( @total_ann_unpaired )." from annotation, and ". - scalar( @total_pred_unpaired )." from prediction\n"; - foreach my $tran ( @total_ann_unpaired ){ - print STDERR $tran->stable_id."\n"; - } - foreach my $tran ( @total_pred_unpaired ){ - print STDERR $tran->stable_id."\n"; - } - - print STDERR "Transcripts repeated: ".scalar( @total_ann_doubled )." from annotation, and ". - scalar( @total_pred_doubled )." from prediction\n"; - foreach my $tran ( @total_ann_doubled ){ - print STDERR $tran->stable_id."\n"; - } - foreach my $tran ( @total_pred_doubled ){ - print STDERR $tran->stable_id."\n"; - } - - print STDERR "\n"; - print STDERR "Exact matches : ".$exact_matches." out of ".$exon_pair_count."\n"; - print STDERR "Total exon mismatches : ".$total_exon_mismatches."\n"; - print STDERR "Exons missed by the prediction: ".$total_missing_exon_count."\n"; - foreach my $key ( keys( %missing_exon_position ) ){ - printf STDERR " position %5s = %2d missed\n", ($key , $missing_exon_position{$key}); - } - print STDERR "Exons overpredicted : ".$total_over_exon_count."\n"; - foreach my $key ( keys( %over_exon_position ) ){ - printf STDERR " position %2s = %2d overpredicted\n", ($key , $over_exon_position{$key}); - } - print STDERR "\n"; - - # print out the transcripts pairs with at least one exon missing - print STDERR "Exons of genes ".$message1." which are missing in genes ".$message2.":\n"; - foreach my $key ( sort { $a <=> $b } ( keys( %missing ) ) ){ - my $these_pairs = scalar( @{ $missing{$key} } ); - my $percentage = sprintf "%.2f", 100*$these_pairs/$pairs_count; - print STDERR "\n$these_pairs transcript pairs with $key exon(s) missed: $percentage percent\n"; - foreach my $pair ( @{ $missing{ $key } } ){ - print STDERR $pair->to_String."\n"; - } - } - - # print out the transcripts pairs with at least one exon overpredicted - print STDERR "Exons of genes ".$message2." which are overpredicted with respect to genes ".$message1.":\n"; - foreach my $key ( sort { $a <=> $b } ( keys( %overpredicted ) ) ){ - my $these_pairs = scalar( @{ $overpredicted{$key} } ); - my $percentage = sprintf "%.2f", 100*$these_pairs/$pairs_count; - print STDERR "\n$these_pairs transcript pairs with $key exon(s) overpredicted: $percentage percent\n"; - foreach my $pair ( @{ $overpredicted{ $key } } ){ - print STDERR $pair->to_String."\n"; - } - } - - # print out the transcripts pairs with at least one exon mismatch - print STDERR "Exons mismatches between genes ".$message1." and ".$message2.":\n"; - foreach my $key ( sort { $a <=> $b } ( keys( %mismatches ) ) ){ - my $these_pairs = scalar( @{ $mismatches{$key} } ); - my $percentage = sprintf "%.2f", 100*$these_pairs/$pairs_count; - print STDERR "\n$these_pairs transcript pairs with $key exon(s) mismatch: $percentage percent\n"; - foreach my $pair ( @{ $mismatches{ $key } } ){ - print STDERR $pair->to_String."\n"; - } - } - # we return the hash with the arrays of transcript pairs as values, and - # the number of missing exons as keys, that can be used to make a histogram - return %missing; -} - -#################################################################################### - -=head2 _match_Exons() - - Title : _match_Exons - Function: this is the actual method that tries to match exons and store the info - in hashes and then return them. This is done per Transcript Pair - Args : a pair of transcripts in the form of a TranscriptCluster object - -=cut - -sub _match_Exons{ - my ($self, $pair, $coding ) = @_; - - my $prediction_matched = 0; # ( = TruePositive ) - my $annotation_length = 0; # ( = TruePositive + FalseNegative ) - my $prediction_length = 0; # ( = TruePositive + FalsePositive ) - - my $missing_exon_count = 0; - my $over_exon_count = 0; # counts the number of overpredicted exons - my $exon_mismatch_count = 0; - my $exon_pair_count = 0; - my $exact_matches = 0; - - my %missing_exon_position; - my %over_exon_position; - - # the order is given by the order they are put into the pair in GeneCluster - my ($prediction,$annotation) = $pair->get_Transcripts; - - # get the exons - my (@ann_exons,@pred_exons); - if ($coding){ - - # get only the CDS from the exons... if there is a translation - if ( $annotation->translation ){ - @ann_exons = $annotation->translateable_exons; - } - else{ - print STDERR "transcript ".$annotation->stable_id." has no translation, skipping this pair:\n"; - print STDERR $pair->to_String."\n"; - next PAIR; - } - if ( $prediction->translation ){ - @pred_exons= $prediction->translateable_exons; - } - else{ - print STDERR "transcript ".$prediction->stable_id." has no translation, skipping this pair:\n"; - print STDERR $pair->to_String."\n"; - next PAIR; - } - } - else{ - @ann_exons = $annotation->get_all_Exons; - @pred_exons = $prediction->get_all_Exons; - } - - # compute TruePositive + FalseNegative - foreach my $exon (@ann_exons){ - $annotation_length += $exon->length; - } - - # compute TruePositive + FalsePositive - foreach my $exon (@pred_exons){ - $prediction_length += $exon->length; - } - - # order exons according to the strand - if ( $ann_exons[0]->strand == 1 ){ - @ann_exons = sort{ $a->start <=> $b->start } @ann_exons; - @pred_exons = sort{ $a->start <=> $b->start } @pred_exons; - } - if ( $ann_exons[0]->strand == -1 ){ - @ann_exons = sort{ $b->start <=> $a->start } @ann_exons; - @pred_exons = sort{ $b->start <=> $a->start } @pred_exons; - } - - # now we link the exons, but first, a bit of formatted info - print "\nComparing transcripts:\n"; - print STDERR $pair->to_String; - - my %link; - my $start=0; # start looking at the first one - my @buffer; # buffer that keeps track of the skipped exons in @exons2 - - my $sensitivity; - my $specificity; - my $printout; - my $printout_count = 1; - - $$printout{ $pair }{ 0 } = [ "annotation", "prediction", "" ]; - - # Note: variables start at ZERO, but in the print-outs we shift them to start at ONE - EXONS1: - for (my $i=0; $i<=$#ann_exons; $i++){ - my $foundlink = 0; - - EXONS2: - for (my $j=$start; $j<=$#pred_exons; $j++){ - - # compare $ann_exons[$i] with $pred_exons[$j] - if ( $ann_exons[$i]->overlaps($pred_exons[$j]) ){ - - # if you've found a link, check first whether there is anything left unmatched in @buffer - if ( @buffer && scalar(@buffer) != 0 ){ - foreach my $exon_number ( @buffer ){ - $$printout{ $pair }{ $printout_count } = ["no link",$exon_number,""]; - $printout_count++; - $over_exon_count++; - - if ( $exon_number == 1 ){ - $over_exon_position{'first'}++; - } - elsif ( $exon_number == $#pred_exons+1 ){ - $over_exon_position{'last'}++; - } - else { - $over_exon_position{ $exon_number }++; - } - } - } - $foundlink = 1; - $exon_pair_count++; - - # then check whether it is exact - if ( $ann_exons[$i]->equals( $pred_exons[$j] ) ){ - $$printout{ $pair }{ $printout_count } = [ ($i+1) , ($j+1) , "exact" ]; - $printout_count++; - - $prediction_matched += $pred_exons[$j]->length; - $exact_matches++; - } - - # or there is a mismatch in the number of bases - else{ - my $message = ''; - if ( $ann_exons[$i]->start != $pred_exons[$j]->start ){ - my $mismatch = ($ann_exons[$i]->start - $pred_exons[$j]->start); - my $absolute_mismatch = abs( $mismatch ); - my $msg; - if ( $mismatch > 0 ){ - $msg = "prediction has $absolute_mismatch extra bases in the"; - } - elsif( $mismatch < 0 ){ - $msg = "prediction misses $absolute_mismatch bases in the"; - } - if ( $ann_exons[$i]->strand == 1 ){ - $msg .= " 5' end, "; - } - elsif ( $ann_exons[$i]->strand == -1 ){ - $msg .= " 3' end, "; - } - $message .= $msg; - } - if ( $ann_exons[$i]->end != $pred_exons[$j]->end ){ - my $mismatch = ($ann_exons[$i]->end - $pred_exons[$j]->end ); - my $absolute_mismatch = abs( $mismatch ); - my $msg; - if ( $mismatch > 0 ){ - $msg = "prediction misses $absolute_mismatch bases in the"; - } - elsif( $mismatch < 0 ){ - $msg = "prediction has $absolute_mismatch extra bases in the"; - } - if ( $ann_exons[$i]->strand == 1 ){ - $msg .= " 3' end, "; - } - elsif ( $ann_exons[$i]->strand == -1 ){ - $msg .= " 5' end, "; - } - $message .= $msg; - } - $$printout{ $pair }{ $printout_count } = [ ($i+1) , ($j+1) , $message ]; - $printout_count++; - } - $start += scalar(@buffer)+1; - - # we start a new buffer - @buffer = (); - next EXONS1; - } - # if no overlap, skip this one - else { - # keep this info in a @buffer if you haven't exhausted all checks in @exons2 - if ( $j<$#pred_exons ){ - push ( @buffer, ($j+1) ); - } - # if you got to the end of @exons2 and found no link, ditch the @buffer - elsif ( $j == $#pred_exons ){ - @buffer = (); - } - # and get outta here - next EXONS2; - } - } # end of EXONS2 loop - - # found no link for $ann_exons[$i], go to the next one - if ( $foundlink == 0 ){ - $$printout{ $pair }{ $printout_count } = [ ($i+1) , "no link" , "" ]; - $printout_count++; - $missing_exon_count++; - if ( $i+1 == 1 ){ - $missing_exon_position{'first'}++; - } - elsif ( $i+1 == $#ann_exons+1 ){ - $missing_exon_position{'last'}++; - } - else { - $missing_exon_position{ $i+1 }++; - } - # see whether there is any feature we could have used - # WARNING: this method works but it is extremely slow - #my ($similarity_features, $prediction_features) = $self->_missed_exon_Evidence( $pair, $ann_exons[$i] ); - } - } # end of EXONS1 loop - - # stats - $exon_mismatch_count = $over_exon_count + $missing_exon_count; - my $missing_stats = [ $missing_exon_count , \%missing_exon_position ]; - my $over_stats = [ $over_exon_count , \%over_exon_position ]; - my $mismatch_stats = [ $exon_mismatch_count ]; - my $match_stats = [ $exon_pair_count , $exact_matches ]; - - # we return printout{ pair }{ exon_number } = [ exon/no link, exon/no link, extra comments ] - return ( $printout, $missing_stats, $over_stats, $mismatch_stats, $match_stats ); -} - -#################################################################################### - -sub input_id { my $self = shift @_; - my ($chr,$chrstart,$chrend) = @_; - if ( $chr && $chrstart && $chrend ){ - $self->{'_chr_name'} = $chr; - $self->{'_chr_start'} = $chrstart; - $self->{'_chr_end'} = $chrend; - } - return ( $self->{'_chr_name'}, $self->{'_chr_start'}, $self->{'_chr_end'} ); -} - -#################################################################################### - - -=head2 _missed_exon_Evidence - -Function: It searches for possible evidence that we could have used to predict the missed exon passed as argument - -=cut - -sub _missed_exon_Evidence{ - my ($self, $pair, $exon) = @_; - - # take the missed exon (virtual contig) coordinates - my ( $start, $end ) = ( $exon->start, $exon->end ); - - # we need to get info from the annotation database - my $dbname = $::db1_conf{'dbname'}; - my $dbuser = $::db1_conf{'user'}; - my $path = $::db1_conf{'path'}; - my $host = $::db1_conf{'host'}; - my $genetypes = $::db1_conf{'genetypes'}; # this is an arrayref - - my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor( - -host => $host, - -user => $dbuser, - -dbname => $dbname, - ); - - $db->assembly_type($path); - my $sgp = $db->get_StaticGoldenPathAdaptor; - - # get info of where we are sitting: - my ($chr,$chrstart,$chrend) = $self->input_id; - unless ( $chr && $chrstart && $chrend ){ - $self->throw( "You must provide a chr_name, chr_start and a chr_end" ); - } - - # get a virtual contig for that piece we're interested in - my ($start_pos,$end_pos) = ( $chrstart + $start, $chrstart + $end); - my $vcontig = $sgp->fetch_VirtualContig_by_chr_start_end($chr,$start_pos,$end_pos); - - # get features from that piece - my @similarity_features = $vcontig->get_all_SimilarityFeatures; - my @prediction_features = $vcontig->get_all_PredictionFeatures; - - print STDERR scalar(@similarity_features)." similarity features retrieved in this range\n"; - print STDERR scalar(@prediction_features)." prediction features retrieved in this range\n"; - - return ( \@similarity_features, \@prediction_features ); - -} - -##################################################################################### - -=head2 exon_Density() - -=cut - -sub exon_Density{ - my ($self, @transcripts) = @_; - my $sum_density; - foreach my $tran (@transcripts){ - my $exon_span; - my @exons = $tran->get_all_Exons; - @exons = sort { $a->start <=> $b->start } @exons; - my $transcript_length = $exons[$#exons]->end - $exons[0]->start; - foreach my $exon ( $tran->get_all_Exons ){ - $exon_span += $exon->length; - } - $sum_density += $exon_span/$transcript_length; - } - return $sum_density; -} - -#################################################################################### - - -=head2 _to_String(); - -=cut - -sub _to_String{ - my ($self, $pair, $printout ) = @_; - my $count = 0; - while ( $$printout{ $pair }{ $count } ){ - my $msg; - my ( $str1, $str2, $str3 ) = ( '', '', '' ); - - # exons in annotation (or 'no link' ) - $str1 = ${ $$printout{$pair}{$count} }[0]; - - # exons in prediction (or 'no link' ) - $str2 = ${ $$printout{$pair}{$count} }[1]; - - # comments: exact, mismatch bases - $str3 = ${ $$printout{$pair}{$count} }[2]; - - if ( $count == 0 ){ - $msg = sprintf "%10s %-2s %s\n", ( $str1, $str2, $str3 ); - } - else{ - $msg = sprintf "%8s <----> %-2s %s\n", ( $str1, $str2, $str3 ); - } - print STDERR $msg; - $count++; - } -} - -#################################################################################### - -=head2 get_Exon_Statistics() - - This method produces an histogram with the number of exon pairs per percentage overlap. - The percentage overlap of two exons, say e1 and e2, is calculated in _compare_Transcripts method as - 100*intersection(e1,e2)/max_length(e1,e2). It takes whole exons, i.e. without chopping out the - non-coding part. This method returns a hash containing the number of occurences as values and - the integer percentage overlap as keys - -=cut - -sub get_Exon_Statistics{ - - my ($self,$array1,$array2) = @_; - my (@transcripts1,@transcripts2); - my %stats; - if ($array1 && $array2){ # we can pass two arrays references (to transcripts) as argument - @transcripts1 = @{$array1}; - @transcripts2 = @{$array2}; - - } - else{ # or else read the transcripts from the gene_array data fields - my @genes1 = @{ $self->{'_gene_array1'} }; - foreach my $gene (@genes1){ - my @more_transcripts = $gene->each_Transcript; - push ( @transcripts1, @more_transcripts ); - } - my @genes2 = @{ $self->{'_gene_array2'} }; - foreach my $gene (@genes2){ - my @more_transcripts = $gene->each_Transcript; - push ( @transcripts2, @more_transcripts ); - } - } - foreach my $t1 (@transcripts1){ - foreach my $t2 (@transcripts2){ - if ( _compare_Transcripts($t1,$t2) ){ - my %partial_stats = _exon_Statistics($t1,$t2); # produces the stats for the current two transcripts - # The hash holds number of occurences as values and - # integer percentage overlap as keys - foreach my $k ( keys %partial_stats ) { - $stats{$k} += $partial_stats{$k}; # Here it adds up to the overall value - } - } - } - } - return %stats; -} - -######################################################################### - -=head2 _exon_Statistics() - -This internal function reads two transcripts and calculates the percentage of overlap -between their exons = (INTERSECT($exon1,$exon2))/MAX($exon1,$exon2) - -=cut - -sub _exon_Statistics { - my ($transcript1,$transcript2) = @_; - my @exons1 = $transcript1->get_all_Exons; # transcripts get their exons in order - my @exons2 = $transcript2->get_all_Exons; - - my %stats; - - foreach my $exon1 (@exons1){ - - foreach my $exon2 (@exons2){ - - if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){ - - # calculate the percentage of exon overlap = (INTERSECT($exon1,$exon2))/MAX($exon1,$exon2) - my $max; - if ( ($exon1->length) < ($exon2->length) ){ - $max = $exon2->length; - } - else{ - $max = $exon1->length; - } - # compute the overlap extent - - my ($s,$e); # start and end coord of the overlap - my ($s1,$e1) = ($exon1->start,$exon1->end); - my ($s2,$e2)= ($exon2->start,$exon2->end); - if ($s1<=$s2 && $s2<$e1){ - $s=$s2; - } - if ($s1>=$s2 && $e2>$s1){ - $s=$s1; - } - if ($e1<=$e2 && $e1>$s2){ - $e=$e1; - } - if ($e2<=$e1 && $e2>$s1){ - $e=$e2; - } - my $common = ($e - $s + 1); - my $percent = int( (100*$common)/$max ); - $stats{$percent}++; - } - } - } - return %stats; -} - -#################################################################################### - -=head2 get_Coding_Exon_Statistics() - - The same aim as the get_Exon_Statistics method but chopping the non-coding part from the exons. - This method returns a hash containing the number of occurences as values and - the integer percentage overlap as keys - -=cut - -sub get_Coding_Exon_Statistics{ - - my ($self,$array1,$array2) = @_; - my (@transcripts1,@transcripts2); - my %stats; - if ($array1 && $array2){ # we can pass two arrays references of transcripts as argument - @transcripts1 = @{$array1}; - @transcripts2 = @{$array2}; - - } - else{ # or else read the transcripts from the gene_array data fields - my @genes1 = @{ $self->{'_gene_array1'} }; - foreach my $gene (@genes1){ - my @more_transcripts = $gene->each_Transcript; - push ( @transcripts1, @more_transcripts ); - } - my @genes2 = @{ $self->{'_gene_array2'} }; - foreach my $gene (@genes2){ - my @more_transcripts = $gene->each_Transcript; - push ( @transcripts2, @more_transcripts ); - } - } - # in order to get info about the non-coding regions of the transcripts we have - # to get translation objects for them, however, some of them do not have it defined. - # In those cases we just ignore the comparison. - - foreach my $t1 (@transcripts1){ - foreach my $t2 (@transcripts2){ - if ($t1->translation && $t2->translation){ - my %partial_stats = _coding_Exon_Statistics($t1,$t2); - foreach my $k ( keys %partial_stats ) { - $stats{$k} += $partial_stats{$k}; - } - } - else{ - print "Transcript without translation:\n"; - if (!$t1->translation){ - print $t1->stable_id."\n"; - } - if (!$t2->translation){ - print $t2->stable_id."\n"; - } - print "\n"; - } - } - } - return %stats; -} - -#################################################################################### - -=head2 _coding_Exon_Statistics() - -This internal function reads two transcripts and calculates the percentage -of overlap between the coding exons = (INTERSECT($exon1,$exon2))/MAX($exon1,$exon2) - -=cut - -sub _coding_Exon_Statistics { - - # the coding region may start in any exon, not necessarily the first one - # and may end in any one as well - my ($transcript1,$transcript2) = @_; - - my @exons1 = $transcript1->get_all_Exons; - my @exons2 = $transcript2->get_all_Exons; - - my $translation1 = $transcript1->translation; - my $translation2 = $transcript2->translation; - - - # IDs of the exons where the coding region starts and ends - my ($s_id1,$e_id1) = ($translation1->start_exon_id,$translation1->end_exon_id); - my ($s_id2,$e_id2) = ($translation2->start_exon_id,$translation2->end_exon_id); - - - # identify those exons in each transcript - my ($s_exon1,$e_exon1); # these will be the exons where the coding region (starts,ends) in the 1st transcript - - foreach my $exon1 (@exons1){ - if ($exon1->dbID eq $s_id1){ - $s_exon1 = $exon1; - } - if ($exon1->dbID eq $e_id1){ - $e_exon1 = $exon1; - } - } - my ($s_exon2,$e_exon2); # these will be the exons where the coding region (starts,ends) in the 2nd transcript - foreach my $exon2 (@exons2){ - if ($exon2->dbID eq $s_id2){ - $s_exon2 = $exon2; - } - if ($exon2->dbID eq $e_id2){ - $e_exon2 = $exon2; - } - } -print "Exon of 2ndt transcript\n"; -foreach my $e2 (@exons2){ -print "Exon ".$e2->strand." ".$e2->start." ".$e2->end."\n"; -} - - - # take these exons and those in between these exons (since only these exons contain coding region) - my @coding_exons1; - - foreach my $exon1 (@exons1){ - if ($exon1->strand eq 1){ - if ( $exon1->start >= $s_exon1->start && $exon1->start <= $e_exon1->end ){ - push ( @coding_exons1, $exon1 ); - } - }else{ - if ( $exon1->start >= $e_exon1->end && $exon1->start <= $s_exon1->start ){ - push ( @coding_exons1, $exon1 ); - } - } - } -print "coding region start: ".$s_exon2->start." end: ".$e_exon2->end."\n"; - - - - my @coding_exons2; - - - foreach my $exon2 (@exons2){ - if ($exon2->strand eq 1){ - if ( $exon2->start >= $s_exon2->start && $exon2->start <= $e_exon2->end ){ - push ( @coding_exons2, $exon2 ); - } - }else{ - if ( $exon2->start >= $e_exon2->end && $exon2->start <= $s_exon2->start ){ - push ( @coding_exons2, $exon2 ); - } - } - } - - @exons1 = @coding_exons1; - @exons2 = @coding_exons2; - - # start and end of coding regions - # relative to the origin of the first and last exons where the coding region starts - my ($s_code1,$e_code1) = ($translation1->start,$translation1->end); - my ($s_code2,$e_code2) = ($translation2->start,$translation2->end); - - # here I hold my stats results - my %stats; - - foreach my $exon1 (@exons1){ - - foreach my $exon2 (@exons2){ - - if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){ - # start and end coord of the overlap - my ($s,$e)=(0,0); - - # exon positions - my ($s1,$e1) = ($exon1->start,$exon1->end); - my ($s2,$e2)= ($exon2->start,$exon2->end); - - #exon lengths - my ($l1,$l2) = ($exon1->length, $exon2->length); - - # One has to chop off the non-coding part out of the first and last exons in - # the coding region : ($s_id1,$e_id1) and ($s_id2,$e_id2) - # Recall that ($s_code1,$e_code1) and ($s_code2,$e_code2) are the start/end of the coding regionS - - my ($fs1,$fe1,$fs2,$fe2)=(0,0,0,0); # put a flag on the first and last exons to print them out - - if ($exon1->dbID eq $s_id1){ - $s1 = $s1 + $s_code1 - 1; - $l1 = $e1 - $s1 + 1; - $fs1=1; - } - if ($exon1->dbID eq $e_id1){ - $e1 = $s1 + $e_code1 - 1; - $l1 = $e1 - $s1 + 1; - $fe1=1; - } - if ($exon2->dbID eq $s_id2){ - $s2 = $s2 + $s_code2 - 1; - $l2 = $e2 - $s2 + 1; - $fs2=1; - } - if ($exon2->dbID eq $e_id2){ - $e2 = $s2 + $e_code2 - 1; - $l2 = $e2 - $s2 + 1; - $fe2=1; - } - - # calculate the percentage of exon overlap = (INTERSECT($exon1,$exon2))/MAX($exon1,$exon2) - my $max; - if ( $l1 < $l2 ){ - $max = $l2; - } - else{ - $max = $l1; - } - - # compute the overlap extent - if ($s1<=$s2 && $s2<$e1){ - $s=$s2; - } - if ($s1>=$s2 && $e2>$s1){ - $s=$s1; - } - if ($e1<=$e2 && $e1>$s2){ - $e=$e1; - } - if ($e2<=$e1 && $e2>$s1){ - $e=$e2; - } - my $common = ($e - $s + 1); - if ($common != 0){ # we check since after cutting the non-coding piece we might lose the overlap - my $percent = int( (100*$common)/$max ); - $stats{$percent}++; - if ($fs1 || $fs2 ) { - print "(".$s1.",".$e1.")"; - if ($fs1){ - print "-> start coding exon"; - } - print "\t".$exon1->stable_id."\n"; - - print "(".$s2.",".$e2.")"; - if ($fs2) { - print "-> start coding exon"; - } - print "\t".$exon2->stable_id."\n"; - } - if ( $fe1 || $fe2 ) { - print "(".$s1.",".$e1.")"; - if ($fe1){ - print "-> end coding exon"; - } - print "\t".$exon1->stable_id."\n"; - - print "(".$s2.",".$e2.")"; - if ($fe2) { print "-> end coding exon"; - } - print "\t".$exon2->stable_id."\n"; - } - if ($fs1 || $fs2 || $fe1 || $fe2){ print "(".$s.",".$e.") Overlap --> ".$percent."\n\n";} - } - } - } - } - return %stats; -} - - -#################################################################################### - -=head2 get_unmatched_Genes() - - This function returns those genes that have not been identified with any other gene in the - two arrays (of type1 and type2) passed to GeneComparison->new(). If we have clustered the genes - before, then we have probably filled the array $self->{'_unclustered_genes'}, so we can read out the - unmatched genes from there. If not, it derives the unmatched genes - directly from the gene_arrays passed to the GeneComaprison object. - - It returns two references to two arrays of GeneCluster objects. - The first one corresponds to the genes of type1 which have not been identified in type2, - and the second one holds the genes of type2 that have not been - identified with any of the genes of type1. - -=cut - -sub get_unmatched_Genes { - my $self = shift @_; - - # if we have already the unclustered genes, we can read them out from $self->{'_unclustered_genes'} - if ($self->unclustered ){ - my @types1 = @{ $self->{'_type_array1'} }; - my @types2 = @{ $self->{'_type_array2'} }; - my @unclustered = $self->unclustered; - my (@array1,@array2); - foreach my $cluster (@unclustered){ - my $gene = ${ $cluster->get_Genes }[0]; - foreach my $type1 ( @{ $self->{'_type_array1'} } ){ - if ($gene->type eq $type1){ - push ( @array1, $gene ); - } - } - foreach my $type2 ( @{ $self->{'_type_array2'} } ){ - if ($gene->type eq $type2){ - push ( @array2, $gene ); - } - } - } - return (\@array1,\@array2); - } - - # if not, we can compute them directly - my (%found1,%found2); - - foreach my $gene1 ( @{ $self->{'_gene_array1'} } ){ - foreach my $gene2 ( @{ $self->{'_gene_array2'} } ){ - if ( _compare_Genes($gene1,$gene2)){ - $found1{$gene1->stable_id} = 1; - $found2{$gene2->stable_id} = 1; - } - } - } - my @unmatched1; - foreach my $gene1 ( @{ $self->{'_gene_array1'} } ){ - unless ( $found1{$gene1->stable_id} ){ - my $new_cluster = Bio::EnsEMBL::Utils::GeneCluster->new(); - $new_cluster->gene_Types($self->gene_Types); - $new_cluster->put_Genes($gene1); - push (@unmatched1, $new_cluster); - } - } - my @unmatched2; - foreach my $gene2 ( @{ $self->{'_gene_array2'} } ){ - unless ( $found2{$gene2->stable_id} ){ - my $new_cluster = Bio::EnsEMBL::Utils::GeneCluster->new(); - $new_cluster->gene_Types($self->gene_Types); - $new_cluster->put_Genes($gene2); - push (@unmatched2, $new_cluster); - } - } - return (\@unmatched1,\@unmatched2); -} - -######################################################################### - -=head2 get_fragmented_Genes() - -it returns an array of GeneCluster objects, where these objects contain -fragmented genes: genes of a given type which are overlapped with more than one -gene of the other type. In order to avoid repetition of work, if a gene-clustering -has been already performed, one can pass the array of clusters as an argument. -This, however, is only allowed if _type_array1 and _type_array2 are defined, since the method -makes use of them. - -=cut - -sub get_fragmented_Genes { - my ($self,@array) = @_; - my @clusters; - my @fragmented; - - if (@array && $self->{'_type_array1'} && $self->{'_type_array2'} ){ - @clusters = @array; - } - else{ - @clusters = $self->cluster_Genes; - } - - foreach my $cluster (@clusters){ - - my @genes = $cluster->get_Genes; - my (@type1,@type2); - - foreach my $gene ( @genes ){ - - my $type = $gene->type; - push( @type1, grep /$type/, @{ $self->{'_type_array1'} } ); - push( @type2, grep /$type/, @{ $self->{'_type_array2'} } ); - # @type1 and @type2 hold all the occurrences of gene-types 1 and 2, respectively - if ( ( @type1 && scalar(@type1)>1 ) || ( @type2 && scalar(@type2) >1 ) ) { - push (@fragmented, $cluster); - } - } - } - return @fragmented; -} - -######################################################################### - - -=head2 get_3prime_overlaps() - -=cut - -######################################################################### - -=head2 get_5prime_overlaps() - -=cut - -######################################################################### - - -=head2 _compare_Genes() - - Title: _compare_Genes - Usage: this internal function compares the exons of two genes on overlap - -=cut - -sub _compare_Genes { - my ($gene1,$gene2) = @_; - my @exons1 = $gene1->get_all_Exons; - my @exons2 = $gene2->get_all_Exons; - - foreach my $exon1 (@exons1){ - - foreach my $exon2 (@exons2){ - - if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){ - return 1; - } - } - } - return 0; -} -######################################################################### - - - -=head2 _compare_Transcripts() - - Title: _compare_Transcripts() - Usage: this internal function compares the exons of two transcripts according to overlap - and returns the number of overlaps -=cut - -sub _compare_Transcripts { - my ($transcript1,$transcript2) = @_; - my @exons1 = $transcript1->get_all_Exons; - my @exons2 = $transcript2->get_all_Exons; - my $overlaps = 0; - - foreach my $exon1 (@exons1){ - - foreach my $exon2 (@exons2){ - - if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){ - $overlaps++; - } - } - } - return $overlaps; # we keep track of the number of overlaps to be able to choose the best match -} - -######################################################################### - -=head2 unclustered_Genes() - - Title : unclustered_Genes() - Usage : This function stores and returns an array of GeneClusters with only one gene (unclustered) - Args : a Bio::EnsEMBL::Utils::GeneCluster object - Returns: an array of Bio::EnsEMBL::Utils::GeneCluster objects - -=cut - -sub unclustered_Genes{ - my ($self, @unclustered) = @_; - - if (@unclustered) - { - $self->throw("Input @unclustered is not a Bio::EnsEMBL::Utils::GeneCluster\n") - unless $unclustered[0]->isa('Bio::EnsEMBL::Utils::GeneCluster'); - - push ( @{ $self->{'_unclustered_genes'} }, @unclustered); - } - return @{ $self->{'_unclustered_genes'} }; -} - -######################################################################### - -=head2 gene_Clusters() - - Title: gene_Clusters() - Usage: This function stores and returns an array of clusters - -=cut -sub gene_Clusters { - my ($self, @clusters) = @_; - - if (@clusters) - { - push (@{$self->{'_gene_clusters'}}, @clusters); - } - return @{$self->{'_gene_clusters'}}; -} - -######################################################################### - -=head2 transcript_Clusters() - - Title: transcript_Clusters() - Usage: This function stores and returns an array of transcript_Clusters - -=cut -sub transcript_Clusters { - my ($self, @clusters) = @_; - - if (@clusters) - { - push (@{$self->{'_transcript_clusters'}}, @clusters); - } - return @{$self->{'_transcript_clusters'}}; -} -######################################################################### - -=head2 flush_transcript_Clusters() - - Usage: This function cleans up the array in $self->{'_transcript_clusters'} - -=cut - -sub flush_transcript_Clusters { - my ($self) = @_; - $self->{'_transcript_clusters'} = []; -} - - -######################################################################### - -=head2 flush_gene_Clusters() - - Usage: This function cleans up the array in $self->{'_gene_clusters'} - -=cut - -sub flush_gene_Clusters { - my ($self) = @_; - $self->{'_gene_clusters'} = []; -} - -##################################################################################### - - - -1; - diff --git a/modules/Bio/EnsEMBL/Utils/GeneComparison_conf.pl b/modules/Bio/EnsEMBL/Utils/GeneComparison_conf.pl deleted file mode 100644 index df9998065b..0000000000 --- a/modules/Bio/EnsEMBL/Utils/GeneComparison_conf.pl +++ /dev/null @@ -1,50 +0,0 @@ -# configuration file for the gene_comparison_script.pl -# based on the config files for the pipeline -# -# Written by Eduardo Eyras -# eae@sanger.ac.uk - -# Databases in 120 schema (schema as for the 120 release) -# -# ensembl110_new_schema@ensrv3, path_type='UCSC', genetype = '1' -# -# homo_sapiens_core_120@ensrv3, path_type='UCSC', genetype='ensembl' -# -# chr20_120@ecs1b, path_type='Sanger_02', genetypes='HUMACE-Novel_CDS','HUMACE-Known',etc... -# - - -BEGIN { -package main; - -%db1_conf = ( - 'host' => "ecs1b", - - 'dbname' => "chr20_120", - - 'path' => "Sanger_02", - - 'user' => "ensro", - - # genetypes should be array_ref, to allow for multiple types - 'genetypes' => ["HUMACE-Novel_CDS","HUMACE-Known"], - -); - - -%db2_conf = ( - 'host' => "ensrv3", - - 'dbname' => "homo_sapiens_core_120", - - 'path' => "UCSC", - - 'user' => "ensro", - - 'genetypes' => ["ensembl"], - -); - -} - -1; diff --git a/modules/Bio/EnsEMBL/Utils/Report.pm b/modules/Bio/EnsEMBL/Utils/Report.pm deleted file mode 100644 index 23e47b979c..0000000000 --- a/modules/Bio/EnsEMBL/Utils/Report.pm +++ /dev/null @@ -1,183 +0,0 @@ -=head1 NAME - Bio::EnsEMBL::Utils::Report.pm - -=head1 DESCRIPTION - -=head1 SYNOPSIS - -=head1 CONTACT - -eae@sanger.ac.uk - -=cut - -# Let the code begin ... - -package Bio::EnsEMBL::Utils::Report; - -use vars qw(@ISA); -use strict; -use diagnostics; - -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::Utils::GeneCluster; -use Bio::EnsEMBL::Utils::TranscriptCluster; -use Bio::EnsEMBL::Utils::GeneComparison; -use Bio::EnsEMBL::Root; - -@ISA = qw(Bio::EnsEMBL::Root); - -#################################################################################### - -=head2 new() - -=cut - -sub new { - - my ($class) = @_; - - if (ref($class)){ - $class = ref($class); - } - my $self = {}; - bless($self,$class); - return $self; -} - -#################################################################################### - -=head2 header() - -get/set method for a string header - -=cut - -sub header{ - my ($self,$header) = @_; - if ($header){ - $self->{'_header'} = $header; - } - return $self->{'_header'}; -} - - -#################################################################################### - -=head2 histogram() - -get/set method for a hash, aimed to be used to create histograms - -=cut - -sub histogram{ - my ($self,$histogram) = @_; - if ($histogram){ - $self->{'_histogram'} = $histogram; - } - return $self->{'_histogram'} = $histogram; -} - -#################################################################################### - -=head2 list() - -get/set method for an array of arrays, aimed to be used to print out lists - -=cut - -sub list{ - my ($self,$list) = @_; - - # $list is a reference to an array, whose elementes are arrays - if ($list){ - $self->{'_list'} = $list; - } - return $self->{'_list'}; -} - -#################################################################################### - -=head2 to_plainTXT() - -sends the info in the containers {'_list'}, {'_histogram'}, ... to plain text output -to file passed as argument, or as default, to SDTOUT - -=cut - -sub to_plainTXT(){ - my ($self,$filename) = @_; - - my $title = $self->title; - my %histogram = %{ $self->histogram }; - my @list = @{ $self->list }; - - # here we should be able to choose where to send the output - # define OUTPUT handle - #if ($filename){ - # open OUT,">$filename"; - #} - #else{ - # OUT = STDOUT; - #} - print STDOUT $title."\n"; - foreach my $sublist ( @list ){ - foreach my $item ( @$sublist ){ - print STDOUT $item."\t"; - } - print STDOUT "\n"; - } - - foreach my $key ( keys(%histogram) ){ - print STDOUT $key." ---> ".$histogram{$key}."\n"; - } -} - -#################################################################################### - -=head2 histogram_to_R() - -prints out the {'_histogram'} in format to be read by R and make a histogram - -=cut - -sub histogram_to_R(){ - ($self, $filename) = @_; - %histogram = $self->histogram; - - print STDERR "printing histogram to file $filename...\n"; - - # open filehandle - my $opening = open OUT, ">$filename";; - unless ($opening){ - $self->throw("Can't open file $filename"); - } - - # here we process the contents of %histogram and put it into $filename - foreach my $key ( keys(%histogram) ){ - - # ... - # put here the right format - print OUT $key."\t".$histogram{$key}."\n"; - - } - - # close filehandle - my $closing = close OUT; - unless ($closing){ - $self->throw("Can't close file $filename"); - } -} - -#################################################################################### - -=head2 to_HTML - -sends the info in the containers {'_list'}, {'_histogram'}, ... to plain html format output - -=cut - -sub to_HTML{ - my ($self,$filename) = @_; -} - - diff --git a/modules/Bio/EnsEMBL/Utils/TranscriptCluster.pm b/modules/Bio/EnsEMBL/Utils/TranscriptCluster.pm deleted file mode 100644 index d50a1acb59..0000000000 --- a/modules/Bio/EnsEMBL/Utils/TranscriptCluster.pm +++ /dev/null @@ -1,560 +0,0 @@ -=head1 NAME - -TranscriptCluster - -=head1 SYNOPSIS - - -=head1 DESCRIPTION - -This object holds one or more transcripts which have been clustered according to -comparison criteria external to this class (for instance, in the -method _compare_Transcripts of the class GeneComparison). -Each TranscriptCluster object holds the IDs of the transcripts clustered and the beginning and end coordinates -of each one (taken from the start and end coordinates of the first and last exon in the correspondig -get_all_Exons array) - -It inherits from Bio::EnsEMBL::Root and Bio::RangeI. A TranscriptCluster is a range in the sense that it convers -a defined extent of genomic sequence. It is also possible to check whether two clusters overlap (in range), -is included into another cluster, etc... - -=head1 CONTACT - -eae@sanger.ac.uk - -=cut - -# Let the code begin ... - -package Bio::EnsEMBL::Utils::TranscriptCluster; -use Bio::EnsEMBL::Transcript; -use vars qw(@ISA); -use strict; - -use Bio::EnsEMBL::Gene; -use Bio::EnsEMBL::Root; -use Bio::RangeI; - -@ISA = qw(Bio::EnsEMBL::Root,Bio::RangeI); - -=head1 METHODS - -=cut - -######################################################################### - - -=head2 new() - -new() initializes the attributes: -_transcript_array -_transcriptID_array -_start -_end - -=cut - -sub new { - my ($class,$whatever)=@_; - - if (ref($class)){ - $class = ref($class); - } - my $self = {}; - bless($self,$class); - - if ($whatever){ - $self->throw( "Can't pass an object to new() method. Use put_Genes() to include Bio::EnsEMBL::Gene in cluster"); - } - - return $self; -} -######################################################################### - -=head1 Range-like methods - -Methods start and end are typical for a range. We also implement the boolean -and geometrical methods for a range. - -=head2 start() - - Title : start - Usage : $start = $transcript_cluster->end(); - Function: get/set the start of the range covered by the cluster. This is re-calculated and set everytime - a new transcript is added to the cluster - Returns : a number - Args : optionaly allows the start to be set - -=cut - -sub start{ - my ($self,$start) = @_; - if ($start){ - $self->throw( "$start is not an integer") unless $start =~/^[-+]?\d+$/; - $self->{'_start'} = $start; - } - return $self->{'_start'}; -} - -############################################################ - -=head2 end() - - Title : end - Usage : $end = $transcript_cluster->end(); - Function: get/set the end of the range covered by the cluster. This is re-calculated and set everytime - a new transcript is added to the cluster - Returns : the end of this range - Args : optionaly allows the end to be set - : using $range->end($end -=cut - -sub end{ - my ($self,$end) = @_; - if ($end){ - $self->throw( "$end is not an integer") unless $end =~/^[-+]?\d+$/; - $self->{'_end'} = $end; - } - return $self->{'_end'}; -} - -############################################################ - -=head2 length - - Title : length - Usage : $length = $range->length(); - Function: get/set the length of this range - Returns : the length of this range - Args : optionaly allows the length to be set - : using $range->length($length) - -=cut - -sub length{ - my $self = shift @_; - if (@_){ - $self->confess( ref($self)."->length() is read-only"); - } - return ( $self->{'_end'} - $self->{'_start'} + 1 ); -} - -############################################################ - -=head2 strand - - Title : strand - Usage : $strand = $transcript->strand(); - Function: get/set the strand of the transcripts in the cluster. - The strand is set in put_Transcripts when the first transcript is added to the cluster, in that - that method there is also a check for strand consistency everytime a new transcript is added - Returns : the strandidness (-1, 0, +1) - Args : optionaly allows the strand to be set - -=cut - -sub strand{ - my ($self,$strand) = @_; - if ($strand){ - $self->{'_strand'} = $strand; - } - return $self->{'_strand'}; -} - - -############################################################ - -=head1 Boolean Methods - -These methods return true or false. They throw an error if start and end are -not defined. They are implemented in Bio::RangeI. - - $cluster->overlaps($other_cluster) && print "Clusters overlap\n"; - -=head2 overlaps - - Title : overlaps - Usage : if($cluster1->overlaps($cluster)) { do stuff } - Function: tests if $cluster2 overlaps $cluster1 overlaps in the sense of genomic-range overlap, - it does NOT test for exon overlap. - Args : arg #1 = a range to compare this one to (mandatory) - arg #2 = strand option ('strong', 'weak', 'ignore') - Returns : true if the clusters overlap, false otherwise - -=cut - -=head2 contains - - Title : contains - Usage : if($cluster1->contains($cluster2) { do stuff } - Function: tests whether $cluster1 totally contains $cluster2 - Args : arg #1 = a range to compare this one to (mandatory) - alternatively, integer scalar to test - arg #2 = strand option ('strong', 'weak', 'ignore') (optional) - Returns : true if the argument is totaly contained within this range - -=cut - -=head2 equals - - Title : equals - Usage : if($cluster1->equals($cluster2)) - Function: test whether the range covered by $cluster1 has the same start, end, length as the range - covered by $cluster2 - Args : a range to test for equality - Returns : true if they are describing the same range - -=cut - -=head1 Geometrical methods - -These methods do things to the geometry of ranges, and return -Bio::RangeI compliant objects or triplets (start, stop, strand) from -which new ranges could be built. They are implemented here and not in Bio::RangeI, since we -want them to return a new TranscriptCluster object. - -=head2 overlap_extent - - Title : overlap_extent - Usage : ($a_unique,$common,$b_unique) = $a->overlap_extent($b) - Function: Provides actual amount of overlap between two different - ranges. Implemented already in RangeI - Example : - Returns : array of values for - - the amount unique to a - - the amount common to both - - the amount unique to b - Args : - -=cut - -######################################################################### - -=head2 intersection - - Title : intersection - Usage : $intersection_cluster = $cluster1->intersection($cluster2) - Function: gives a cluster with the transcripts which fall entirely within the intersecting range of - $cluster1 and $cluster2 - Args : arg #1 = a cluster to compare this one to (mandatory) - arg #2 = strand option ('strong', 'weak', 'ignore') (not implemented here) - Returns : a TranscriptCluster object, which is empty if the intersection does not contain - any transcript -=cut - -sub intersection{ -my ($self, $cluster) = @_; - -# if either is empty, return an empty cluster -if ( scalar( $self->get_Transcripts) == 0 - $self->warn( "cluster $self is empty, returning an empty TranscriptCluster"); - my $empty_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new(); - return $empty_cluster; -} - -if ( scalar( $cluster->get_Transcripts ) == 0 ){ - $self->warn( "cluster $cluster is empty, returning an empty TranscriptCluster"); - my $empty_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new(); - return $empty_cluster; -} - -my @transcripts = $self->get_Transcripts; -push( @transcripts, $cluster->get_Transcripts); - -# make an unique list of transcripts, in case they are repeated -my %list; -foreach my $transcript (@transcripts){ - $list{$transcript} = $transcript; -} -@transcripts = values( %list ); - -my ($inter_start,$inter_end); -my ($start1,$end1) = ($self->start , $self->end); -my ($start2,$end2) = ($cluster->start,$cluster->end); - -my $strand = $cluster->strand; - -# if clusters overlap, calculate the intersection -if ( $self->overlaps( $cluster ) ){ - if ( $start2 >= $start1 && $end2 >= $end1 ){ - if ( $strand == 1){ - $inter_start = $start2; - $inter_end = $end1; - } - else{ - $inter_start = $start1; - $inter_end = $end2; - } - } - if ( $start2 >= $start1 && $end2 < $end1){ - if ( $strand == 1){ - $inter_start = $start2; - $inter_end = $end2; - } - else{ - $inter_start = $start1; - $inter_end = $end1; - } - } - if ( $start2 < $start1 && $end2 < $end1 ){ - if ( $strand == 1){ - $inter_start = $start1; - $inter_end = $end2; - } - else{ - $inter_start = $start2; - $inter_end = $end1; - } - } - if ( $start2 < $start1 && $end2 >= $end1 ){ - if ( $strand == 1){ - $inter_start = $start1; - $inter_end = $end1; - } - else{ - $inter_start = $start2; - $inter_end = $end2; - } - } -} -else{ - $self->warn( "clusters $self and $cluster do not intersect range-wise, returning an empty TranscriptCluster"); - my $empty_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new(); - return $empty_cluster; -} - -my $inter_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new(); -my @inter_transcripts; - -# see whether any transcript falls within this intersection -foreach my $transcript ( @transcripts ){ - my ($start,$end) = ($self->_get_start($transcript), $self->_get_end($transcript)); - if ($strand == 1 && $start >= $inter_start && $end <= $inter_end ){ - $inter_cluster->put_Transcripts( $transcript ); - } - elsif ( $strand == -1 && $start <= $inter_start && $end >= $inter_end ){ - $inter_cluster->put_Transcripts( $transcript ); - } -} - -if ( scalar( $inter_cluster->get_Transcripts ) == 0 ){ - $self->warn( "cluster $inter_cluster is empty, returning an empty TranscriptCluster"); - return $inter_cluster; -} -else{ - -return $inter_cluster; - -} - - -############################################################ - -=head2 union - - Title : union - Usage : $union_cluster = $cluster1->union(@clusters); - Function: returns the union of clusters - Args : a TranscriptCluster or list of TranscriptClusters to find the union of - Returns : the TranscriptCluster object containing all of the ranges - -=cut - -sub union{ -my ($self,@clusters) = @_; - -if ( ref($self) ){ - unshift @clusters, $self; -} - -my $union_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new(); -my $union_strand; - -foreach my $cluster (@clusters){ - unless ($union_strand){ - $union_strand = $cluster->strand; - } - unless ( $cluster->strand == $union_strand){ - $self->warn("You're making the union of clusters in opposite strands"); - } - $union_cluster->put_Transcripts($cluster->get_Transcripts); -} - -return $union_cluster; - -} - -############################################################ - -=head2 put_Transcripts() - - function to include one or more transcripts in the cluster. - Useful when creating a cluster. It takes as argument an array of transcripts, it returns nothing. - -=cut - -sub put_Transcripts { - my ($self, @new_transcripts)= @_; - - if ( !$new_transcripts[0]->isa('Bio::EnsEMBL::Transcript') ){ - $self->throw( "Can't accept a [ $new_transcripts[0] ] instead of a Bio::EnsEMBL::Transcript"); - } - - my @starts = sort { $a <=> $b } map( { $self->_get_start($_) } @new_transcripts ); - my @ends = sort { $a <=> $b } map( { $self->_get_end($_) } @new_transcripts ); - - # check strand consistency among transcripts - foreach my $transcript (@new_transcripts){ - my @exons = $transcript->get_all_Exons; - unless ( $self->strand ){ - $self->strand( $exons[0]->strand ); - } - if ( $self->strand != $exons[0]->strand ){ - $self->warn( "You're trying to put $tran in a cluster of opposite strand"); - } - } - - # if start is not defined, set it - unless ( $self->start ){ - if ( $self->strand == 1 ){ - $self->start( $starts[0] ); - } - if ( $self->strand == -1 ){ - $self->start( $starts[$#starts] ); - } - } - - # if end is not defined, set it - unless ( $self->end ){ - if ( $self->strand == 1 ){ - $self->end( $ends[$#ends]); - } - if ( $self->strand == -1 ){ - $self->end( $ends[0] ); - } - } - - # extend start and end if necessary as we include more transcripts - if ( $self->strand == 1 && $starts[0] < $self->start ){ - $self->start( $starts[0] ); - } - if ( $self->strand == -1 && $starts[$#starts] > $self->start ){ - $self->start( $starts[$#starts] ); - } - if ( $self->strand == 1 && $ends[$#ends] > $self->end ){ - $self->end( $ends[$#ends] ); - } - if ( $self->strand == -1 && $ends[0] < $self->end ){ - $self->end( $ends[0] ); - } - - push ( @{ $self->{'_transcript_array'} }, @new_transcripts ); - -} - -######################################################################### - -=head2 get_Transcripts() - - it returns the array of transcripts in the GeneCluster object - -=cut - -sub get_Transcripts { - my $self = shift @_; - my @transcripts = @{ $self->{'_transcript_array'} }; - return @transcripts; -} - -######################################################################### - -=head2 to_String() - - it returns a string containing the information about the transcripts in the TranscriptCluster object - -=cut - -sub to_String { - my $self = shift @_; - my $data=''; - foreach my $tran ( @{ $self->{'_transcript_array'} } ){ - my @exons = $tran->get_all_Exons; - - $data .= sprintf "Id: %-16s" , $tran->stable_id; - $data .= sprintf "Contig: %-21s" , $exons[0]->contig->id; - $data .= sprintf "Exons: %-3d" , scalar(@exons); - $data .= sprintf "Start: %-9d" , $self->_get_start($tran); - $data .= sprintf "End: %-9d" , $self->_get_end ($tran); - $data .= sprintf "Strand: %-3d" , $exons[0]->strand; - $data .= sprintf "Exon-density: %3.2f\n", $self->exon_Density($tran); - } - return $data; -} - -######################################################################### - -sub exon_Density{ - my ($self, $transcript) = @_; - my $density; - my $exon_span; - my @exons = $transcript->get_all_Exons; - @exons = sort { $a->start <=> $b->start } @exons; - my $transcript_length = $exons[$#exons]->end - $exons[0]->start; - foreach my $exon ( @exons ){ - $exon_span += $exon->length; - } - $density = $exon_span/$transcript_length; - return $density; -} - -######################################################################### - -=head2 _get_start() - - function to get the start position of a transcript - it reads the Bio::EnsEMBL::Transcript - object and it returns the start position of the first exon - -=cut - -sub _get_start { - my ($self,$transcript) = @_; - my @exons = $transcript->get_all_Exons; - my $st; - - if ($exons[0]->strand == 1) { - @exons = sort {$a->start <=> $b->start} @exons; - $st = $exons[0]->start; - } else { - @exons = sort {$b->start <=> $a->start} @exons; - $st = $exons[0]->end; - } - - return $st; -} - -######################################################################### - -=head2 _get_end() - - function to get the end position of a transcript - it reads the Bio::EnsEMBL::Transcript - object and it returns the end position of the last exon - -=cut - -sub _get_end { - my ($self,$transcript) = @_; - my @exons = $transcript->get_all_Exons; - my $end; - - if ($exons[0]->strand == 1) { - @exons = sort {$a->start <=> $b->start} @exons; - $end = $exons[$#exons]->end; - } else { - @exons = sort {$b->start <=> $a->start} @exons; - $end = $exons[$#exons]->start; - } - return $end; -} -######################################################################### - -1; -- GitLab