Modified to keep track of the gene types

a77a01f1 · Eduardo Eyras · 13c3254c · a77a01f1
Commit a77a01f1 authored 23 years ago by Eduardo Eyras
--- a/modules/Bio/EnsEMBL/Utils/GeneCluster.pm
+++ b/modules/Bio/EnsEMBL/Utils/GeneCluster.pm
@@ -23,9 +23,15 @@ eae@sanger.ac.uk
 # Let the code begin ...

 package Bio::EnsEMBL::Utils::GeneCluster;
-use Bio::EnsEMBL::Gene;
+
+use vars qw(@ISA);
 use strict;

+use Bio::EnsEMBL::Gene;
+use Bio::Root::RootI;
+
+@ISA = qw(Bio::Root::RootI);
+
 =head1 METHODS

 =cut
@@ -36,32 +42,30 @@ use strict;
 =head2 new()

 new() initializes the attributes:
-_gene_array
-_geneID_array
-_start
-_end
+
+$self->{'_benchmark_types'}
+$self->{'_prediction_types'}
+$self->{'_benchmark_genes'}
+$self->{'_prediction_genes'}

 =cut

 sub new {
-  my ($class,@args)=@_;
+  my ($class,$whatever)=@_;

  if (ref($class)){
    $class = ref($class);
  }
  my $self = {};
  bless($self,$class);
-  $self->{'_gene_array'}= \@args; # array reference that holds the list of genes in the cluster
-  $self->{'_geneID_array'}=();    # array that holds the IDs of the genes
-  $self->{'_start'}={};           # hash that holds the start position of each gene
-  $self->{'_end'}={};             # hash that holds the end position of each gene
-  
-  foreach my $gene (@args){
-    my ( $start , $end)=( _get_start($gene) , _get_end($gene) );
-    push ( @ {$self->{'_geneID_array'} }, $gene->id );
-    $self->{'_start'}->{$gene->id}=$start;
-    $self-> {'_end'} ->{$gene->id}=$end;
+
+if ($whatever){
+    $self->throw( "Can't pass an object to new() method. Use put_Genes() to include Bio::EnsEMBL::Gene in cluster");
  }
+
+  $self->{'_ys_gene'}= {};
+  %{$self->{'_statistics'}}=();
+  
  return $self;
 }

@@ -75,14 +79,25 @@ sub new {
 =cut

 sub put_Genes {
-  my ($self, @args)= @_;
-  my @new_genes = @args;
-  push ( @{ $self->{'_gene_array'} }, @new_genes );
-  foreach my $new_gene (@new_genes){
-    push ( @{ $self->{'_geneID_array'} }, $new_gene->id );
-    my ( $start , $end)=( _get_start($new_gene) , _get_end($new_gene) );
-    $self->{'_start'}->{$new_gene->id}=$start;
-    $self-> {'_end'} ->{$new_gene->id}=$end;
+  my ($self, @new_genes)= @_;
+  if ( !defined( $self->{'_benchmark_types'} ) || !defined(  $self->{'_prediction_types'} ) ){
+    $self->throw( "Cluster lacks references to gene-types, unable to put the gene");
+  }
+
+ GENE:
+  foreach my $gene (@new_genes){
+    foreach my $type ( @{ $self->{'_benchmark_types'} } ){
+      if ($gene->type eq $type){
+	push ( @{ $self->{'_benchmark_genes'} }, $gene );
+	next GENE; 
+      }
+    }
+    foreach my $type ( @{ $self->{'_prediction_types'} } ){
+      if ($gene->type eq $type){
+	push ( @{ $self->{'_prediction_genes'} }, $gene );
+	next GENE;
+      }
+    }
  }
 }

@@ -96,10 +111,112 @@ sub put_Genes {

 sub get_Genes {
  my $self = shift @_;
-  my @genes = @{ $self->{'_gene_array'} };
+  my @genes;
+  if ( !defined( $self->{'_benchmark_genes'} ) && !defined( $self->{'_prediction_genes'} ) ){
+    $self->warn("The gene array you try to retrieve is empty");
+    @genes = ();
+  }
+  if ( $self->{'_benchmark_genes'} ){
+    push( @genes, @{ $self->{'_benchmark_genes'} } );
+  }
+  if ( $self->{'_prediction_genes'} ){
+    push( @genes, @{ $self->{'_prediction_genes'} } );
+  }
  return @genes;
 }

+
+=head2 get_separated_Genes()
+
+  Handy method to get the genes in the genes in the cluster separated by type.
+  It returns two arrayrefs.
+
+=cut
+
+
+sub get_separated_Genes {
+  my ($self) = @_;
+  return ( $self->{'_benchmark_genes'}, $self->{'_prediction_genes'} );
+}
+
+#########################################################################
+
+=head2 get_Gene_Count()
+
+  it returns the number of genes in the GeneCluster object
+
+=cut
+
+sub get_Gene_Count {
+  my $self = shift @_;
+  my $count =0;
+  if ( $self->{'_benchmark_genes'} ){
+    $count += scalar( @{ $self->{'_benchmark_genes'}  } ); 
+  }
+  if ( $self->{'_prediction_genes'} ){
+    $count += scalar( @{ $self->{'_prediction_genes'} } );
+  }
+  #print STDERR "In GeneCluster.get_Gene_Count(), Count = ".$count."\n";
+  return $count;
+}
+
+#########################################################################
+
+=head2 gene_Types()
+  
+  It accepts two array references to set the types. One array holds the gene-types for the 
+  benchmark genes and the other on for the predicted genes. 
+  It can also be used to get the two type-arrays: ($types1, $types2) = $cluster->gene_Types;
+  The conventions throughout are (first entry: benchmark, second entry: prediction)
+
+=cut
+
+sub gene_Types {
+  my ($self, $benchmark_types, $prediction_types) = @_;
+  if ( $benchmark_types && $prediction_types ) {
+    $self->{'_benchmark_types'}  = $benchmark_types;
+    $self->{'_prediction_types'} = $prediction_types;
+  }
+  return ($self->{'_benchmark_types'},$self->{'_prediction_types'});
+}
+
+#########################################################################
+
+=head2 get_Genes_by_Type()
+
+  We can get the genes in each cluster of a given type. 
+  We pass an arrayref containing the types we want to retrieve.
+
+=cut
+  
+  sub get_Genes_by_Type() {
+    my ($self,$types) = @_;
+    unless ($types){
+      $self->throw( "must provide a type");
+    }
+    my @genes = $self->get_Genes;  # this should give them in order, but we check anyway
+    my @selected_genes;
+    foreach my $type ( @{ $types } ){
+      push ( @selected_genes, grep { $_->type eq $type } @genes );
+    }
+    return @selected_genes;
+  }
+    
+
+
+#########################################################################
+
+=head2 get_first_Gene()
+
+  it returns the first gene in the cluster, which usually would be the benchmark gene 
+
+=cut
+
+sub get_first_Gene {
+  my $self = shift @_;
+  return @{$self->{'_benchmark_genes'}}[0];
+}
+
 #########################################################################

 =head2 string()
@@ -112,27 +229,43 @@ sub get_Genes {
 sub string {
  my $self = shift @_;
  my $data='';
-  foreach my $gene ( @{ $self->{'_gene_array'} } ){
-    my $id = $gene->id;
-    while (length($id)<16){
-      $id .=' ';
-    }
-    $data .= $id."  "._get_start($gene)."  "._get_end($gene)."\n";
+
+  foreach my $gene ( $self->get_Genes ){
+    $data .= $gene->id." ".$gene->type."\t".$self->get_start($gene)."  ".$self->get_end($gene)."\n";
  }
+
+    
+ # foreach my $gene ( @{ $self->{'_benchmark_genes'} } ){
+#    my $id = $gene->id;
+#    #while (length($id)<16){
+#    #  $id .=' ';
+#    #}
+#    $data .= $id."  ".$gene->type."\t".$self->get_start($gene)."  ".$self->get_end($gene)."\n";
+#  }
+#  foreach my $gene ( @{ $self->{'_prediction_genes'} } ){
+#    my $id = $gene->id;
+#    #while (length($id)<16){
+#    #  $id .=' ';
+#    #}
+#    $data .= $id."  ".$gene->type."\t".$self->get_start($gene)."  ".$self->get_end($gene)."\n";
+#  }
+
+
+
  return $data;
 }

 #########################################################################

-=head2 _get_start()
+=head2 get_start()

 function to get the start position of a gene - it reads the gene object and it returns
 the start position of the first exon

 =cut

-sub _get_start {
-  my $gene = shift @_;
+sub get_start {
+  my ($self,$gene) = @_;
  my @exons = $gene->each_unique_Exon;
  my $st;
  
@@ -142,21 +275,21 @@ sub _get_start {
  } else {
    @exons = sort {$b->start <=> $a->start} @exons; # they're read in opposite direction (from right to left)
    $st = $exons[0]->end;                           # the start is the end coordinate of the right-most exon
-  }                                                 # which is here the first of the list @exons
+  }                                                 # which is here the first of the list of sorted @exons
  return $st;
 }

 #########################################################################

-=head2 _get_end()
+=head2 get_end()

 function to get the end position of a gene - it reads the gene object and it returns
 the end position of the last exon

 =cut

-sub _get_end {
-  my $gene = shift @_;
+sub get_end {
+  my ($self,$gene) = @_;
  my @exons = $gene->each_unique_Exon;
  my $end;
  
@@ -170,6 +303,148 @@ sub _get_end {
  return $end;
 }
 #########################################################################
+#Adding new methods to calculate the prediction accuracies of the genes in this cluster
+#########################################################################
+
+=head2 nucleotide_level_accuracy()
+
+ function that calculates the difference between the annotated and predicted genes at a nucleotide level
+ returns the average sensitivity and specificity of the predictions

+=cut
+
+sub nucleotide_level_accuracy {
+    my ($self) = @_;	
+	my @genes = $self->get_Genes;
+    my %statistics;	
+	shift @genes; # the first gene in the array should be the yardstick gene
+
+	GENE: foreach my $gene (@genes){
+	    my $count =0;	
+		my ($sum_sn,$sum_sp) = (0,0);
+		my ($gene_sn,$gene_sp) ;
+ 		TRANS: foreach my $trans ($gene->each_Transcript) {
+			
+	    	my @results=$self->evaluate_Transcripts($trans);
+	   		next TRANS unless @results; 
+			$count ++;	#provides a running counter of transcripts which overlap.
+			my ($trans_sn, $trans_sp) = @results;
+			$sum_sn += $trans_sn;
+			$sum_sp += $trans_sp;
+			
+	       
+	    }
+		$gene_sn = $sum_sn/$count;	
+		$gene_sp = $sum_sp/$count;	
+		my @gene_stats = ($gene_sn, $gene_sp);	
+		$statistics{$gene->id} = [@gene_stats];
+	}
+	$self->statistics(%statistics);
+}
+
+
+#########################################################################
+
+=head2 statistics()
+
+ returns a hash containing the statistics of each gene in this cluster
+
+=cut
+
+sub statistics {
+  my ($self,%stats) = @_;
+  if (%stats){
+    %{$self->{'_statistics'}}=%stats;
+  }
+  return  %{$self->{'_statistics'}};
+}
+
+
+#########################################################################
+
+=head2 evaluate_Transcripts()
+
+ function that compares a transcript with each transcript of the annotated gene in this cluster
+
+=cut
+
+sub evaluate_Transcripts {
+
+    my ($self,$trans) = @_;
+
+    my $yardstick = $self->get_first_Gene();
+
+	my $count=0;
+	my ($sum_sn, $sum_sp,$sn,$sp) = (0,0);	# The sums and average sensitivity and specificity of this particular transcript WRT 
+				 	 						# all transcripts in the yardstick gene in this cluster.
+
+    TRANS: foreach my $ys_trans($yardstick->each_Transcript) {
+      my $trans_tp = 0;
+	  my @ys_exons = $ys_trans->translateable_exons;
+	
+	  foreach my $ys_exon (@ys_exons){
+	    
+	  my @exons = $trans->translateable_exons;
+	    EXON: while (@exons){
+	    
+	        my $exon = shift @exons;
+		
+			next EXON unless ($exon->overlaps($ys_exon) && ($exon->strand eq $ys_exon->strand));
+       		my $overlap;
+			my ($exon_start,$exon_end) =  ($exon->start,$exon->end);
+			my ($ys_start,$ys_end) =  ($ys_exon->start,$ys_exon->end);
+			my ($start,$end);
+
+			if ($exon_start > $ys_start) { $start = $exon_start;} 
+				else {$start = $ys_start;}
+			if ($exon_end  < $ys_end) { $end = $exon_end;} 
+				else {$end = $ys_end;}
+
+			$overlap = $end - $start;
+
+			$trans_tp += $overlap;
+	    }	
+		
+	  }
+    next TRANS unless ($trans_tp ne 0);
+
+	$count ++;	#provides a running counter of transcripts which overlap.
+    my $trans_ap = _translateable_exon_length($ys_trans);
+    my $trans_pp = _translateable_exon_length($trans);
+	
+	$sum_sn += sprintf("%.2f",($trans_tp)/($trans_ap)*100);
+	$sum_sp += sprintf("%.2f",($trans_tp)/($trans_pp)*100);
+    #$sum_sn += $trans_tp/$trans_ap; # sensitivity
+	#$sum_sp += $trans_tp/$trans_pp; #specificity 
+    }     	
+	if ($count eq 0){
+	print STDERR "Count eq 0. is there an error?\n";
+	return 0 ; 
+	}
+    $sn = $sum_sn/$count;    
+    $sp = $sum_sp/$count;    
+
+	my 	@statistics = (	$sn,$sp);
+	return @statistics; 
+}
+
+#########################################################################
+
+=head2 _translateable_exon_length()
+
+ internal function that returns the length of the translateable exons 
+
+=cut
+
+sub _translateable_exon_length {
+	my ($trans)= @_;
+	my @exons = $trans->translateable_exons;
+    my $length = 0;
+    foreach my $ex (@exons) {
+        $length += $ex->length;
+    }
+    return $length;
+
+}

 1;