From 418419dba9e6e628ba6a27c11fc83dcfc0413680 Mon Sep 17 00:00:00 2001
From: Graham McVicker <mcvicker@sanger.ac.uk>
Date: Tue, 7 Jan 2003 11:02:59 +0000
Subject: [PATCH] removed old modules now present in pipeline

---
 modules/Bio/EnsEMBL/Utils/GeneCluster.pm      |  655 ------
 modules/Bio/EnsEMBL/Utils/GeneComparison.pm   | 1874 -----------------
 .../Bio/EnsEMBL/Utils/GeneComparison_conf.pl  |   50 -
 modules/Bio/EnsEMBL/Utils/Report.pm           |  183 --
 .../Bio/EnsEMBL/Utils/TranscriptCluster.pm    |  560 -----
 5 files changed, 3322 deletions(-)
 delete mode 100644 modules/Bio/EnsEMBL/Utils/GeneCluster.pm
 delete mode 100644 modules/Bio/EnsEMBL/Utils/GeneComparison.pm
 delete mode 100644 modules/Bio/EnsEMBL/Utils/GeneComparison_conf.pl
 delete mode 100644 modules/Bio/EnsEMBL/Utils/Report.pm
 delete mode 100644 modules/Bio/EnsEMBL/Utils/TranscriptCluster.pm

diff --git a/modules/Bio/EnsEMBL/Utils/GeneCluster.pm b/modules/Bio/EnsEMBL/Utils/GeneCluster.pm
deleted file mode 100644
index 6b6d678ffa..0000000000
--- a/modules/Bio/EnsEMBL/Utils/GeneCluster.pm
+++ /dev/null
@@ -1,655 +0,0 @@
-=head1 NAME
-
-GeneCluster
-
-=head1 SYNOPSIS
-
-
-=head1 DESCRIPTION
-
-This object holds one or more genes which has been clustered according to 
-comparison criteria external to this class (for instance, in the 
-methods compare and _compare_Genes methods of the class GeneComparison).
-Each GeneCluster object holds the IDs of the genes clustered and the beginning and end coordinates
-of each one (taken from the start and end coordinates of the first and last exon in the correspondig
-get_all_Exons array)
-
-=head1 CONTACT
-
-eae@sanger.ac.uk
-
-=cut
-
-# Let the code begin ...
-
-package Bio::EnsEMBL::Utils::GeneCluster;
-
-use vars qw(@ISA);
-use strict;
-
-use Bio::EnsEMBL::Gene;
-use Bio::EnsEMBL::Root;
-
-@ISA = qw(Bio::EnsEMBL::Root);
-
-=head1 METHODS
-
-=cut
-
-#########################################################################
-
-
-=head2 new()
-
-new() initializes the attributes:
-
-$self->{'_benchmark_types'}
-$self->{'_prediction_types'}
-$self->{'_benchmark_genes'}
-$self->{'_prediction_genes'}
-
-=cut
-
-sub new {
-  my ($class,$whatever)=@_;
-
-  if (ref($class)){
-    $class = ref($class);
-  }
-  my $self = {};
-  bless($self,$class);
-
-if ($whatever){
-    $self->throw( "Can't pass an object to new() method. Use put_Genes() to include Bio::EnsEMBL::Gene in cluster");
-  }
-
-  $self->{'_ys_gene'}= {};
-  %{$self->{'_statistics'}}=();
-  
-  return $self;
-}
-
-#########################################################################
-
-=head2 put_Genes()
-
-  function to include one or more genes in the cluster.
-  Useful when creating a cluster. It takes as argument an array of genes, it returns nothing.
-
-=cut
-
-sub put_Genes {
-  my ($self, @new_genes)= @_;
-  if ( !defined( $self->{'_benchmark_types'} ) || !defined(  $self->{'_prediction_types'} ) ){
-    $self->throw( "Cluster lacks references to gene-types, unable to put the gene");
-  }
-
- GENE:
-  foreach my $gene (@new_genes){
-    foreach my $type ( @{ $self->{'_benchmark_types'} } ){
-      if ($gene->type eq $type){
-	push ( @{ $self->{'_benchmark_genes'} }, $gene );
-	next GENE; 
-      }
-    }
-    foreach my $type ( @{ $self->{'_prediction_types'} } ){
-      if ($gene->type eq $type){
-	push ( @{ $self->{'_prediction_genes'} }, $gene );
-	next GENE;
-      }
-    }
-  }
-}
-
-#########################################################################
-
-=head2 get_Genes()
-
-  it returns the array of genes in the GeneCluster object
-
-=cut
-
-sub get_Genes {
-  my $self = shift @_;
-  my @genes;
-  if ( !defined( $self->{'_benchmark_genes'} ) && !defined( $self->{'_prediction_genes'} ) ){
-    $self->warn("The gene array you try to retrieve is empty");
-    @genes = ();
-  }
-  if ( $self->{'_benchmark_genes'} ){
-    push( @genes, @{ $self->{'_benchmark_genes'} } );
-  }
-  if ( $self->{'_prediction_genes'} ){
-    push( @genes, @{ $self->{'_prediction_genes'} } );
-  }
-  return @genes;
-}
-
-#########################################################################
-
-=head2 get_separated_Genes()
-
-  Handy method to get the genes in the genes in the cluster separated by type.
-  It returns two arrayrefs.
-
-=cut
-
-
-sub get_separated_Genes {
-  my ($self) = @_;
-  return ( $self->{'_benchmark_genes'}, $self->{'_prediction_genes'} );
-}
-
-#########################################################################
-
-=head2 get_Gene_Count()
-
-  it returns the number of genes in the GeneCluster object
-
-=cut
-
-sub get_Gene_Count {
-  my $self = shift @_;
-  my $count =0;
-  if ( $self->{'_benchmark_genes'} ){
-    $count += scalar( @{ $self->{'_benchmark_genes'}  } ); 
-  }
-  if ( $self->{'_prediction_genes'} ){
-    $count += scalar( @{ $self->{'_prediction_genes'} } );
-  }
-  #print STDERR "In GeneCluster.get_Gene_Count(), Count = ".$count."\n";
-  return $count;
-}
-
-#########################################################################
-
-=head2 gene_Types()
-  
-  It accepts two array references to set the types. One array holds the gene-types for the 
-  benchmark genes and the other on for the predicted genes. 
-  It can also be used to get the two type-arrays: ($types1, $types2) = $cluster->gene_Types;
-  The conventions throughout are (first entry: benchmark, second entry: prediction)
-
-=cut
-
-sub gene_Types {
-  my ($self, $benchmark_types, $prediction_types) = @_;
-  if ( $benchmark_types && $prediction_types ) {
-    $self->{'_benchmark_types'}  = $benchmark_types;
-    $self->{'_prediction_types'} = $prediction_types;
-  }
-  return ($self->{'_benchmark_types'},$self->{'_prediction_types'});
-}
-
-#########################################################################
-
-=head2 get_Genes_by_Type()
-
-  We can get the genes in each cluster of a given type. 
-  We pass an arrayref containing the types we want to retrieve.
-
-=cut
-  
-  sub get_Genes_by_Type() {
-    my ($self,$types) = @_;
-    unless ($types){
-      $self->throw( "must provide a type");
-    }
-    my @genes = $self->get_Genes;  # this should give them in order, but we check anyway
-    my @selected_genes;
-    foreach my $type ( @{ $types } ){
-      push ( @selected_genes, grep { $_->type eq $type } @genes );
-    }
-    return @selected_genes;
-  }
-
-#########################################################################
-
-=head2 pair_Transcripts()
-
-  Title   : pair_Transcripts()
-  Usage   : my @array_of_pairs = $gene_cluster->pair_Transcripts
-  Function: This method make pairs of transcripts according to maximum reciprocal exon overlap. 
-            Transcripts can eventually be repeated because the maximum exon_overlap may coincide 
-            in two different pairs. 
-  Example : look for instance in the method find_missing_Exons
-  Returns : three arrayrefs =  
-            1.- a list of Bio::EnsEMBL::Utils::Transcripts, each holding a pair of transcripts, 
-            2.- a list with the unpaired transcripts, and 
-            3.- a list those transcripts which have been paired up twice or more
-  Args    : nothing
-
-=cut
-  
-sub pair_Transcripts {
-  my ($self) = @_;
-  
-  # get the genes separated by type list 'benchmark'-like or 'prediction'-like
-  my ( $genes2, $genes1 ) = $self->get_separated_Genes;
-  my (@transcripts1,@transcripts2);
-  my (@trans1,@trans2);
-  foreach my $gene ( @$genes1 ){
-    push( @trans1, $gene->each_Transcript );
-  }
-  foreach my $gene ( @$genes2 ){
-    push( @trans2, $gene->each_Transcript );
-  }
-  
-  # first sort the transcripts by their start position coordinate
-  my %start_table1;
-  my %start_table2;
-  my $i=0;
-  foreach my $tran ( @trans1 ) {
-    $start_table1{$i} = $tran->start_exon->start;
-    $i++;
-  }
-  my $j=0;
-  foreach my $tra ( @trans2  ) {
-    $start_table2{$j} = $tra->start_exon->start;
-    $j++;
-  }
-  foreach my $pos ( sort { $start_table1{$a} <=> $start_table1{$b} } keys %start_table1 ){
-    push (@transcripts1, $trans1[$pos]);
-  }
-  foreach my $pos ( sort { $start_table2{$a} <=> $start_table2{$b} } keys %start_table2 ){
-    push (@transcripts2, $trans2[$pos]);
-  }
-
-  # pair the transcripts, but first, some variable definition
-
-  my %seen1;           # these keep track of those transcript linked and with how much overlap
-  my %seen2;           # ditto, for @transcripts2
-  my @pairs;           # list of (Bio::EnsEMBL::Utils::TranscriptCluster) transcript-pairs being created 
-  my @unpaired;        # list of Bio::EnsEMBL::Transcript which are left unpaired
-  my @doubled;         # those which have been paired up twice
-  my $overlap_matrix;  # matrix holding the number of exon overaps for each pair of transcripts
-  my $link;            # matrix with 1 for the pairs linked and undef otherwise 
-  my @overlap_pairs;   # each entry holds an array with the overlap and the two transcripts being compared
-  my %repeated;        # to keep track of repeated transcripts
-
-  # first calculate all possible overlaps
-  foreach my $tran1 ( @transcripts1 ){
-    foreach my $tran2 ( @transcripts2 ){
-      $$overlap_matrix{ $tran1 }{ $tran2 } = _compare_Transcripts( $tran1, $tran2 );
-      my @list = ( $$overlap_matrix{ $tran1 }{ $tran2 }, $tran1, $tran2 );
-      push ( @overlap_pairs, \@list );
-      #print STDERR "Overlap( ".$tran1->stable_id.",".$tran2->stable_id." ) = "
-      #.$$overlap_matrix{ $tran1 }{ $tran2 }."\n";
-    }
-  }
-  # sort the list of @overlap_pairs on the overlap
-  my @sorted_pairs = sort { $$b[0] <=> $$a[0] } @overlap_pairs;
- 
-  # take the first pair of the list
-  my $first = shift @sorted_pairs;
-  my ($max_overlap,$tran1,$tran2) =  @$first;
-  $seen1{ $tran1 } = $max_overlap;
-  $seen2{ $tran2 } = $max_overlap;
-  $$link{ $tran1 }{ $tran2 } = 1;
-  
-  # scan through each overlap
- PAIR:
-  foreach my $list ( @sorted_pairs ){
-    # each list contains @$list = ( overlap, transcript1, transcript2 )
-    
-    # first of all, if the overlap is zero, ditch it
-    if ( $$list[0] == 0 ){
-      next PAIR;
-    }
-
-    # if we've seen both transcripts already reject them
-    if ( $$link{ $$list[1] }{ $$list[2] } && defined( $seen1{ $$list[1] } ) && defined( $seen2{ $$list[2] } ) ){
-      next PAIR;
-    }
-
-    # if the same score...
-    if ( $$list[0] == $max_overlap ) {
-      
-      # if we've seen both transcripts already, check they have the highest score
-      if ( defined( $seen1{ $$list[1] } ) && defined( $seen2{ $$list[2] } ) ){
-	if ( $$list[0] == $seen1{ $$list[1] } && $$list[0] == $seen2{ $$list[2] } ){
-	  $$link{ $$list[1] }{ $$list[2] } = 1;
-	}
-	 next PAIR;
-      }
-	
-      # if the pair is entirely new, we accept it
-      if ( !defined( $seen1{ $$list[1] } ) && !defined( $seen2{ $$list[2] } ) ){
-	$$link{ $$list[1] }{ $$list[2] } = 1;
-	$seen1{ $$list[1] } = $$list[0];
-	$seen2{ $$list[2] } = $$list[0];
-	next PAIR;
-      }
-
-      # we accept repeats only if this is their maximum overlap as well
-      if ( !defined( $seen2{$$list[2]} ) && defined( $seen1{$$list[1]} ) && $$list[0] == $seen1{$$list[1]} ){
-	$$link{ $$list[1] }{ $$list[2] } = 1;
-	$seen2{ $$list[2] } = $$list[0];
-	if ( !defined( $repeated{ $$list[1] } ) ){
-	  push( @doubled, $$list[1] );
-	  $repeated{ $$list[1] } = 1;
-	}
-	next PAIR;
-      }
-      if ( !defined( $seen1{$$list[1]} ) && defined( $seen2{$$list[2]} ) && ($$list[0] == $seen2{$$list[2]}) ){ 
-	$$link{ $$list[1] }{ $$list[2] } = 1;
-	$seen1{ $$list[1] } = $$list[0];
-	if ( !defined( $repeated{ $$list[2] } ) ){
-	  push( @doubled, $$list[2] );
-	  $repeated{ $$list[2] } = 1;
-	}
-	next PAIR;
-      }
-    }
-
-    # if the score is lower, only accept if the pair is completely new
-    if ( $$list[0] < $max_overlap ){
-      if ( !defined( $seen1{ $$list[1] } ) && !defined( $seen2{ $$list[2] } ) ){
-	$$link{ $$list[1] }{ $$list[2] } = 1;
-	$seen1{ $$list[1] } = $$list[0];
-	$seen2{ $$list[2] } = $$list[0];
-	$max_overlap = $$list[0];
-	next PAIR;
-      }
-    }
-  }
-  
-  # create a new cluster for each pair linked
-  foreach my $tran1 ( @transcripts1 ){
-    foreach my $tran2 ( @transcripts2 ){
-      if ( $$link{ $tran1 }{ $tran2} && $$link{ $tran1 }{ $tran2 } == 1 ){
-	my $pair = Bio::EnsEMBL::Utils::TranscriptCluster->new();
-	$pair->put_Transcripts( $tran1, $tran2 );
-	push( @pairs, $pair );
-      }
-    }
-  }
-
-  # finally, check for the unseen ones
-  foreach my $tran1 ( @transcripts1 ){
-    if ( !defined( $seen1{ $tran1 } ) ){
-      push( @unpaired, $tran1 );
-    }
-  }
-  foreach my $tran2 ( @transcripts2 ){
-    if ( !defined( $seen2{ $tran2 } ) ){
-      push( @unpaired, $tran2 );
-    }
-  }
-  print STDERR scalar(@pairs)." transcript pairs created\n";
-  #my $count2=1;
-  #foreach my $pair ( @pairs ){
-  #  print STDERR "pair $count2:\n".$pair->to_String;
-  #  $count2++;
-  #}
-  #$count2=1;
-  #print STDERR scalar(@unpaired)." unpaired transcripts\n";
-  #foreach my $unpaired ( @unpaired ){
-  #  print STDERR "unpaired $count2: ".$unpaired->stable_id."\n";
-  #}
-  # return the pairs, the unpaired transcripts, and those transcript which have been taken twice or more
-  return (\@pairs,\@unpaired,\@doubled);
-}
-
-#########################################################################
-   
-
-=head2 _compare_Transcripts()
-
- Title: _compare_Transcripts()
- Usage: this internal function compares the exons of two transcripts according to overlap
-        and returns the number of overlaps
-=cut
-
-sub _compare_Transcripts {         
-  my ($transcript1,$transcript2) = @_;
-  my @exons1   = $transcript1->get_all_Exons;
-  my @exons2   = $transcript2->get_all_Exons;
-  my $overlaps = 0;
-  
-  foreach my $exon1 (@exons1){
-    
-    foreach my $exon2 (@exons2){
-
-      if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){
-	$overlaps++;
-      }
-    }
-  }
-  return $overlaps;  # we keep track of the number of overlaps to be able to choose the best match
-}    
-
-
-
-#########################################################################
-
-=head2 get_first_Gene()
-
-  it returns the first gene in the cluster, which usually would be the benchmark gene 
-
-=cut
-
-sub get_first_Gene {
-  my $self = shift @_;
-  return @{$self->{'_benchmark_genes'}}[0];
-}
-
-#########################################################################
-
-=head2 to_String()
-
-  it returns a string containing the information about the genes contained in the
-  GeneCluster object
-
-=cut
-
-sub to_String {
-  my $self = shift @_;
-  my $data='';
-  foreach my $gene ( $self->get_Genes ){
-    my @exons = $gene->get_all_Exons;
-     
-    $data .= sprintf "Id: %-16s"      , $gene->stable_id;
-    $data .= sprintf "Contig: %-20s"  , $exons[0]->contig->id;
-    $data .= sprintf "Exons: %-3d"    , scalar(@exons);
-    $data .= sprintf "Start: %-9d"    , $self->_get_start($gene);
-    $data .= sprintf "End: %-9d"      , $self->_get_end  ($gene);
-    $data .= sprintf "Strand: %-2d\n" , $exons[0]->strand;
-  }
-  return $data;
-}
-
-#########################################################################
-
-=head2 _get_start()
-
- function to get the start position of a gene - it reads the gene object and it returns
- the start position of the first exon
-
-=cut
-
-sub _get_start {
-  my ($self,$gene) = @_;
-  my @exons = $gene->get_all_Exons;
-  my $st;
-  
-  if ($exons[0]->strand == 1) {
-    @exons = sort {$a->start <=> $b->start} @exons;
-    $st = $exons[0]->start;
-  } else {
-    @exons = sort {$b->start <=> $a->start} @exons; # they're read in opposite direction (from right to left)
-    $st = $exons[0]->end;                           # the start is the end coordinate of the right-most exon
-  }                                                 # which is here the first of the list of sorted @exons
-  return $st;
-}
-
-#########################################################################
-
-=head2 _get_end()
-
- function to get the end position of a gene - it reads the gene object and it returns
- the end position of the last exon
-
-=cut
-
-sub _get_end {
-  my ($self,$gene) = @_;
-  my @exons = $gene-get_all_Exons;
-  my $end;
-  
-  if ($exons[0]->strand == 1) {
-    @exons = sort {$a->start <=> $b->start} @exons;
-    $end = $exons[$#exons]->end;
-  } else {
-    @exons = sort {$b->start <=> $a->start} @exons; # they're read in opposite direction (from right to left)
-    $end = $exons[$#exons]->start;                  # the end is the start coordinate of the left-most exon  
-  }                                                 # which is here the last of the list @exons
-  return $end;
-}
-
-#########################################################################
-#Adding new methods to calculate the prediction accuracies of the genes in this cluster
-#########################################################################
-
-=head2 nucleotide_level_accuracy()
-
- function that calculates the difference between the annotated and predicted genes at a nucleotide level
- returns the average sensitivity and specificity of the predictions
-
-=cut
-
-sub nucleotide_level_accuracy {
-    my ($self) = @_;	
-	my @genes = $self->get_Genes;
-    my %statistics;	
-	shift @genes; # the first gene in the array should be the yardstick gene
-
-	GENE: foreach my $gene (@genes){
-	    my $count =0;	
-		my ($sum_sn,$sum_sp) = (0,0);
-		my ($gene_sn,$gene_sp) ;
- 		TRANS: foreach my $trans ($gene->each_Transcript) {
-			
-	    	my @results=$self->evaluate_Transcripts($trans);
-	   		next TRANS unless @results; 
-			$count ++;	#provides a running counter of transcripts which overlap.
-			my ($trans_sn, $trans_sp) = @results;
-			$sum_sn += $trans_sn;
-			$sum_sp += $trans_sp;
-			
-	       
-	    }
-		$gene_sn = $sum_sn/$count;	
-		$gene_sp = $sum_sp/$count;	
-		my @gene_stats = ($gene_sn, $gene_sp);	
-		$statistics{$gene->stable_id} = [@gene_stats];
-	}
-	$self->statistics(%statistics);
-}
-
-
-#########################################################################
-
-=head2 statistics()
-
- returns a hash containing the statistics of each gene in this cluster
-
-=cut
-
-sub statistics {
-  my ($self,%stats) = @_;
-  if (%stats){
-    %{$self->{'_statistics'}}=%stats;
-  }
-  return  %{$self->{'_statistics'}};
-}
-
-
-#########################################################################
-
-=head2 evaluate_Transcripts()
-
- function that compares a transcript with each transcript of the annotated gene in this cluster
-
-=cut
-
-sub evaluate_Transcripts {
-
-    my ($self,$trans) = @_;
-
-    my $yardstick = $self->get_first_Gene();
-
-	my $count=0;
-	my ($sum_sn, $sum_sp,$sn,$sp) = (0,0);	# The sums and average sensitivity and specificity of this particular transcript WRT 
-				 	 						# all transcripts in the yardstick gene in this cluster.
-
-    TRANS: foreach my $ys_trans($yardstick->each_Transcript) {
-      my $trans_tp = 0;
-	  my @ys_exons = $ys_trans->translateable_exons;
-	
-	  foreach my $ys_exon (@ys_exons){
-	    
-	  my @exons = $trans->translateable_exons;
-	    EXON: while (@exons){
-	    
-	        my $exon = shift @exons;
-		
-			next EXON unless ($exon->overlaps($ys_exon) && ($exon->strand eq $ys_exon->strand));
-       		my $overlap;
-			my ($exon_start,$exon_end) =  ($exon->start,$exon->end);
-			my ($ys_start,$ys_end) =  ($ys_exon->start,$ys_exon->end);
-			my ($start,$end);
-
-			if ($exon_start > $ys_start) { $start = $exon_start;} 
-				else {$start = $ys_start;}
-			if ($exon_end  < $ys_end) { $end = $exon_end;} 
-				else {$end = $ys_end;}
-
-			$overlap = $end - $start;
-
-			$trans_tp += $overlap;
-	    }	
-		
-	  }
-    next TRANS unless ($trans_tp ne 0);
-
-	$count ++;	#provides a running counter of transcripts which overlap.
-    my $trans_ap = _translateable_exon_length($ys_trans);
-    my $trans_pp = _translateable_exon_length($trans);
-	
-	$sum_sn += sprintf("%.2f",($trans_tp)/($trans_ap)*100);
-	$sum_sp += sprintf("%.2f",($trans_tp)/($trans_pp)*100);
-    #$sum_sn += $trans_tp/$trans_ap; # sensitivity
-	#$sum_sp += $trans_tp/$trans_pp; #specificity 
-    }     	
-	if ($count eq 0){
-	print STDERR "Count eq 0. is there an error?\n";
-	return 0 ; 
-	}
-    $sn = $sum_sn/$count;    
-    $sp = $sum_sp/$count;    
-
-	my 	@statistics = (	$sn,$sp);
-	return @statistics; 
-}
-
-#########################################################################
-
-=head2 _translateable_exon_length()
-
- internal function that returns the length of the translateable exons 
-
-=cut
-
-sub _translateable_exon_length {
-	my ($trans)= @_;
-	my @exons = $trans->translateable_exons;
-    my $length = 0;
-    foreach my $ex (@exons) {
-        $length += $ex->length;
-    }
-    return $length;
-
-}
-
-1;
diff --git a/modules/Bio/EnsEMBL/Utils/GeneComparison.pm b/modules/Bio/EnsEMBL/Utils/GeneComparison.pm
deleted file mode 100644
index e46c72855b..0000000000
--- a/modules/Bio/EnsEMBL/Utils/GeneComparison.pm
+++ /dev/null
@@ -1,1874 +0,0 @@
-=head1 NAME - Bio::EnsEMBL::Utils::GeneComparison
-
-=head1 DESCRIPTION
-
-Perl Class for comparison of two sets of genes.
-It can read two references to two arrays of genes, e.g. EnsEMBL built genes and human annotated genes,
-and it compares them using different methods (see Synopsis).
-
-The object must be created passing two arrayref with the list of genes to be comapred.
-
-Each GeneComparison object can contain data fields specifying the arrays of genes to be compared.
-There are also data fields in the form of two arrays, which contain 
-the gene types present in those both gene arrays.
-
-=head1 SYNOPSIS
-
-  my $gene_comparison = Bio::EnsEMBL::Utils::GeneComparison->new(\@genes1,\@genes2);
-
-  my @clusters = $gene_comparison->cluster_Genes;
-
-get the list of unmatched genes, this returns two array references of GeneCluster objects as well, 
-but only containing the unmatched ones:
-
-  my ($ens_unmatched,$hum_unmatched) = $gene_comparison->get_unmatched_Genes;
-
-get the list of fragmented genes:
-
-  my @fragmented = $gene_comparison->get_fragmented_Genes (@clusters);
-
-cluster the transcripts using the gene clusters obtained above
-(first cluster all genes and then cluster the transcripts within each gene):
-   
-  my @transcript_clusters = $gene_comparison->cluster_Transcripts_by_Gene(@clusters);
-
-Also, one can cluster the transcripts of the genes in _gene_array1 and gene_array2 directly
-(cluster all transcripts without going through gene-clustering)
-
-  my @same_transcript_clusters = $gene_comparison->cluster_Transcripts;
-
-One can get the number of exons per percentage overlap using whole exons
-  
-  my %statistics = $gene_comparison->get_Exon_Statistics;
-
-Or only coding exons
-
-  my %coding_statistics =  $gene_comparison->get_Coding_Exon_Statistics;
-
-The hashes hold the number of occurences as values and integer percentage overlap as keys
-and can be used to produce a histogram:
- 
-  for (my $i=1; $i<= 100; $i++){
-    if ( $statistics{$i} ){
-      print $i." :\t".$statistics{$i}."\n";
-    }
-    else{
-      print $i." :\n";
-    }
-  }
-
-for more info about how to use it look in the example script
-...ensembl/misc-scripts/utilities/gene_comparison_script.pl
-
-=head1 CONTACT
-
-eae@sanger.ac.uk
-
-=cut
-
-# Let the code begin ...
-
-package Bio::EnsEMBL::Utils::GeneComparison;
-
-use vars qw(@ISA);
-use strict;
-
-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-use Bio::EnsEMBL::Utils::GeneCluster;
-use Bio::EnsEMBL::Utils::TranscriptCluster;
-use Bio::EnsEMBL::Root;
-
-do "Bio/EnsEMBL/Utils/GeneComparison_conf.pl";
-
-@ISA = qw(Bio::EnsEMBL::Root);
-
-####################################################################################
-
-=head2 new()
-
-the new() method accepts two array references
-
-=cut
-
-sub new {
-  
-  my ($class,$gene_array1,$gene_array2) = @_;
-  # as convention, we put first the annotated (or benchmark) genes and second the predicted genes
-  # Anyway, the comparisons are made from the gene_array2 with respect to the gene_array1
-  # so 'missing_exons' means when gene_array2 misses exons with respect to gene_array1 and
-  # overpredicted exons means when gene_array2 have exons in excess with respect to the gene_array1
-
-  if (ref($class)){
-    $class = ref($class);
-  }
-  my $self = {};
-  bless($self,$class);
-  $self->{'_unclustered_genes'}= [];
-  $self->{'_gene_clusters'}= [];
-
-  if ( $gene_array1 && $gene_array2 ){
-    $self->{'_gene_array1'}= $gene_array1;
-    $self->{'_gene_array2'}= $gene_array2;
-  
-    # we keep track of the gene types present in both arrays
-
-    my %types1;
-    foreach my $gene ( @{ $gene_array1 } ){
-      $types1{$gene->type}=1;
-    }
-    foreach my $k ( keys(%types1) ){            # put in the array type_array1 
-      push( @{ $self->{'_type_array1'} }, $k);  # the gene types present in gene_array1
-    }
-
-    my %types2;
-    foreach my $gene ( @{ $gene_array2 } ){
-      $types2{$gene->type}=1;
-    }
-    foreach my $k ( keys(%types2) ){            # put in the array _type_array2 
-      push( @{ $self->{'_type_array2'} }, $k);  # the gene types present in gene_array2
-    }
-  }
-  else{
-    $self->throw( "Can't create a Bio::EnsEMBL::Utils::GeneComparison object without passing in two gene arrayref");
-  }
-  if ( scalar( @{ $gene_array1 } ) == 0 || scalar( @{ $gene_array2 } ) == 0 ){
-    $self->throw( "At least one of the lists of genes to compare is empty. Cannot create a GeneComparison object");
-  }
-  return $self;
-}
-
-######################################################################################
-
-=head2 gene_Types()
-
-this function sets or returns two arrayref with the types of the genes to be compared.
-All the comparisons are made from the gene_array1 with respect to the gene_array2.
-
-=cut
-
-sub gene_Types {
-   my ($self,$type1,$type2) = @_;
-   if ( $type1 && $type2 ){
-     $self->{'_type_array1'} = $type1; 
-     $self->{'_type_array2'} = $type2;
-   }
-   return ( $self->{'_type_array1'}, $self->{'_type_array2'} );
-}
-
-######################################################################################
-
-=head2 cluster_Genes
-
-  This method takes an array of genes and cluster them
-  according to their exon overlap. As a default it takes the genes stored in the GeneComparison object 
-  as data fields (or attributes) '_gene_array1' and '_gene_array2'. It can also accept instead as argument
-  an array of genes to be clustered, but then information about their gene-type is lost (to be solved). 
-  This method returns an array of GeneCluster objects.
-
-=cut
-
-sub cluster_Genes {
-
-  my ($self) = @_;
-  my @genes = ( @{ $self->{'_gene_array1'} }, @{ $self->{'_gene_array2'} } );
-  
-  #### first sort the genes by the left-most position coordinate ####
-  my %start_table;
-  my $i=0;
-  foreach my $gene (@genes){
-    $start_table{$i}=_get_start_of_Gene($gene);
-    $i++;
-  }
-  my @sorted_genes=();
-  foreach my $k ( sort { $start_table{$a} <=> $start_table{$b} } keys %start_table ){
-    push (@sorted_genes, $genes[$k]);
-  }
-
-  print "Clustering ".scalar( @sorted_genes )." genes...\n";
-  #my $label=1;
-  #foreach my $gene (@sorted_genes){
-  #  print $label." gene ".$gene->stable_id."\t\t"._get_start_of_Gene($gene)." "._get_strand_of_Gene($gene)."\n";
-  #  $label++;
-  #}
-  my $found;
-  my $time1=time();
-
-##### old clustering algorithm ###########################################
-
-#  my @clusters=(); # this will hold an array of GeneCluster objects
-#  my $lookups=0;
-#  my $count = 1;
-#  my $new_cluster_count=1;
-#  my $jumpy=0;
-
-#  foreach my $gene (@sorted_genes){
-#    print STDERR $count." gene ".$gene->stable_id." being located...";
-#    $count++;
-#    $found=0;
-#    my $cluster_count=1;
-#  LOOP:
-#    foreach my $cluster (@clusters){
-#      foreach my $gene_in_cluster ( $cluster->get_Genes ){  # read all  genes from GeneCluster object
-#	$lookups++;
-#        if ( _compare_Genes($gene,$gene_in_cluster) ){
-#          print STDERR "put in cluster ".$cluster_count."\n";
-#	  if ( $cluster_count != ($new_cluster_count-1)  ){
-#	    print STDERR "\nONE JUMPING AROUND!!\n\n";
-#	    $jumpy++;
-#	  }
-	  
-#	  $cluster->put_Genes($gene);                       # put gene into GeneCluster object
-#          $found=1;
-#          last LOOP;
-#        }
-#      }
-#      $cluster_count++;
-#    }
-#    if ($found==0){
-#      my $new_cluster=Bio::EnsEMBL::Utils::GeneCluster->new();   # create a GeneCluser object
-#      print STDERR "put in new cluster [".$new_cluster_count."]\n";
-#      $new_cluster_count++;
-#      $new_cluster->gene_Types($self->gene_Types);
-#      $new_cluster->put_Genes($gene);
-#      push(@clusters,$new_cluster);
-#    }
-#  }
-#  my $time2 = time();
-#  print STDERR "\ntime for clustering: ".($time2-$time1)."\n";
-#  print STDERR "number of lookups: ".$lookups."\n";
-#  print STDERR "number of jumpies: ".$jumpy."\n\n";
-#  return @clusters;
-#  # put all unclustered genes (annotated and predicted) into one separate array
-#  $self->flush_gene_Clusters;
-#  foreach my $cl (@clusters){
-#    if ( $cl->get_Gene_Count == 1 ){
-#      $self->unclustered_Genes($cl); # this push the cluster into array @{ $self->{'_unclustered_genes'} }
-#    }
-#    else{
-#      $self->clusters( $cl );
-#    }
-#  }  
-#  return $self->clusters;
-
-#########################################################################
-
-  #### new clustering algorithm, faster than the old one ####
-  #### however, the order of the genes does not guarantee that genes are not skipped, since this algorithm
-  #### only checks the current cluster and sometimes
-
-  # create a new cluster 
-  my $cluster=Bio::EnsEMBL::Utils::GeneCluster->new();
-  my $cluster_count = 1;
-  my @clusters;
-
-  # pass in the types we're using
-  my ($types1,$types2) = $self->gene_Types;
-  $cluster->gene_Types($types1,$types2);
-
-  # put the first gene into these cluster
-  $cluster->put_Genes( $sorted_genes[0] );
-  push (@clusters, $cluster);
-  $self->gene_Clusters($cluster);
-  
-  # loop over the rest of the genes
- LOOP1:
-  for (my $c=1; $c<=$#sorted_genes; $c++){
-    $found=0;
-  LOOP:
-    foreach my $gene_in_cluster ( $cluster->get_Genes ){       
-      if ( _compare_Genes( $sorted_genes[$c], $gene_in_cluster ) ){	
-	$cluster->put_Genes( $sorted_genes[$c] );                       
-	$found=1;
-	next LOOP1;
-      }
-    }
-    # if not in this cluster compare to the ($limit) previous clusters
-
-    my $limit = 14;
-    if ( $found == 0 && $cluster_count > 1 ){
-      my $lookup = 1;
-      while ( !($cluster_count <= $lookup) && !($lookup > $limit) ){ 
-	#print STDERR "cluster_count: $cluster_count, looking at ".($cluster_count - $lookup)."\n";
-	my $previous_cluster = $clusters[ $cluster_count - 1 - $lookup ];
-	foreach my $gene_in_cluster ( $previous_cluster->get_Genes ){
-	  if ( _compare_Genes( $sorted_genes[$c], $gene_in_cluster ) ){	
-	    $previous_cluster->put_Genes( $sorted_genes[$c] );                       
-	    $found=1;
-	    next LOOP1;
-	  }
-	}
-	$lookup++;
-      }
-    }
-    if ($found==0){  # if not-clustered create a new GeneCluser
-      $cluster = new Bio::EnsEMBL::Utils::GeneCluster; 
-      $cluster->gene_Types($types1,$types2);
-      $cluster->put_Genes( $sorted_genes[$c] );
-      $cluster_count++;
-      push( @clusters, $cluster );
-      $self->gene_Clusters( $cluster );
-    }
-  }
-  # put all unclustered genes (annotated and predicted) into one separate array
-  my $time2 = time();
-  print STDERR "time for clustering: ".($time2-$time1)."\n";
-  my @new_clusters;  
-  foreach my $cl ($self->gene_Clusters){
-    if ( $cl->get_Gene_Count == 1 ){
-      $self->unclustered_Genes($cl); # this push the cluster into array @{ $self->{'_unclustered_genes'} }
-    }
-    else{
-      push( @new_clusters, $cl );
-    }
-  }
-  $self->flush_gene_Clusters;
-  $self->gene_Clusters(@new_clusters);
-  my @unclustered = $self->unclustered_Genes;
-  return @new_clusters;
-}
- 
-######################################################################################
-
-=head2 pair_Genes
-
-  This method crates one GeneCluster object per benchmark gene and then PAIR them with predicted genes
-  according to their exon overlap. As a default it takes the genes stored in the GeneComparison object 
-  as data fields (or attributes) '_gene_array1' and '_gene_array2'. It can also accept instead as argument
-  an array of genes to be paired, but then information about their gene-type is lost (to be solved). 
-  The result is put into $self->{'_gene_clusters'} and $self->{'_unclustered_genes'} 
-  
-=cut
-
-sub pair_Genes {
-
-  my ($self) = @_;
-  my (@genes1,@genes2);
-
-  @genes1 =  @{ $self->{'_gene_array1'} };
-  @genes2 =  @{ $self->{'_gene_array2'} };
-
-  #### first sort the genes by the left-most position coordinate ####
-  my %start_table;
-  my $i=0;
-  foreach my $gene (@genes1){
-    $start_table{$i}=_get_start_of_Gene($gene);
-    $i++;
-  }
-  my @sorted_genes=();
-  foreach my $k ( sort { $start_table{$a} <=> $start_table{$b} } keys %start_table ){
-    push (@sorted_genes, $genes1[$k]);
-  }
-  @genes1 = @sorted_genes;
-  
-  %start_table = (); 
-  $i=0;
-  foreach my $gene (@genes2){
-    $start_table{$i}=_get_start_of_Gene($gene);
-    $i++;
-  }
-  @sorted_genes=();
-  foreach my $k ( sort { $start_table{$a} <=> $start_table{$b} } keys %start_table ){
-    push (@sorted_genes, $genes2[$k]);
-  }
-  @genes2 = @sorted_genes;
-
-  #### PAIR the genes ####
-  #### creating a new cluster for each gene in the benchmark set ### 
-
-  foreach my $gene (@genes2){
-    # create a GeneCluser object
-    my $cluster=Bio::EnsEMBL::Utils::GeneCluster->new();
-    my ($type1,$type2) = $self->gene_Types;
-    $cluster->gene_Types($type1,$type2);
-    $cluster->put($gene);
-    $self->gene_Clusters($cluster);
-  }
-  
- GENE:   while (@genes1){
-    my $gene = shift @genes1;
-    
-    foreach my $cluster ($self->gene_Clusters){	
-      my $gene_in_cluster = $cluster->get_first_Gene ;  # read first gene from GeneCluster object
-      
-      if ( _compare_Genes($gene,$gene_in_cluster) ){	
-	$cluster->put_Genes($gene);       # put gene into GeneCluster object
-	next GENE;
-      }
-    }
-    #### an arbitary cluster containing the set of predicted genes that do not overlap with any from the 
-    #### benchmark set
-    my $unclustered = new Bio::EnsEMBL::Utils::GeneCluster;
-    $unclustered->gene_Types($self->gene_Types);
-    $unclustered->put_Genes($gene);  # NOTE: this holds unclustered predicted genes, but does not
-                                     # know about unpaired benchmark genes
-    
-    $self->unclustered_Genes($unclustered); # push the cluster into an array	 
-  }
-  return 1;
-}
-#########################################################################
-
-=head2 _get_start_of_Gene()
-
-  function to get the left-most coordinate of the exons of the gene (start position of a gene).
-  For genes in the strand 1 it reads the gene object and it returns the start position of the 
-  first exon. For genes in the strand -1 it picks the last exon and reads the start position; this
-  is not the proper start of the gene but it is the left-most coordinate.
-
-=cut
-
-sub _get_start_of_Gene {
-  my $gene = shift @_;
-  my @exons = $gene->get_all_Exons;
-  my $st;
-  if ($exons[0]->strand == 1) {
-    @exons = sort {$a->start <=> $b->start} @exons;
-    $st = $exons[0]->start;
-  } else {
-    @exons = sort {$b->start <=> $a->start} @exons;
-    $st = $exons[$#exons]->start;
-  }
-  return $st;
-}
-
-##########################################################################
-
-=head2 _get_strand_of_Gene()
-
-=cut
-
-sub _get_strand_of_Gene {
-  my $gene = shift @_;
-  my @exons = $gene->get_all_Exons;
-  
-  if ($exons[0]->strand == 1) {
-    return 1;
-  }
-  else{
-    return -1;
-  }
-  return 0;
-}
-
-
-
-####################################################################################
-
-=head2 cluster_Transcripts_by_Gene()
-
-it first clusters all genes using cluster_Genes and then cluster the transcripts within each gene
-cluster. If one has already made a gene-clustering, one can pass the array of clusters
-to cluster_Transcripts_by_Gene() to avoid repetition of work
-
-See cluster_Transcripts for a different way of clustering transcripts.
-
-=cut
-
-sub cluster_Transcripts_by_Gene {
-  my ($self,$array) = @_;
-  my @gene_clusters;    
-  my @transcript_clusters;
-  if ($array){
-    push(@gene_clusters,@$array);
-  }
-  else{
-    @gene_clusters = $self->cluster_Genes;
-  }
-  foreach my $cluster (@gene_clusters){
-    
-    my @transcripts;   
-    foreach my $gene ( $cluster->get_Genes ){
-      push ( @transcripts, $gene->each_Transcript );
-    }
-    push ( @transcript_clusters, $self->cluster_Transcripts(@transcripts) );
-    # we pass the array of transcripts to be clustered to the method cluster_Transcripts
-  }
-  return @transcript_clusters;
-}
-
-####################################################################################
-
-=head2 cluster_Transcripts()
-
-  This method cluster all the transcripts in the gene arrays passed to the GeneComparison constructor new().
-  It also accepts an array of transcripts to be clustered. The clustering is done according to exon
-  overlaps, which is implemented in the function _compare_Transripts.
-  This method returns an array of TranscriptCluster objects.
-
-=cut
-  
-sub cluster_Transcripts {
-  my ($self,@transcripts) = @_;
- 
-  if ( !defined(@transcripts) ){                        
-    my @genes = ( @{ $self->{'_gene_array1'} }, @{ $self->{'_gene_array2'} } );
-    foreach my $gene (@genes){
-      my @more_transcripts = $gene->each_Transcript;
-      push ( @transcripts, @more_transcripts );
-    }
-  }
-  # we do the clustering with the array @transcripts like we do with genes
-  
-  # first sort the transcripts by their start position coordinate
-  my %start_table;
-  my $i=0;
-  foreach my $transcript (@transcripts){
-    my $start;
-    my $seqname;
-    my @exons = $transcript->get_all_Exons;
-    @exons = sort { $a->start <=> $b->start } @exons;
-    if ( $exons[0]->start > $exons[0]->end){
-      $start = $exons[0]->end;
-    }
-    else{
-      $start = $exons[0]->start;
-    }
-    $start = $transcript->start_exon->start;
-    $start_table{$i} = $start;
-    $i++;
-    
-    # if some exons in the transcript fall outside the vc, they will not be converted to vc coordinates
-    foreach my $exon (@exons){
-      unless ( $seqname ){
-	$seqname = $exon->seqname;
-      }
-      unless ($seqname eq $exon->seqname){
-	$self->warn("transcript ".$transcript->stable_id." (".$transcript->dbID.") is partly".
-		    " outside the contig");
-	last;
-      }
-    }
-  }
-  my @sorted_transcripts=();
-  foreach my $pos ( sort { $start_table{$a} <=> $start_table{$b} } keys %start_table ){
-    push (@sorted_transcripts, $transcripts[$pos]);
-  }
-  @transcripts = @sorted_transcripts;
-#  # test
-#  foreach my $tran (@transcripts){
-#    print STDERR "\ntranscript: ".$tran->stable_id."internal_id: ".$tran->dbID."\n";
-#    foreach my $exon ($tran->get_all_Exons){
-#      print STDERR $exon->seqname." ".$exon->start.":".$exon->end."\n";
-#    }
-#    print STDERR "\n";
-#  }
-  my $time1 = time();
-  my @clusters;
-
-  # create a new cluster 
-  my $cluster=Bio::EnsEMBL::Utils::TranscriptCluster->new();
-  my $cluster_count = 1;
-
-  # put the first transcript into these cluster
-  $cluster->put_Transcripts( $sorted_transcripts[0] );
-  push( @clusters, $cluster );
-  # $self->transcript_Clusters($cluster);
-  
-  # loop over the rest of the genes
- LOOP1:
-  for (my $c=1; $c<=$#sorted_transcripts; $c++){
-    my $found=0;
-  LOOP:
-    foreach my $t_in_cluster ( $cluster->get_Transcripts){       
-      if ( _compare_Transcripts( $sorted_transcripts[$c], $t_in_cluster ) ){	
-	$cluster->put_Transcripts( $sorted_transcripts[$c] );                       
-	$found=1;
-	next LOOP1;
-      }
-    }
-    # if not in this cluster compare to the previous clusters:
-    
-    # to set a limit in the number of previous cluster to look up
-    # set for example my $limit = 6; and then add to the while condition :
-    # while ( !(...) &&  !($lookup > $limit) ){ 
-    
-    my $limit = 14;
-    if ( $found == 0 && $cluster_count > 1 ){
-      my $lookup = 1;
-      while ( !($cluster_count <= $lookup )  &&  !($lookup > $limit)  ){ 
-	#print STDERR "cluster_count: $cluster_count, looking at ".($cluster_count - $lookup)."\n";
-	my $previous_cluster = $clusters[ $cluster_count - 1 - $lookup ];
-	foreach my $t_in_cluster ( $previous_cluster->get_Transcripts ){
-	  if ( _compare_Transcripts( $sorted_transcripts[$c], $t_in_cluster ) ){	
-	    $previous_cluster->put_Transcripts( $sorted_transcripts[$c] );                       
-	    $found=1;
-	    next LOOP1;
-	  }
-	}
-	$lookup++;
-      }
-    }
-    # if not-clustered create a new TranscriptCluser
-    if ($found==0){  
-      $cluster = new Bio::EnsEMBL::Utils::TranscriptCluster; 
-      $cluster->put_Transcripts( $sorted_transcripts[$c] );
-      $cluster_count++;
-      push( @clusters, $cluster );
-      # $self->transcript_Clusters( $cluster );
-    }
-  }
-  my $time2 = time();
-  my $time = $time2-$time1;
-  print STDERR "clustering time: ".$time."\n";
-  return @clusters;
-}
-
-
-####################################################################################
-
-sub compare_CDS{
-  my ($self,$clusters) = @_;
-  my @total_ann_unpaired;
-  my @total_pred_unpaired;
-  my @total_ann_doubled;
-  my @total_pred_doubled;
-  my $pairs_count = 0;
-
- GENE:
-  foreach my $gene_cluster (@$clusters){
-  my ( $pairs, $ann_unpaired, $pred_unpaired, $ann_doubled, $pred_doubled) = $gene_cluster->pair_Transcripts;
-    my @pairs = @{ $pairs };
-    my @ann_unpaired;
-    my @pred_unpaired;
-    my @ann_doubled;
-    my @pred_doubled;
-    push ( @ann_unpaired , @{ $ann_unpaired }  );
-    push ( @pred_unpaired, @{ $pred_unpaired } );
-    push ( @ann_doubled  , @{ $ann_doubled }   );    
-    push ( @pred_doubled , @{ $pred_doubled }  );
-    
-    push ( @total_ann_doubled  , @ann_doubled   );
-    push ( @total_pred_doubled , @pred_doubled  );
-    push ( @total_ann_unpaired , @ann_unpaired  ); 
-    push ( @total_pred_unpaired, @pred_unpaired );
-
-    # for each pair we keep track of the exon comparison
- 
-   PAIR:
-    foreach my $pair ( @pairs ){
-      $pairs_count++;
-       my ($prediction,$annotation) = $pair->get_Transcripts;
-    }
-  }
-}
-
-####################################################################################
-
-=head2 compare_Exons()
-
-  Title   : compare_Exons()
-  Usage   : my %stats = $gene_comparison->compare_Exons(\@gene_clusters);
-  Function: This method takes an array of GeneCluster objects, pairs up all the transcripts in each 
-            cluster and then go through each transcript pair trying to match the exons. 
-            It keeps track of the exons present in the gene_array2 passed to new() that are missing in
-            the corresponding transcript of gene_array1, also of those exons that are overpredicted in
-            in gene_array1 with respect to gene_array2. It also gives a generic exon_mismatch result.
-            Setting the flag 'coding' in the arguments we can compare the CDSs
-  
-  Example : look in ...ensembl/misc-scripts/utilities/run_GeneComparison for a pipeline-like example
-  Returns : a hash with the arrays of transcript pairs (each pair being a Bio::EnsEMBL::Utils::TranscriptCluster)
-            as values, and the number of missing exons as keys, useful to make a histogram
-  Args    : an arrayref of Bio::EnsEMBL::Utils::GeneCluster objects 
-
-=cut
-  
-sub compare_Exons{
-  my ($self,$clusters,$coding) = @_;
-  
-  my @pairs_missing;                # holds the transcript pairs that have one or more exons missing
-  my $pairs_count;                  # this will count the total number of pairs compared
-  my $total_missing_exon_count = 0; # counts the number of overpredicted exons
-  my $total_over_exon_count    = 0; # same for overpredictions
-  my $total_exon_mismatches    = 0; # includes the above two cases
- 
-  my %over_exon_position;           # keeps track of the positions of the exons
-  my %missing_exon_position;        # keeps track of the positions of the exons
-  
-  my $total_prediction_matched = 0; # numerator in the computation of the sensitivity/specificity
-                                    #  ( = TruePositive )
-
-  my $total_annotation_length  = 0; # denominator in the computation of the sensitivity
-                                    #  ( = TruePositive + FalseNegative )
-
-  my $total_prediction_length  = 0; # denominator in the computation of the specificity
-                                    #  ( = TruePositive + FalsePositive )
-
-  my $sum_sensitivity          = 0;
-  my $sum_specificity          = 0;
-  my $average_sensitivity      = 0;
-  my $average_specificity      = 0;
-  my $total_sensitivity        = 0;
-  my $total_specificity        = 0;
-
-  if ( !defined( $clusters ) ){
-    $self->throw( "Must pass an arrayref of Bio::EnsEMBL::Utils::GeneCluster objects");
-  } 
-  if ( !$$clusters[0]->isa( 'Bio::EnsEMBL::Utils::GeneCluster' ) ){
-    $self->throw( "Can't process a [$$clusters[0]], you must pass a Bio::EnsEMBL::Utils::GeneCluster" );
-  }
-  
-  # warn about the comparison that is about to happen
-  my ($annotation_types,$prediction_types) = $$clusters[0]->gene_Types;
-  my ($message1,$message2) = ( '','');
-  my $messageA = "Comparing exons in gene-types: ";
-  foreach my $type ( @$annotation_types ){
-    $message1 .= $type.", ";
-  }
-  my $messageB = " with exons in gene-types: ";
-  foreach my $type ( @$prediction_types ){
-    $message2 .= $type.", ";
-  }
-  print STDERR $messageA.$message1.$messageB.$message2."...\n";
-
-  my %missing;       # this hash holds the transcript pairs with one, two, etc... missing exons
-  my %overpredicted; # similarly for overpredicted exons
-  my %mismatches;    # includes all of the above
-
-  my @total_ann_unpaired;
-  my @total_pred_unpaired;
-  my @total_ann_doubled;
-  my @total_pred_doubled;
-  my $exact_matches   = 0; # counts the number of exact matching exons
-  my $exon_pair_count = 0; # counts the number of exons
-
-  my $cluster_count   = 1;
-
-  # we check for missing exons in genes of type $type2 in each gene cluster
- GENE:
-  foreach my $gene_cluster (@$clusters){
-    print STDERR "\nIn gene-cluster $cluster_count\n";
-    $cluster_count++; 
-
-    # pair_Transcripts returns (\@pairs,\@ann_unpaired,\@pred_unpaired,\@ann_doubled,\@pred_doubled)
-    print STDERR "pairing up transcripts ...\n";
-    my ( $pairs, $ann_unpaired, $pred_unpaired, $ann_doubled, $pred_doubled) = $gene_cluster->pair_Transcripts;
-    my @pairs = @{ $pairs };
-    my @ann_unpaired;
-    my @pred_unpaired;
-    my @ann_doubled;
-    my @pred_doubled;
-    push ( @ann_unpaired , @{ $ann_unpaired }  );
-    push ( @pred_unpaired, @{ $pred_unpaired } );
-    push ( @ann_doubled  , @{ $ann_doubled }   );    
-    push ( @pred_doubled , @{ $pred_doubled }  );
-    
-    push ( @total_ann_doubled  , @ann_doubled   );
-    push ( @total_pred_doubled , @pred_doubled  );
-    push ( @total_ann_unpaired , @ann_unpaired  ); 
-    push ( @total_pred_unpaired, @pred_unpaired );
-
-    # for each pair we keep track of the exon comparison
- 
-   PAIR:
-    foreach my $pair ( @pairs ){
-      $pairs_count++;
-      
-      # we match the exons in the pair
-      my ($printout, $missing_stats, $over_stats, $mismatch_stats, $match_stats) = 
-	$self->_match_Exons($pair, $coding);
-  
-      # where 
-      # printout{ pair }{ exon_number }  = [ exon/no link, exon/no link, extra comments      ];
-      # $missing_stats                   = [ $missing_exon_count , \%missing_exon_position   ];
-      # $over_stats                      = [ $over_exon_count    , \%over_exon_position      ];
-      # $mismatch_stats                  = [ $exon_mismatch_count                            ];
-      # $match_stats                     = [ $exon_pair_count    , $thispair_exact_matches   ];
-      
-
-      my $missing_exon_count             = $$missing_stats[0];
-      my %thispair_missing_exon_position = %{ $$missing_stats[1] };
-      
-      my $over_exon_count                = $$over_stats[0];
-      my %thispair_over_exon_position    = %{ $$over_stats[1] };
-      
-      my $exon_mismatch_count            = $$mismatch_stats[0];
-      
-      my $thispair_exon_pair_count       = $$match_stats[0];
-      my $thispair_exact_matches         = $$match_stats[1];
-
-      #$exon_mismatch_count = $missing_exon_count + $over_exon_count;
-      push ( @{ $missing{ $missing_exon_count } }    , $pair );
-      push ( @{ $overpredicted{ $over_exon_count } } , $pair );
-      push ( @{ $mismatches{ $exon_mismatch_count } }, $pair );
-
-      $total_missing_exon_count += $missing_exon_count;
-      $total_over_exon_count    += $over_exon_count;
-      $total_exon_mismatches    += $exon_mismatch_count;
-      $exact_matches            += $thispair_exact_matches;
-      $exon_pair_count          += $thispair_exon_pair_count;
-
-      foreach my $key ( keys( %thispair_missing_exon_position ) ){
-	$missing_exon_position{ $key } += $thispair_missing_exon_position{ $key };
-      }
-
-      foreach my $key ( keys( %thispair_over_exon_position ) ){
-	$over_exon_position{ $key } += $thispair_over_exon_position{ $key };
-      }
-      
-      # print out info about this pair
-      $self->_to_String( $pair, $printout );
-
-    }        # end of  PAIR  loop      
-  }          # end of  GENE  loop
- 
-      
-  # we recover info from this transcript pair comparison
-
-  # print out the results
-  print STDERR "Total number of transcript pairs: ".$pairs_count."\n";
-  print STDERR "Transcripts unpaired: ".scalar( @total_ann_unpaired )." from annotation, and ". 
-    scalar( @total_pred_unpaired )." from prediction\n";
-  foreach my $tran ( @total_ann_unpaired ){
-    print STDERR $tran->stable_id."\n";
-  }
-  foreach my $tran ( @total_pred_unpaired ){
-    print STDERR $tran->stable_id."\n";
-  }
-
-  print STDERR "Transcripts repeated: ".scalar( @total_ann_doubled )." from annotation, and ".
-    scalar( @total_pred_doubled )." from prediction\n";
-  foreach my $tran ( @total_ann_doubled ){
-    print STDERR $tran->stable_id."\n";
-  }
-  foreach my $tran ( @total_pred_doubled ){
-    print STDERR $tran->stable_id."\n";
-  }
-
-  print STDERR "\n";
-  print STDERR "Exact matches                 : ".$exact_matches." out of ".$exon_pair_count."\n";  
-  print STDERR "Total exon mismatches         : ".$total_exon_mismatches."\n";
-  print STDERR "Exons missed by the prediction: ".$total_missing_exon_count."\n";
-  foreach my $key ( keys( %missing_exon_position ) ){
-    printf STDERR " position %5s = %2d missed\n", ($key , $missing_exon_position{$key});
-  }
-  print STDERR "Exons overpredicted           : ".$total_over_exon_count."\n";
-  foreach my $key ( keys( %over_exon_position ) ){
-    printf STDERR " position %2s = %2d overpredicted\n", ($key , $over_exon_position{$key});
-  }
-  print STDERR "\n";
-
-  # print out the transcripts pairs with at least one exon missing
-  print STDERR "Exons of genes ".$message1." which are missing in genes ".$message2.":\n";
-  foreach my $key ( sort { $a <=> $b } ( keys( %missing ) ) ){
-    my $these_pairs = scalar( @{ $missing{$key} } );
-    my $percentage  = sprintf "%.2f", 100*$these_pairs/$pairs_count;
-    print STDERR "\n$these_pairs transcript pairs with $key exon(s) missed: $percentage percent\n";
-    foreach my $pair ( @{ $missing{ $key } } ){
-      print STDERR $pair->to_String."\n";
-    }
-  }
-
-  # print out the transcripts pairs with at least one exon overpredicted
-  print STDERR "Exons of genes ".$message2." which are overpredicted with respect to genes ".$message1.":\n";
-  foreach my $key ( sort { $a <=> $b } ( keys( %overpredicted ) ) ){
-    my $these_pairs = scalar( @{ $overpredicted{$key} } );
-    my $percentage  = sprintf "%.2f", 100*$these_pairs/$pairs_count;
-    print STDERR "\n$these_pairs transcript pairs with $key exon(s) overpredicted: $percentage percent\n";
-    foreach my $pair ( @{ $overpredicted{ $key } } ){
-      print STDERR $pair->to_String."\n";
-    }
-  }
-
-  # print out the transcripts pairs with at least one exon mismatch
-  print STDERR "Exons mismatches between genes ".$message1." and ".$message2.":\n";
-  foreach my $key ( sort { $a <=> $b } ( keys( %mismatches ) ) ){
-    my $these_pairs = scalar( @{ $mismatches{$key} } );
-    my $percentage  = sprintf "%.2f", 100*$these_pairs/$pairs_count;
-    print STDERR "\n$these_pairs transcript pairs with $key exon(s) mismatch: $percentage percent\n";
-    foreach my $pair ( @{ $mismatches{ $key } } ){
-      print STDERR $pair->to_String."\n";
-    }
-  }
-  # we return the hash with the arrays of transcript pairs as values, and 
-  # the number of missing exons as keys, that can be used to make a histogram
-  return %missing;
-}
-
-####################################################################################
-
-=head2 _match_Exons()
-  
-  Title   : _match_Exons
-  Function: this is the actual method that tries to match exons and store the info
-            in hashes and then return them. This is done per Transcript Pair
-  Args    : a pair of transcripts in the form of a TranscriptCluster object
-
-=cut
-
-sub _match_Exons{
-  my ($self, $pair, $coding ) = @_;
-  
-  my $prediction_matched  = 0; #  ( = TruePositive )
-  my $annotation_length   = 0; #  ( = TruePositive + FalseNegative )
-  my $prediction_length   = 0; #  ( = TruePositive + FalsePositive )
-
-  my $missing_exon_count  = 0;
-  my $over_exon_count     = 0; # counts the number of overpredicted exons
-  my $exon_mismatch_count = 0;
-  my $exon_pair_count     = 0;
-  my $exact_matches       = 0;
-  
-  my %missing_exon_position;
-  my %over_exon_position;
-
-  # the order is given by the order they are put into the pair in GeneCluster
-  my ($prediction,$annotation) = $pair->get_Transcripts;
-  
-  # get the exons
-  my (@ann_exons,@pred_exons);
-  if ($coding){
-    
-    # get only the CDS from the exons... if there is a translation
-    if ( $annotation->translation ){
-      @ann_exons = $annotation->translateable_exons;
-    }
-    else{
-      print STDERR "transcript ".$annotation->stable_id." has no translation, skipping this pair:\n";
-      print STDERR $pair->to_String."\n";
-      next PAIR;
-    }
-    if ( $prediction->translation ){
-      @pred_exons= $prediction->translateable_exons;
-    }
-    else{
-      print STDERR "transcript ".$prediction->stable_id." has no translation, skipping this pair:\n";
-      print STDERR $pair->to_String."\n";
-      next PAIR;
-    }
-  }
-  else{
-    @ann_exons  = $annotation->get_all_Exons;
-    @pred_exons = $prediction->get_all_Exons;
-  }
-  
-  # compute TruePositive + FalseNegative
-  foreach my $exon (@ann_exons){
-    $annotation_length       += $exon->length;
-  }
-  
-  # compute TruePositive + FalsePositive
-  foreach my $exon (@pred_exons){
-    $prediction_length       += $exon->length;
-  }
-  
-  # order exons according to the strand
-  if ( $ann_exons[0]->strand == 1 ){
-    @ann_exons  = sort{ $a->start <=> $b->start } @ann_exons;
-    @pred_exons = sort{ $a->start <=> $b->start } @pred_exons;
-  }
-  if ( $ann_exons[0]->strand == -1 ){
-    @ann_exons  = sort{ $b->start <=> $a->start } @ann_exons;
-    @pred_exons = sort{ $b->start <=> $a->start } @pred_exons;
-  }
-  
-  # now we link the exons, but first, a bit of formatted info
-  print  "\nComparing transcripts:\n";
-  print STDERR $pair->to_String;
-  
-  my %link;
-  my $start=0;    # start looking at the first one
-  my @buffer;     # buffer that keeps track of the skipped exons in @exons2 
-
-  my $sensitivity;
-  my $specificity;
-  my $printout;
-  my $printout_count = 1;
-  
-  $$printout{ $pair }{ 0 } = [ "annotation", "prediction", "" ];
-
-  # Note: variables start at ZERO, but in the print-outs we shift them to start at ONE
- EXONS1:
-  for (my $i=0; $i<=$#ann_exons; $i++){
-    my $foundlink = 0;
-    
-  EXONS2:
-    for (my $j=$start; $j<=$#pred_exons; $j++){
-      
-      # compare $ann_exons[$i] with $pred_exons[$j] 
-      if ( $ann_exons[$i]->overlaps($pred_exons[$j]) ){
-	
-	# if you've found a link, check first whether there is anything left unmatched in @buffer
-	if ( @buffer && scalar(@buffer) != 0 ){
-	  foreach my $exon_number ( @buffer ){
-	    $$printout{ $pair }{ $printout_count } = ["no link",$exon_number,""];
-	    $printout_count++;
-	    $over_exon_count++;
-	    
-	    if ( $exon_number == 1 ){
-	      $over_exon_position{'first'}++;
-	    }
-	    elsif ( $exon_number == $#pred_exons+1 ){
-	      $over_exon_position{'last'}++;
-	    }
-	    else {
-	      $over_exon_position{ $exon_number }++;
-	    }	     
-	  }
-	}
-	$foundlink = 1;
-	$exon_pair_count++;
-
-	# then check whether it is exact
-	if ( $ann_exons[$i]->equals( $pred_exons[$j] ) ){
-	  $$printout{ $pair }{ $printout_count } = [ ($i+1) , ($j+1) , "exact" ];
-	  $printout_count++;
-	  
-	  $prediction_matched += $pred_exons[$j]->length;
-	  $exact_matches++;
-	}
-	
-	# or there is a mismatch in the number of bases
-	else{              
-	  my $message = '';
-	  if ( $ann_exons[$i]->start != $pred_exons[$j]->start ){
-	    my $mismatch = ($ann_exons[$i]->start - $pred_exons[$j]->start);
-	    my $absolute_mismatch = abs( $mismatch );
-	    my $msg;
-	    if ( $mismatch > 0 ){
-	      $msg = "prediction has $absolute_mismatch extra bases in the";
-	    }
-	    elsif( $mismatch < 0 ){
-	      $msg = "prediction misses $absolute_mismatch bases in the";
-	    }
-	    if ( $ann_exons[$i]->strand == 1 ){
-	      $msg .= " 5' end, ";
-	    }
-	    elsif ( $ann_exons[$i]->strand == -1 ){
-	      $msg .= " 3' end, ";
-	    }
-	    $message .= $msg;
-	  }
-	  if (  $ann_exons[$i]->end  != $pred_exons[$j]->end   ){
-	    my $mismatch = ($ann_exons[$i]->end  -  $pred_exons[$j]->end  );
-	    my $absolute_mismatch = abs( $mismatch );
-	    my $msg;
-	    if ( $mismatch > 0 ){
-	      $msg = "prediction misses $absolute_mismatch bases in the";
-	    }
-	    elsif( $mismatch < 0 ){
-	      $msg = "prediction has $absolute_mismatch extra bases in the";
-	    }
-	    if ( $ann_exons[$i]->strand == 1 ){
-	      $msg .= " 3' end, ";
-	    }
-	    elsif ( $ann_exons[$i]->strand == -1 ){
-	      $msg .= " 5' end, ";
-	    }	      
-	    $message .= $msg;
-	  }
-	  $$printout{ $pair }{ $printout_count } = [ ($i+1) , ($j+1) , $message ];
-	  $printout_count++;
-	}
-	$start += scalar(@buffer)+1;
-	
-	# we start a new buffer
-	@buffer = ();  
-	next EXONS1;
-      }
-      # if no overlap, skip this one
-      else {  
-	# keep this info in a @buffer if you haven't exhausted all checks in @exons2  
-	if ( $j<$#pred_exons ){
-	  push ( @buffer, ($j+1) );
-	}
-	# if you got to the end of @exons2 and found no link, ditch the @buffer
-	elsif ( $j == $#pred_exons ){ 
-	  @buffer = ();
-	}
-	# and get outta here              
-	next EXONS2;
-      }
-    }   # end of EXONS2 loop
-    
-    # found no link for $ann_exons[$i], go to the next one
-    if ( $foundlink == 0 ){  
-      $$printout{ $pair }{ $printout_count } = [ ($i+1) , "no link"  , "" ];
-      $printout_count++;
-      $missing_exon_count++;
-      if ( $i+1 == 1 ){
-	$missing_exon_position{'first'}++; 
-      }
-      elsif ( $i+1 == $#ann_exons+1 ){  
-	$missing_exon_position{'last'}++; 
-      }
-      else {  
-	$missing_exon_position{ $i+1 }++; 
-      }	  
-      # see whether there is any feature we could have used 
-      # WARNING: this method works but it is extremely slow
-      #my ($similarity_features, $prediction_features) = $self->_missed_exon_Evidence( $pair, $ann_exons[$i] );
-    }
-  }       # end of EXONS1 loop
-
-  # stats
-  $exon_mismatch_count = $over_exon_count + $missing_exon_count;
-  my $missing_stats  = [ $missing_exon_count , \%missing_exon_position ];
-  my $over_stats     = [ $over_exon_count    , \%over_exon_position    ];
-  my $mismatch_stats = [ $exon_mismatch_count                          ];
-  my $match_stats    = [ $exon_pair_count    , $exact_matches          ];
-
-  # we return printout{ pair }{ exon_number } = [ exon/no link, exon/no link, extra comments ]
-  return ( $printout, $missing_stats, $over_stats, $mismatch_stats, $match_stats );
-}
-
-####################################################################################
-
-sub input_id {  my $self = shift @_;
-  my ($chr,$chrstart,$chrend) = @_;
-  if ( $chr && $chrstart && $chrend ){
-    $self->{'_chr_name'}  = $chr;
-    $self->{'_chr_start'} = $chrstart;
-    $self->{'_chr_end'}   = $chrend;
-  }
-  return ( $self->{'_chr_name'}, $self->{'_chr_start'}, $self->{'_chr_end'} );
-}
-
-####################################################################################
-
-
-=head2 _missed_exon_Evidence
-
-Function: It searches for possible evidence that we could have used to predict the missed exon passed as argument
-
-=cut
-
-sub _missed_exon_Evidence{
-  my ($self, $pair, $exon) = @_;
-
-  # take the missed exon (virtual contig) coordinates
-  my ( $start, $end ) = ( $exon->start, $exon->end );
-
-  # we need to get info from the annotation database
-  my $dbname    = $::db1_conf{'dbname'};
-  my $dbuser    = $::db1_conf{'user'};
-  my $path      = $::db1_conf{'path'};
-  my $host      = $::db1_conf{'host'};
-  my $genetypes = $::db1_conf{'genetypes'}; # this is an arrayref 
-  
-  my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(
-					      -host             => $host,
-					      -user             => $dbuser,
-					      -dbname           => $dbname,
-					     );
-
-  $db->assembly_type($path);
-  my $sgp = $db->get_StaticGoldenPathAdaptor;
-  
-  # get info of where we are sitting:
-  my ($chr,$chrstart,$chrend) = $self->input_id;
-  unless ( $chr && $chrstart && $chrend ){
-    $self->throw( "You must provide a chr_name, chr_start and a chr_end" );
-  }
-  
-  # get a virtual contig for that piece we're interested in
-  my ($start_pos,$end_pos) = ( $chrstart + $start, $chrstart + $end);
-  my $vcontig = $sgp->fetch_VirtualContig_by_chr_start_end($chr,$start_pos,$end_pos);
-  
-  # get features from that piece
-  my @similarity_features = $vcontig->get_all_SimilarityFeatures;
-  my @prediction_features = $vcontig->get_all_PredictionFeatures;
-
-  print STDERR scalar(@similarity_features)." similarity features retrieved in this range\n";
-  print STDERR scalar(@prediction_features)." prediction features retrieved in this range\n";
-
-  return ( \@similarity_features, \@prediction_features );
-  
-}
-
-#####################################################################################  
-
-=head2 exon_Density()
-
-=cut
-
-sub exon_Density{
-  my ($self, @transcripts) = @_;  
-  my $sum_density;
-  foreach my $tran (@transcripts){
-    my $exon_span;
-    my @exons = $tran->get_all_Exons;
-    @exons = sort { $a->start <=> $b->start } @exons;
-    my $transcript_length = $exons[$#exons]->end - $exons[0]->start;
-    foreach my $exon ( $tran->get_all_Exons ){
-      $exon_span += $exon->length;
-    }
-    $sum_density += $exon_span/$transcript_length;
-  }
-  return $sum_density;
-}
-
-####################################################################################
-
-
-=head2 _to_String();
-
-=cut
-
-sub _to_String{
-  my ($self, $pair, $printout ) = @_;
-  my $count = 0;
-  while ( $$printout{ $pair }{ $count } ){
-    my $msg;
-    my ( $str1, $str2, $str3 ) = ( '', '', '' );
-    
-    # exons in annotation (or 'no link' )
-    $str1 = ${ $$printout{$pair}{$count} }[0];
-
-    # exons in prediction (or 'no link' )
-    $str2 = ${ $$printout{$pair}{$count} }[1];
-
-    # comments: exact, mismatch bases
-    $str3 = ${ $$printout{$pair}{$count} }[2];
-    
-    if ( $count == 0 ){
-      $msg = sprintf "%10s   %-2s  %s\n", ( $str1, $str2, $str3 );
-    }
-    else{
-      $msg = sprintf "%8s <----> %-2s  %s\n", ( $str1, $str2, $str3 );
-    }
-    print STDERR $msg;
-    $count++;
-  }
-}
-  
-####################################################################################
-
-=head2 get_Exon_Statistics()
-
-  This method produces an histogram with the number of exon pairs per percentage overlap.
-  The percentage overlap of two exons, say e1 and e2, is calculated in _compare_Transcripts method as
-  100*intersection(e1,e2)/max_length(e1,e2). It takes whole exons, i.e. without chopping out the 
-  non-coding part. This method returns a hash containing the number of occurences as values and
-  the integer percentage overlap as keys
-
-=cut
-  
-sub get_Exon_Statistics{
-
-  my ($self,$array1,$array2) = @_;
-  my (@transcripts1,@transcripts2);
-  my %stats;
-  if ($array1 && $array2){         # we can pass two arrays references (to transcripts) as argument
-    @transcripts1 = @{$array1};
-    @transcripts2 = @{$array2};
-    
-  }
-  else{                            # or else read the transcripts from the gene_array data fields
-    my @genes1 = @{ $self->{'_gene_array1'} };
-    foreach my $gene (@genes1){
-      my @more_transcripts = $gene->each_Transcript;
-      push ( @transcripts1, @more_transcripts );
-    }
-    my @genes2 = @{ $self->{'_gene_array2'} };
-    foreach my $gene (@genes2){
-      my @more_transcripts = $gene->each_Transcript;
-      push ( @transcripts2, @more_transcripts );
-    }
-  }
-  foreach my $t1 (@transcripts1){
-    foreach my $t2 (@transcripts2){
-      if ( _compare_Transcripts($t1,$t2) ){
-	my %partial_stats = _exon_Statistics($t1,$t2); # produces the stats for the current two transcripts
-	                                               # The hash holds number of occurences as values and
-	                                               # integer percentage overlap as keys
-	foreach my $k ( keys %partial_stats ) {
-	  $stats{$k} += $partial_stats{$k};            # Here it adds up to the overall value
-	}
-      }
-    }
-  }
-  return %stats;
-}
-
-#########################################################################
-
-=head2 _exon_Statistics()
-
-This internal function reads two transcripts and calculates the percentage of overlap 
-between their exons = (INTERSECT($exon1,$exon2))/MAX($exon1,$exon2)
-
-=cut
-
-sub _exon_Statistics {
-  my ($transcript1,$transcript2) = @_;
-  my @exons1 = $transcript1->get_all_Exons; # transcripts get their exons in order
-  my @exons2 = $transcript2->get_all_Exons;
-
-  my %stats;
-
-  foreach my $exon1 (@exons1){
-  
-    foreach my $exon2 (@exons2){
-
-      if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){
-	
-	# calculate the percentage of exon overlap = (INTERSECT($exon1,$exon2))/MAX($exon1,$exon2)
-	my $max;
-	if ( ($exon1->length) < ($exon2->length) ){
-	  $max = $exon2->length;
-	}
-	else{
-	  $max = $exon1->length;
-	}
-	# compute the overlap extent
-	
-	my ($s,$e);  # start and end coord of the overlap
-	my ($s1,$e1) = ($exon1->start,$exon1->end);
-	my ($s2,$e2)= ($exon2->start,$exon2->end);
-	if ($s1<=$s2 && $s2<$e1){
-	  $s=$s2;
-	}
-	if ($s1>=$s2 && $e2>$s1){
-	  $s=$s1;
-	}
-	if ($e1<=$e2 && $e1>$s2){
-	  $e=$e1;
-	}
-	if ($e2<=$e1 && $e2>$s1){
-	  $e=$e2;
-	}
-	my $common = ($e - $s + 1);
-	my $percent = int( (100*$common)/$max );
-	$stats{$percent}++;	
-      }
-    }
-  }
-  return %stats;
-}    
-
-####################################################################################
-
-=head2 get_Coding_Exon_Statistics()
-
-  The same aim as the get_Exon_Statistics method but chopping the non-coding part from the exons.
-  This method returns a hash containing the number of occurences as values and
-  the integer percentage overlap as keys
-
-=cut
-  
-sub get_Coding_Exon_Statistics{
-
-  my ($self,$array1,$array2) = @_;
-  my (@transcripts1,@transcripts2);
-  my %stats;
-  if ($array1 && $array2){       # we can pass two arrays references of transcripts as argument
-    @transcripts1 = @{$array1};
-    @transcripts2 = @{$array2};
-    
-  }
-  else{                          # or else read the transcripts from the gene_array data fields
-    my @genes1 = @{ $self->{'_gene_array1'} };
-    foreach my $gene (@genes1){
-      my @more_transcripts = $gene->each_Transcript;
-      push ( @transcripts1, @more_transcripts );
-    }
-    my @genes2 = @{ $self->{'_gene_array2'} };
-    foreach my $gene (@genes2){
-      my @more_transcripts = $gene->each_Transcript;
-      push ( @transcripts2, @more_transcripts );
-    }
-  }
-  # in order to get info about the non-coding regions of the transcripts we have
-  # to get translation objects for them, however, some of them do not have it defined.
-  # In those cases we just ignore the comparison.
-  
-  foreach my $t1 (@transcripts1){
-    foreach my $t2 (@transcripts2){
-	if ($t1->translation && $t2->translation){
-	  my %partial_stats = _coding_Exon_Statistics($t1,$t2);
-	  foreach my $k ( keys %partial_stats ) {
-	    $stats{$k} += $partial_stats{$k};
-	  }
-	}
-	else{
-	  print "Transcript without translation:\n";
-	  if (!$t1->translation){
-	    print $t1->stable_id."\n";
-	  }
-	  if (!$t2->translation){
-	    print $t2->stable_id."\n";
-	  }
-	  print "\n";
-	}
-    }
-  }
-  return %stats;
-}
-
-####################################################################################  
-
-=head2 _coding_Exon_Statistics()
-
-This internal function reads two transcripts and calculates the percentage 
-of overlap between the coding exons = (INTERSECT($exon1,$exon2))/MAX($exon1,$exon2)
-
-=cut
-
-sub _coding_Exon_Statistics {         
-
-  # the coding region may start in any exon, not necessarily the first one
-  # and may end in any one as well
-  my ($transcript1,$transcript2) = @_;
-
-  my @exons1 = $transcript1->get_all_Exons;
-  my @exons2 = $transcript2->get_all_Exons;
-
-  my $translation1 = $transcript1->translation;
-  my $translation2 = $transcript2->translation;
-
-
-  # IDs of the exons where the coding region starts and ends
-  my ($s_id1,$e_id1) = ($translation1->start_exon_id,$translation1->end_exon_id);
-  my ($s_id2,$e_id2) = ($translation2->start_exon_id,$translation2->end_exon_id);
-
-
-  # identify those exons in each transcript
-  my ($s_exon1,$e_exon1);  # these will be the exons where the coding region (starts,ends) in the 1st transcript
-
-  foreach my $exon1 (@exons1){
-    if ($exon1->dbID eq $s_id1){
-      $s_exon1 = $exon1;
-    }
-    if ($exon1->dbID eq $e_id1){
-      $e_exon1 = $exon1;
-    }
-  }
-  my ($s_exon2,$e_exon2);  # these will be the exons where the coding region (starts,ends) in the 2nd transcript
-  foreach my $exon2 (@exons2){
-    if ($exon2->dbID eq $s_id2){
-      $s_exon2 = $exon2;
-    }
-    if ($exon2->dbID eq $e_id2){
-      $e_exon2 = $exon2;
-    }
-  }
-print "Exon of 2ndt transcript\n";
-foreach my $e2 (@exons2){
-print "Exon ".$e2->strand." ".$e2->start." ".$e2->end."\n";
-}
-
-
-  # take these exons and those in between these exons (since only these exons contain coding region)
-  my @coding_exons1;
-
-  foreach my $exon1 (@exons1){
-	if ($exon1->strand eq 1){
-    	if ( $exon1->start >= $s_exon1->start && $exon1->start <= $e_exon1->end ){
-      		push ( @coding_exons1, $exon1 );
-		}
-	}else{
-    	if ( $exon1->start >= $e_exon1->end && $exon1->start <= $s_exon1->start ){
-     		push ( @coding_exons1, $exon1 );
-		}
-	}
-  }
-print "coding region start: ".$s_exon2->start." end: ".$e_exon2->end."\n";
-
-
-
-  my @coding_exons2;
-
-
-	foreach my $exon2 (@exons2){
-	if ($exon2->strand eq 1){
-        	if ( $exon2->start >= $s_exon2->start && $exon2->start <= $e_exon2->end ){
-            	push ( @coding_exons2, $exon2 );
-        	}
-		}else{
-    		if ( $exon2->start >= $e_exon2->end && $exon2->start <= $s_exon2->start ){
-      		push ( @coding_exons2, $exon2 );
-			}
-		}
-  	}
-
-  @exons1 = @coding_exons1;
-  @exons2 = @coding_exons2;
-    
-  # start and end of coding regions 
-  # relative to the origin of the first and last exons where the coding region starts
-  my ($s_code1,$e_code1) = ($translation1->start,$translation1->end);
-  my ($s_code2,$e_code2) = ($translation2->start,$translation2->end);
- 
-  # here I hold my stats results
-  my %stats;
-  
-  foreach my $exon1 (@exons1){
-
-    foreach my $exon2 (@exons2){
-
-      if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){
-	# start and end coord of the overlap
-	my ($s,$e)=(0,0);  
-
-	# exon positions
-	my ($s1,$e1) = ($exon1->start,$exon1->end);
-	my ($s2,$e2)= ($exon2->start,$exon2->end);
-	
-	#exon lengths
-	my ($l1,$l2) = ($exon1->length, $exon2->length);
-
-	# One has to chop off the non-coding part out of the first and last exons in 
-	# the coding region : ($s_id1,$e_id1) and ($s_id2,$e_id2)
-	# Recall that ($s_code1,$e_code1) and ($s_code2,$e_code2) are the start/end of the coding regionS
-	
-	my ($fs1,$fe1,$fs2,$fe2)=(0,0,0,0); # put a flag on the first and last exons to print them out
-
-	if ($exon1->dbID eq $s_id1){
-	  $s1 = $s1 + $s_code1 - 1;
-	  $l1 = $e1 - $s1 + 1;
-	  $fs1=1;
-	}
-	if ($exon1->dbID eq $e_id1){
-	  $e1 = $s1 + $e_code1 - 1;
-	  $l1 = $e1 - $s1 + 1;
-	  $fe1=1;
-	}
-	if ($exon2->dbID eq $s_id2){
-	  $s2 = $s2 + $s_code2 - 1;
-	  $l2 = $e2 - $s2 + 1;
-	  $fs2=1;
-	}
-	if ($exon2->dbID eq $e_id2){
-	  $e2 = $s2 + $e_code2 - 1;
-	  $l2 = $e2 - $s2 + 1;
-	  $fe2=1;
-	}
-
-	# calculate the percentage of exon overlap = (INTERSECT($exon1,$exon2))/MAX($exon1,$exon2)
-	my $max;
-	if ( $l1 < $l2 ){
-	  $max = $l2;
-	}
-	else{
-	  $max = $l1;
-	}
-	
-	# compute the overlap extent
-	if ($s1<=$s2 && $s2<$e1){
-	  $s=$s2;
-	}
-	if ($s1>=$s2 && $e2>$s1){
-	  $s=$s1;
-	}
-	if ($e1<=$e2 && $e1>$s2){
-	  $e=$e1;
-	}
-	if ($e2<=$e1 && $e2>$s1){
-	  $e=$e2;
-	}
-	my $common = ($e - $s + 1);
-	if ($common != 0){ # we check since after cutting the non-coding piece we might lose the overlap
-	  my $percent = int( (100*$common)/$max );
-	  $stats{$percent}++;	
-	  if ($fs1 || $fs2 ) { 
-	    print "(".$s1.",".$e1.")";
-	    if ($fs1){
-	      print "-> start coding exon";
-	    }
-	    print "\t".$exon1->stable_id."\n";
-	    
-	    print "(".$s2.",".$e2.")";
-	    if ($fs2) { 
-	      print "-> start coding exon";
-	    }
-	    print "\t".$exon2->stable_id."\n";
-	  }
-	  if ( $fe1 || $fe2 ) { 
-	    print "(".$s1.",".$e1.")";
-	    if ($fe1){
-	      print "-> end coding exon";
-	    }
-	    print   "\t".$exon1->stable_id."\n";
-	    
-	    print "(".$s2.",".$e2.")";
-	    if ($fe2) { print "-> end coding exon";
-		      }  
-	    print "\t".$exon2->stable_id."\n";
-	  }
-	  if ($fs1 || $fs2 || $fe1 || $fe2){ print "(".$s.",".$e.") Overlap --> ".$percent."\n\n";}
-	}
-      }
-    }
-  }
-  return %stats;
-}    
-
-
-####################################################################################  
-  
-=head2 get_unmatched_Genes()
-
-  This function returns those genes that have not been identified with any other gene in the
-  two arrays (of type1 and type2) passed to GeneComparison->new(). If we have clustered the genes
-  before, then we have probably filled the array $self->{'_unclustered_genes'}, so we can read out the 
-  unmatched genes from there. If not, it derives the unmatched genes
-  directly from the gene_arrays passed to the GeneComaprison object. 
-
-  It returns two references to two arrays of GeneCluster objects. 
-  The first one corresponds to the genes of type1 which have not been identified in type2, 
-  and the second one holds the genes of type2 that have not been 
-  identified with any of the genes of type1. 
-  
-=cut
-
-sub get_unmatched_Genes {
-  my $self = shift @_;
-  
-  # if we have already the unclustered genes, we can read them out from $self->{'_unclustered_genes'}
-  if ($self->unclustered ){
-    my @types1 = @{ $self->{'_type_array1'} };
-    my @types2 = @{ $self->{'_type_array2'} };
-    my @unclustered = $self->unclustered; 
-    my (@array1,@array2);
-    foreach my $cluster (@unclustered){
-      my $gene = ${ $cluster->get_Genes }[0]; 
-      foreach my $type1 ( @{ $self->{'_type_array1'} } ){
-	if ($gene->type eq $type1){
-	  push ( @array1, $gene );
-	}
-      }
-      foreach my $type2 ( @{ $self->{'_type_array2'} } ){
-	if ($gene->type eq $type2){
-	  push ( @array2, $gene );
-	}
-      }
-    }
-    return (\@array1,\@array2);
-  }
-
-  # if not, we can compute them directly
-  my (%found1,%found2);
-
-  foreach my $gene1 ( @{ $self->{'_gene_array1'} } ){
-    foreach my $gene2 ( @{ $self->{'_gene_array2'} } ){
-      if ( _compare_Genes($gene1,$gene2)){
-	$found1{$gene1->stable_id} = 1;
-	$found2{$gene2->stable_id} = 1;
-      }
-    }
-  }
-  my @unmatched1;
-  foreach my $gene1 ( @{ $self->{'_gene_array1'} } ){
-    unless ( $found1{$gene1->stable_id} ){
-      my $new_cluster = Bio::EnsEMBL::Utils::GeneCluster->new();
-      $new_cluster->gene_Types($self->gene_Types);
-      $new_cluster->put_Genes($gene1);
-      push (@unmatched1, $new_cluster);
-    }
-  }
-  my @unmatched2;
-  foreach my $gene2 ( @{ $self->{'_gene_array2'} } ){
-    unless ( $found2{$gene2->stable_id} ){
-      my $new_cluster = Bio::EnsEMBL::Utils::GeneCluster->new();
-      $new_cluster->gene_Types($self->gene_Types);
-      $new_cluster->put_Genes($gene2);
-      push (@unmatched2, $new_cluster);
-    }
-  }
-  return (\@unmatched1,\@unmatched2);
-}
-
-#########################################################################
-
-=head2 get_fragmented_Genes()
-
-it returns an array of GeneCluster objects, where these objects contain
-fragmented genes: genes of a given type which are overlapped with more than one
-gene of the other type. In order to avoid repetition of work, if a gene-clustering
-has been already performed, one can pass the array of clusters as an argument.
-This, however, is only allowed if _type_array1 and _type_array2 are defined, since the method
-makes use of them.
-
-=cut
-
-sub get_fragmented_Genes {
-  my ($self,@array) = @_;
-  my @clusters;
-  my @fragmented;
-
-  if (@array && $self->{'_type_array1'} && $self->{'_type_array2'} ){
-    @clusters = @array;
-  }
-  else{
-    @clusters = $self->cluster_Genes;
-  }
-  
-  foreach my $cluster (@clusters){
-    
-    my @genes = $cluster->get_Genes;
-    my (@type1,@type2);
-    
-    foreach my $gene ( @genes ){
-     
-      my $type = $gene->type;
-      push( @type1, grep /$type/, @{ $self->{'_type_array1'} } );
-      push( @type2, grep /$type/, @{ $self->{'_type_array2'} } );
-      # @type1 and @type2 hold all the occurrences of gene-types 1 and 2, respectively
-      if ( ( @type1 && scalar(@type1)>1 ) || ( @type2 && scalar(@type2) >1 ) ) {
-	push (@fragmented, $cluster);
-      }
-    }
-  }
-  return @fragmented;
-}
-
-#########################################################################
-
-
-=head2 get_3prime_overlaps()
-
-=cut
-
-#########################################################################
-
-=head2 get_5prime_overlaps()
-
-=cut
-
-#########################################################################
-
-
-=head2 _compare_Genes()
-
- Title: _compare_Genes
- Usage: this internal function compares the exons of two genes on overlap
-
-=cut
-
-sub _compare_Genes {         
-  my ($gene1,$gene2) = @_;
-  my @exons1 = $gene1->get_all_Exons;
-  my @exons2 = $gene2->get_all_Exons;
-  
-  foreach my $exon1 (@exons1){
-  
-    foreach my $exon2 (@exons2){
-
-      if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){
-	return 1;
-      }
-    }
-  }
-  return 0;
-}      
-#########################################################################
-
-
-
-=head2 _compare_Transcripts()
-
- Title: _compare_Transcripts()
- Usage: this internal function compares the exons of two transcripts according to overlap
-        and returns the number of overlaps
-=cut
-
-sub _compare_Transcripts {         
-  my ($transcript1,$transcript2) = @_;
-  my @exons1   = $transcript1->get_all_Exons;
-  my @exons2   = $transcript2->get_all_Exons;
-  my $overlaps = 0;
-  
-  foreach my $exon1 (@exons1){
-    
-    foreach my $exon2 (@exons2){
-
-      if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){
-	$overlaps++;
-      }
-    }
-  }
-  return $overlaps;  # we keep track of the number of overlaps to be able to choose the best match
-}    
-
-#########################################################################
-
-=head2 unclustered_Genes()
-
- Title  : unclustered_Genes()
- Usage  : This function stores and returns an array of GeneClusters with only one gene (unclustered) 
- Args   : a Bio::EnsEMBL::Utils::GeneCluster object
- Returns: an array of Bio::EnsEMBL::Utils::GeneCluster objects
-
-=cut
-
-sub unclustered_Genes{
-    my ($self, @unclustered) = @_;
- 
-    if (@unclustered)
-    {
-       $self->throw("Input @unclustered is not a Bio::EnsEMBL::Utils::GeneCluster\n")
-       unless $unclustered[0]->isa('Bio::EnsEMBL::Utils::GeneCluster');
-
-        push ( @{ $self->{'_unclustered_genes'} }, @unclustered);
-    }
-    return @{ $self->{'_unclustered_genes'} };
-}
-
-#########################################################################
-
-=head2 gene_Clusters()
-
-	Title: gene_Clusters()
-	Usage: This function stores and returns an array of clusters
-
-=cut
-sub gene_Clusters {
-    my ($self, @clusters) = @_;
- 
-    if (@clusters)
-    {
-        push (@{$self->{'_gene_clusters'}}, @clusters);
-    }
-    return @{$self->{'_gene_clusters'}};
-}
-
-#########################################################################
-
-=head2 transcript_Clusters()
-
-	Title: transcript_Clusters()
-	Usage: This function stores and returns an array of transcript_Clusters
-
-=cut
-sub transcript_Clusters {
-    my ($self, @clusters) = @_;
- 
-    if (@clusters)
-    {
-        push (@{$self->{'_transcript_clusters'}}, @clusters);
-    }
-    return @{$self->{'_transcript_clusters'}};
-}
-#########################################################################
-
-=head2 flush_transcript_Clusters()
-
-	Usage: This function cleans up the array in $self->{'_transcript_clusters'}
-
-=cut
-
-sub flush_transcript_Clusters {
-    my ($self) = @_;
-    $self->{'_transcript_clusters'} = [];
-}
-
-
-#########################################################################
-
-=head2 flush_gene_Clusters()
-
-	Usage: This function cleans up the array in $self->{'_gene_clusters'}
-
-=cut
-
-sub flush_gene_Clusters {
-    my ($self) = @_;
-    $self->{'_gene_clusters'} = [];
-}
-
-#####################################################################################
-
-
-
-1;
-
diff --git a/modules/Bio/EnsEMBL/Utils/GeneComparison_conf.pl b/modules/Bio/EnsEMBL/Utils/GeneComparison_conf.pl
deleted file mode 100644
index df9998065b..0000000000
--- a/modules/Bio/EnsEMBL/Utils/GeneComparison_conf.pl
+++ /dev/null
@@ -1,50 +0,0 @@
-# configuration file for the gene_comparison_script.pl
-# based on the config files for the pipeline
-#
-# Written by Eduardo Eyras
-# eae@sanger.ac.uk
-
-# Databases in 120 schema (schema as for the 120 release)
-#
-# ensembl110_new_schema@ensrv3, path_type='UCSC', genetype = '1'
-#
-# homo_sapiens_core_120@ensrv3, path_type='UCSC', genetype='ensembl'
-#
-# chr20_120@ecs1b, path_type='Sanger_02', genetypes='HUMACE-Novel_CDS','HUMACE-Known',etc...
-#
-
-
-BEGIN {
-package main;
-
-%db1_conf = (
-	     'host'      => "ecs1b",
-
-	     'dbname'    => "chr20_120",
-
-	     'path'      => "Sanger_02",
-
-	     'user'      => "ensro",
-	     
-	     # genetypes should be array_ref, to allow for multiple types
-	     'genetypes'  => ["HUMACE-Novel_CDS","HUMACE-Known"],
-	     
-);
-
-	      
-%db2_conf = (
-	     'host'      => "ensrv3",
-
-	     'dbname'    => "homo_sapiens_core_120",
-	     	     
-	     'path'      => "UCSC",
-	    	    
-	     'user'      => "ensro",
-
-	     'genetypes'  => ["ensembl"], 
-	     
-);
-
-}
-
-1;
diff --git a/modules/Bio/EnsEMBL/Utils/Report.pm b/modules/Bio/EnsEMBL/Utils/Report.pm
deleted file mode 100644
index 23e47b979c..0000000000
--- a/modules/Bio/EnsEMBL/Utils/Report.pm
+++ /dev/null
@@ -1,183 +0,0 @@
-=head1 NAME - Bio::EnsEMBL::Utils::Report.pm
-
-=head1 DESCRIPTION
-
-=head1 SYNOPSIS
-
-=head1 CONTACT
-
-eae@sanger.ac.uk
-
-=cut
-
-# Let the code begin ...
-
-package Bio::EnsEMBL::Utils::Report;
-
-use vars qw(@ISA);
-use strict;
-use diagnostics;
-
-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-use Bio::EnsEMBL::Utils::GeneCluster;
-use Bio::EnsEMBL::Utils::TranscriptCluster;
-use Bio::EnsEMBL::Utils::GeneComparison;
-use Bio::EnsEMBL::Root;
-
-@ISA = qw(Bio::EnsEMBL::Root);
-
-####################################################################################
-
-=head2 new()
-
-=cut
-
-sub new {
-  
-  my ($class) = @_;
-  
-  if (ref($class)){
-    $class = ref($class);
-  }
-  my $self = {};
-  bless($self,$class);
-  return $self;
-}
-
-####################################################################################
-
-=head2 header()
-
-get/set method for a string header
-
-=cut
-
-sub header{
-  my ($self,$header) = @_;
-  if ($header){
-    $self->{'_header'} = $header;
-  }
-  return $self->{'_header'};
-}
-
-
-####################################################################################
-
-=head2 histogram()
-
-get/set method for a hash, aimed to be used to create histograms
-
-=cut
-
-sub histogram{
-  my ($self,$histogram) = @_;
-  if ($histogram){
-    $self->{'_histogram'} = $histogram;
-  }
-  return $self->{'_histogram'} = $histogram;
-}
-
-####################################################################################
-
-=head2 list()
-
-get/set method for an array of arrays, aimed to be used to print out lists
-
-=cut
-
-sub list{
-  my ($self,$list) = @_;
-
-  # $list is a reference to an array, whose elementes are arrays
-  if ($list){
-    $self->{'_list'} = $list;
-  }
-  return $self->{'_list'};
-}
-
-####################################################################################
-
-=head2 to_plainTXT()
-
-sends the info in the containers {'_list'}, {'_histogram'}, ... to plain text output
-to file passed as argument, or as default, to SDTOUT
-
-=cut
-
-sub to_plainTXT(){
-  my ($self,$filename) = @_;
-  
-  my $title     =    $self->title;
-  my %histogram = %{ $self->histogram };
-  my @list      = @{ $self->list      };
-
-  # here we should be able to choose where to send the output
-  # define OUTPUT handle
-  #if ($filename){
-  #  open OUT,">$filename";
-  #}
-  #else{
-  #  OUT = STDOUT;
-  #}
-  print STDOUT $title."\n";
-  foreach my $sublist ( @list ){
-    foreach my $item ( @$sublist ){
-      print STDOUT $item."\t";
-    }
-    print STDOUT "\n";
-  }
-
-  foreach my $key ( keys(%histogram) ){
-    print STDOUT $key." ---> ".$histogram{$key}."\n";
-  }
-}
-
-####################################################################################
-
-=head2 histogram_to_R()
-
-prints out the {'_histogram'} in format to be read by R and make a histogram
-
-=cut
-
-sub histogram_to_R(){
-  ($self, $filename) = @_;
-  %histogram = $self->histogram;
-  
-  print STDERR "printing histogram to file $filename...\n";
-
-  # open filehandle
-  my $opening = open OUT, ">$filename";;
-  unless ($opening){
-    $self->throw("Can't open file $filename");
-  }
-
-  # here we process the contents of %histogram and put it into $filename
-  foreach my $key ( keys(%histogram) ){
-    
-    # ...
-    # put here the right format
-    print OUT $key."\t".$histogram{$key}."\n";
-
-  }
-
-  # close filehandle
-  my $closing = close OUT;
-  unless ($closing){
-    $self->throw("Can't close file $filename");
-  }
-}
-
-####################################################################################
-
-=head2 to_HTML
-
-sends the info in the containers {'_list'}, {'_histogram'}, ... to plain html format output
-
-=cut
-
-sub to_HTML{
-  my ($self,$filename) = @_;
-}
-
-
diff --git a/modules/Bio/EnsEMBL/Utils/TranscriptCluster.pm b/modules/Bio/EnsEMBL/Utils/TranscriptCluster.pm
deleted file mode 100644
index d50a1acb59..0000000000
--- a/modules/Bio/EnsEMBL/Utils/TranscriptCluster.pm
+++ /dev/null
@@ -1,560 +0,0 @@
-=head1 NAME
-
-TranscriptCluster
-
-=head1 SYNOPSIS
-
-
-=head1 DESCRIPTION
-
-This object holds one or more transcripts which have been clustered according to 
-comparison criteria external to this class (for instance, in the 
-method _compare_Transcripts of the class GeneComparison).
-Each TranscriptCluster object holds the IDs of the transcripts clustered and the beginning and end coordinates
-of each one (taken from the start and end coordinates of the first and last exon in the correspondig
-get_all_Exons array)
-
-It inherits from Bio::EnsEMBL::Root and Bio::RangeI. A TranscriptCluster is a range in the sense that it convers
-a defined extent of genomic sequence. It is also possible to check whether two clusters overlap (in range),
-is included into another cluster, etc...
-
-=head1 CONTACT
-
-eae@sanger.ac.uk
-
-=cut
-
-# Let the code begin ...
-
-package Bio::EnsEMBL::Utils::TranscriptCluster;
-use Bio::EnsEMBL::Transcript;
-use vars qw(@ISA);
-use strict;
-
-use Bio::EnsEMBL::Gene;
-use Bio::EnsEMBL::Root;
-use Bio::RangeI;
-
-@ISA = qw(Bio::EnsEMBL::Root,Bio::RangeI);
-
-=head1 METHODS
-
-=cut
-
-#########################################################################
-
-
-=head2 new()
-
-new() initializes the attributes:
-_transcript_array
-_transcriptID_array
-_start
-_end
-
-=cut
-
-sub new {
-  my ($class,$whatever)=@_;
-
-  if (ref($class)){
-    $class = ref($class);
-  }
-  my $self = {};
-  bless($self,$class);
-  
-  if ($whatever){
-    $self->throw( "Can't pass an object to new() method. Use put_Genes() to include Bio::EnsEMBL::Gene in cluster");
-  }
-
-  return $self;
-}
-#########################################################################
-
-=head1 Range-like methods
-
-Methods start and end are typical for a range. We also implement the boolean
-and geometrical methods for a range.
-
-=head2 start()
-
-  Title   : start
-  Usage   : $start = $transcript_cluster->end();
-  Function: get/set the start of the range covered by the cluster. This is re-calculated and set everytime
-            a new transcript is added to the cluster
-  Returns : a number
-  Args    : optionaly allows the start to be set
-
-=cut
-
-sub start{
-  my ($self,$start) = @_;
-  if ($start){
-    $self->throw( "$start is not an integer") unless $start =~/^[-+]?\d+$/;
-    $self->{'_start'} = $start;
-  }
-  return $self->{'_start'};
-}
-
-############################################################
-
-=head2 end()
-
-  Title   : end
-  Usage   : $end = $transcript_cluster->end();
-  Function: get/set the end of the range covered by the cluster. This is re-calculated and set everytime
-            a new transcript is added to the cluster
-  Returns : the end of this range
-  Args    : optionaly allows the end to be set
-          : using $range->end($end
-=cut
-
-sub end{
-  my ($self,$end) = @_;
-  if ($end){
-    $self->throw( "$end is not an integer") unless $end =~/^[-+]?\d+$/;
-    $self->{'_end'} = $end;
-  }
-  return $self->{'_end'};
-}
-
-############################################################
-
-=head2 length
-
-  Title   : length
-  Usage   : $length = $range->length();
-  Function: get/set the length of this range
-  Returns : the length of this range
-  Args    : optionaly allows the length to be set
-          : using $range->length($length)
-
-=cut
-
-sub length{
-  my $self = shift @_;
-  if (@_){
-    $self->confess( ref($self)."->length() is read-only");
-  }
-  return ( $self->{'_end'} - $self->{'_start'} + 1 );
-}
-
-############################################################
-
-=head2 strand
-
-  Title   : strand
-  Usage   : $strand = $transcript->strand();
-  Function: get/set the strand of the transcripts in the cluster.
-            The strand is set in put_Transcripts when the first transcript is added to the cluster, in that
-            that method there is also a check for strand consistency everytime a new transcript is added
-  Returns : the strandidness (-1, 0, +1)
-  Args    : optionaly allows the strand to be set
-        
-=cut
-
-sub strand{
-  my ($self,$strand) = @_;
-  if ($strand){
-    $self->{'_strand'} = $strand;
-  }
-  return $self->{'_strand'};
-}
-
-
-############################################################
-
-=head1 Boolean Methods
-
-These methods return true or false. They throw an error if start and end are
-not defined. They are implemented in Bio::RangeI.
-
- $cluster->overlaps($other_cluster) && print "Clusters overlap\n";
-
-=head2 overlaps
-
-  Title   : overlaps
-  Usage   : if($cluster1->overlaps($cluster)) { do stuff }
-  Function: tests if $cluster2 overlaps $cluster1 overlaps in the sense of genomic-range overlap,
-            it does NOT test for exon overlap.
-  Args    : arg #1 = a range to compare this one to (mandatory)
-            arg #2 = strand option ('strong', 'weak', 'ignore') 
-  Returns : true if the clusters overlap, false otherwise
-
-=cut
-
-=head2 contains
-
-  Title   : contains
-  Usage   : if($cluster1->contains($cluster2) { do stuff }
-  Function: tests whether $cluster1 totally contains $cluster2 
-  Args    : arg #1 = a range to compare this one to (mandatory)
-	             alternatively, integer scalar to test
-            arg #2 = strand option ('strong', 'weak', 'ignore') (optional)
-  Returns : true if the argument is totaly contained within this range
-
-=cut
-
-=head2 equals
-
-  Title   : equals
-  Usage   : if($cluster1->equals($cluster2))
-  Function: test whether the range covered by $cluster1 has the same start, end, length as the range 
-            covered by $cluster2
-  Args    : a range to test for equality
-  Returns : true if they are describing the same range
-
-=cut
-
-=head1 Geometrical methods
-
-These methods do things to the geometry of ranges, and return
-Bio::RangeI compliant objects or triplets (start, stop, strand) from
-which new ranges could be built. They are implemented  here and not in Bio::RangeI, since we
-want them to return a new TranscriptCluster object.
-
-=head2 overlap_extent
-
- Title   : overlap_extent
- Usage   : ($a_unique,$common,$b_unique) = $a->overlap_extent($b)
- Function: Provides actual amount of overlap between two different
-           ranges. Implemented already in RangeI
- Example :
- Returns : array of values for 
-           - the amount unique to a
-           - the amount common to both
-           - the amount unique to b
- Args    : 
-
-=cut
-
-#########################################################################
-
-=head2 intersection
-
-  Title   : intersection
-  Usage   : $intersection_cluster = $cluster1->intersection($cluster2)
-  Function: gives a cluster with the transcripts which fall entirely within the intersecting range of
-            $cluster1 and $cluster2
-  Args    : arg #1 = a cluster to compare this one to (mandatory)
-            arg #2 = strand option ('strong', 'weak', 'ignore') (not implemented here)
-  Returns : a TranscriptCluster object, which is empty if the intersection does not contain
-            any transcript
-=cut
-
-sub intersection{
-my ($self, $cluster) = @_;
-
-# if either is empty, return an empty cluster
-if ( scalar( $self->get_Transcripts) == 0 
-  $self->warn( "cluster $self is empty, returning an empty TranscriptCluster");
-  my $empty_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new();
-  return $empty_cluster;
-}
-
-if ( scalar( $cluster->get_Transcripts ) == 0 ){
-  $self->warn( "cluster $cluster is empty, returning an empty TranscriptCluster");
-  my $empty_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new();
-  return $empty_cluster;
-}
-
-my @transcripts = $self->get_Transcripts;
-push( @transcripts, $cluster->get_Transcripts);
-
-# make an unique list of transcripts, in case they are repeated
-my %list;
-foreach my $transcript (@transcripts){
-  $list{$transcript} = $transcript;
-}
-@transcripts = values( %list );
-
-my ($inter_start,$inter_end);
-my ($start1,$end1) = ($self->start   ,   $self->end);
-my ($start2,$end2) = ($cluster->start,$cluster->end);
-
-my $strand = $cluster->strand;
-
-# if clusters overlap, calculate the intersection
-if ( $self->overlaps( $cluster ) ){
-  if ( $start2 >= $start1 && $end2 >= $end1 ){
-    if ( $strand == 1){
-      $inter_start = $start2;
-      $inter_end   = $end1;
-    }
-    else{
-      $inter_start = $start1;
-      $inter_end   = $end2;
-    }
-  }
-  if ( $start2 >= $start1 && $end2 < $end1){
-    if ( $strand == 1){
-      $inter_start = $start2;
-      $inter_end   = $end2;
-    }
-    else{
-      $inter_start = $start1;
-      $inter_end   = $end1;
-    }
-  }
-  if ( $start2 < $start1 && $end2 < $end1 ){
-    if ( $strand == 1){
-      $inter_start = $start1;
-      $inter_end   = $end2;
-    }
-    else{
-      $inter_start = $start2;
-      $inter_end   = $end1;
-    }
-  }
-  if ( $start2 < $start1 && $end2 >= $end1 ){
-    if ( $strand == 1){
-      $inter_start = $start1;
-      $inter_end   = $end1;
-    }
-    else{
-      $inter_start = $start2;
-      $inter_end   = $end2;
-    }
-  }
-}
-else{
-  $self->warn( "clusters $self and $cluster do not intersect range-wise, returning an empty TranscriptCluster");
-  my $empty_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new();
-  return $empty_cluster;
-}
-
-my $inter_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new();
-my @inter_transcripts;
-
-# see whether any transcript falls within this intersection
-foreach my $transcript ( @transcripts ){
-  my ($start,$end) = ($self->_get_start($transcript), $self->_get_end($transcript));
-  if ($strand == 1 &&  $start >= $inter_start && $end <= $inter_end ){
-     $inter_cluster->put_Transcripts( $transcript );
-  }
-  elsif ( $strand == -1 && $start <= $inter_start && $end >= $inter_end ){
-     $inter_cluster->put_Transcripts( $transcript );
-  }
-}
-
-if ( scalar( $inter_cluster->get_Transcripts ) == 0 ){
-   $self->warn( "cluster $inter_cluster is empty, returning an empty TranscriptCluster");
-   return $inter_cluster;
-}
-else{
- 
-return $inter_cluster;
-
-}
-
-
-############################################################
-
-=head2 union
-
-  Title   : union
-  Usage   : $union_cluster = $cluster1->union(@clusters);
-  Function: returns the union of clusters 
-  Args    : a TranscriptCluster or list of TranscriptClusters to find the union of
-  Returns : the TranscriptCluster object containing all of the ranges
-
-=cut
-
-sub union{
-my ($self,@clusters) = @_;
-
-if ( ref($self) ){
- unshift @clusters, $self;
-}
-
-my $union_cluster = Bio::EnsEMBL::Utils::TranscriptCluster->new();
-my $union_strand;
-
-foreach my $cluster (@clusters){
- unless ($union_strand){
-  $union_strand = $cluster->strand;
- }
- unless ( $cluster->strand == $union_strand){
-  $self->warn("You're making the union of clusters in opposite strands");
- }
- $union_cluster->put_Transcripts($cluster->get_Transcripts);
-}
-
-return $union_cluster;
-
-}
-
-############################################################
-
-=head2 put_Transcripts()
-
-  function to include one or more transcripts in the cluster.
-  Useful when creating a cluster. It takes as argument an array of transcripts, it returns nothing.
-
-=cut
-
-sub put_Transcripts {
-  my ($self, @new_transcripts)= @_;
-  
-  if ( !$new_transcripts[0]->isa('Bio::EnsEMBL::Transcript') ){
-    $self->throw( "Can't accept a [ $new_transcripts[0] ] instead of a Bio::EnsEMBL::Transcript");
-  }
-  
-  my @starts = sort { $a <=> $b } map( { $self->_get_start($_) } @new_transcripts );
-  my @ends   = sort { $a <=> $b } map( { $self->_get_end($_)   } @new_transcripts );
-
-  # check strand consistency among transcripts
-  foreach my $transcript (@new_transcripts){
-    my @exons = $transcript->get_all_Exons;
-    unless ( $self->strand ){
-      $self->strand( $exons[0]->strand );
-    }
-    if ( $self->strand != $exons[0]->strand ){
-      $self->warn( "You're trying to put $tran in a cluster of opposite strand");
-    }
-  }
-
-  # if start is not defined, set it
-  unless ( $self->start ){
-    if ( $self->strand == 1 ){
-      $self->start( $starts[0] );
-    }
-    if ( $self->strand == -1 ){
-      $self->start( $starts[$#starts] );
-    }
-  }
-
-  # if end is not defined, set it
-  unless ( $self->end ){
-    if ( $self->strand == 1 ){
-      $self->end( $ends[$#ends]);
-    }
-    if ( $self->strand == -1 ){
-      $self->end( $ends[0] );
-    }
-  }
-  
-  # extend start and end if necessary as we include more transcripts
-  if ( $self->strand == 1 && $starts[0] < $self->start ){
-    $self->start( $starts[0] );
-  }
-  if ( $self->strand == -1 && $starts[$#starts] > $self->start ){
-    $self->start( $starts[$#starts] );
-  }
-  if ( $self->strand == 1 && $ends[$#ends] > $self->end ){
-    $self->end( $ends[$#ends] );
-  }
-  if ( $self->strand == -1 && $ends[0] < $self->end ){
-    $self->end( $ends[0] );
-  }
-
-  push ( @{ $self->{'_transcript_array'} }, @new_transcripts );
-
-}
-
-#########################################################################
-
-=head2 get_Transcripts()
-
-  it returns the array of transcripts in the GeneCluster object
-
-=cut
-
-sub get_Transcripts {
-  my $self = shift @_;
-  my @transcripts = @{ $self->{'_transcript_array'} };
-  return @transcripts;
-}
-
-#########################################################################
-
-=head2 to_String()
-
-  it returns a string containing the information about the transcripts in the TranscriptCluster object
-
-=cut
-
-sub to_String {
-  my $self = shift @_;
-  my $data='';
-  foreach my $tran ( @{ $self->{'_transcript_array'} } ){
-    my @exons = $tran->get_all_Exons;
-     
-    $data .= sprintf "Id: %-16s"             , $tran->stable_id;
-    $data .= sprintf "Contig: %-21s"         , $exons[0]->contig->id;
-    $data .= sprintf "Exons: %-3d"           , scalar(@exons);
-    $data .= sprintf "Start: %-9d"           , $self->_get_start($tran);
-    $data .= sprintf "End: %-9d"             , $self->_get_end  ($tran);
-    $data .= sprintf "Strand: %-3d"          , $exons[0]->strand;
-    $data .= sprintf "Exon-density: %3.2f\n", $self->exon_Density($tran);
-  }
-  return $data;
-}
-
-#########################################################################
-
-sub exon_Density{
-  my ($self, $transcript) = @_;  
-  my $density;
-  my $exon_span;
-  my @exons = $transcript->get_all_Exons;
-  @exons = sort { $a->start <=> $b->start } @exons;
-  my $transcript_length = $exons[$#exons]->end - $exons[0]->start;
-  foreach my $exon ( @exons ){
-    $exon_span += $exon->length;
-  }
-  $density = $exon_span/$transcript_length;
-  return $density;
-}
-
-#########################################################################
-
-=head2 _get_start()
-
- function to get the start position of a transcript - it reads the Bio::EnsEMBL::Transcript 
- object and it returns the start position of the first exon
-
-=cut
-
-sub _get_start {
-  my ($self,$transcript) = @_;
-  my @exons = $transcript->get_all_Exons;
-  my $st;
-  
-  if ($exons[0]->strand == 1) {
-    @exons = sort {$a->start <=> $b->start} @exons;
-    $st = $exons[0]->start;
-  } else {
-    @exons = sort {$b->start <=> $a->start} @exons;
-    $st = $exons[0]->end;
-  }
-
-  return $st;
-}
-
-#########################################################################
-
-=head2 _get_end()
-
- function to get the end position of a transcript - it reads the Bio::EnsEMBL::Transcript 
- object and it returns the end position of the last exon
-
-=cut
-
-sub _get_end {
-  my ($self,$transcript) = @_;
-  my @exons = $transcript->get_all_Exons;
-  my $end;
-  
-  if ($exons[0]->strand == 1) {
-    @exons = sort {$a->start <=> $b->start} @exons;
-    $end = $exons[$#exons]->end;
-  } else {
-    @exons = sort {$b->start <=> $a->start} @exons;
-    $end = $exons[$#exons]->start;
-  }
-  return $end;
-}
-#########################################################################
-
-1;
-- 
GitLab