Skip to content
Snippets Groups Projects
Commit a77a01f1 authored by Eduardo Eyras's avatar Eduardo Eyras
Browse files

Modified to keep track of the gene types

parent 13c3254c
No related branches found
No related tags found
No related merge requests found
......@@ -23,9 +23,15 @@ eae@sanger.ac.uk
# Let the code begin ...
package Bio::EnsEMBL::Utils::GeneCluster;
use Bio::EnsEMBL::Gene;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Gene;
use Bio::Root::RootI;
@ISA = qw(Bio::Root::RootI);
=head1 METHODS
=cut
......@@ -36,32 +42,30 @@ use strict;
=head2 new()
new() initializes the attributes:
_gene_array
_geneID_array
_start
_end
$self->{'_benchmark_types'}
$self->{'_prediction_types'}
$self->{'_benchmark_genes'}
$self->{'_prediction_genes'}
=cut
sub new {
my ($class,@args)=@_;
my ($class,$whatever)=@_;
if (ref($class)){
$class = ref($class);
}
my $self = {};
bless($self,$class);
$self->{'_gene_array'}= \@args; # array reference that holds the list of genes in the cluster
$self->{'_geneID_array'}=(); # array that holds the IDs of the genes
$self->{'_start'}={}; # hash that holds the start position of each gene
$self->{'_end'}={}; # hash that holds the end position of each gene
foreach my $gene (@args){
my ( $start , $end)=( _get_start($gene) , _get_end($gene) );
push ( @ {$self->{'_geneID_array'} }, $gene->id );
$self->{'_start'}->{$gene->id}=$start;
$self-> {'_end'} ->{$gene->id}=$end;
if ($whatever){
$self->throw( "Can't pass an object to new() method. Use put_Genes() to include Bio::EnsEMBL::Gene in cluster");
}
$self->{'_ys_gene'}= {};
%{$self->{'_statistics'}}=();
return $self;
}
......@@ -75,14 +79,25 @@ sub new {
=cut
sub put_Genes {
my ($self, @args)= @_;
my @new_genes = @args;
push ( @{ $self->{'_gene_array'} }, @new_genes );
foreach my $new_gene (@new_genes){
push ( @{ $self->{'_geneID_array'} }, $new_gene->id );
my ( $start , $end)=( _get_start($new_gene) , _get_end($new_gene) );
$self->{'_start'}->{$new_gene->id}=$start;
$self-> {'_end'} ->{$new_gene->id}=$end;
my ($self, @new_genes)= @_;
if ( !defined( $self->{'_benchmark_types'} ) || !defined( $self->{'_prediction_types'} ) ){
$self->throw( "Cluster lacks references to gene-types, unable to put the gene");
}
GENE:
foreach my $gene (@new_genes){
foreach my $type ( @{ $self->{'_benchmark_types'} } ){
if ($gene->type eq $type){
push ( @{ $self->{'_benchmark_genes'} }, $gene );
next GENE;
}
}
foreach my $type ( @{ $self->{'_prediction_types'} } ){
if ($gene->type eq $type){
push ( @{ $self->{'_prediction_genes'} }, $gene );
next GENE;
}
}
}
}
......@@ -96,10 +111,112 @@ sub put_Genes {
sub get_Genes {
my $self = shift @_;
my @genes = @{ $self->{'_gene_array'} };
my @genes;
if ( !defined( $self->{'_benchmark_genes'} ) && !defined( $self->{'_prediction_genes'} ) ){
$self->warn("The gene array you try to retrieve is empty");
@genes = ();
}
if ( $self->{'_benchmark_genes'} ){
push( @genes, @{ $self->{'_benchmark_genes'} } );
}
if ( $self->{'_prediction_genes'} ){
push( @genes, @{ $self->{'_prediction_genes'} } );
}
return @genes;
}
=head2 get_separated_Genes()
Handy method to get the genes in the genes in the cluster separated by type.
It returns two arrayrefs.
=cut
sub get_separated_Genes {
my ($self) = @_;
return ( $self->{'_benchmark_genes'}, $self->{'_prediction_genes'} );
}
#########################################################################
=head2 get_Gene_Count()
it returns the number of genes in the GeneCluster object
=cut
sub get_Gene_Count {
my $self = shift @_;
my $count =0;
if ( $self->{'_benchmark_genes'} ){
$count += scalar( @{ $self->{'_benchmark_genes'} } );
}
if ( $self->{'_prediction_genes'} ){
$count += scalar( @{ $self->{'_prediction_genes'} } );
}
#print STDERR "In GeneCluster.get_Gene_Count(), Count = ".$count."\n";
return $count;
}
#########################################################################
=head2 gene_Types()
It accepts two array references to set the types. One array holds the gene-types for the
benchmark genes and the other on for the predicted genes.
It can also be used to get the two type-arrays: ($types1, $types2) = $cluster->gene_Types;
The conventions throughout are (first entry: benchmark, second entry: prediction)
=cut
sub gene_Types {
my ($self, $benchmark_types, $prediction_types) = @_;
if ( $benchmark_types && $prediction_types ) {
$self->{'_benchmark_types'} = $benchmark_types;
$self->{'_prediction_types'} = $prediction_types;
}
return ($self->{'_benchmark_types'},$self->{'_prediction_types'});
}
#########################################################################
=head2 get_Genes_by_Type()
We can get the genes in each cluster of a given type.
We pass an arrayref containing the types we want to retrieve.
=cut
sub get_Genes_by_Type() {
my ($self,$types) = @_;
unless ($types){
$self->throw( "must provide a type");
}
my @genes = $self->get_Genes; # this should give them in order, but we check anyway
my @selected_genes;
foreach my $type ( @{ $types } ){
push ( @selected_genes, grep { $_->type eq $type } @genes );
}
return @selected_genes;
}
#########################################################################
=head2 get_first_Gene()
it returns the first gene in the cluster, which usually would be the benchmark gene
=cut
sub get_first_Gene {
my $self = shift @_;
return @{$self->{'_benchmark_genes'}}[0];
}
#########################################################################
=head2 string()
......@@ -112,27 +229,43 @@ sub get_Genes {
sub string {
my $self = shift @_;
my $data='';
foreach my $gene ( @{ $self->{'_gene_array'} } ){
my $id = $gene->id;
while (length($id)<16){
$id .=' ';
}
$data .= $id." "._get_start($gene)." "._get_end($gene)."\n";
foreach my $gene ( $self->get_Genes ){
$data .= $gene->id." ".$gene->type."\t".$self->get_start($gene)." ".$self->get_end($gene)."\n";
}
# foreach my $gene ( @{ $self->{'_benchmark_genes'} } ){
# my $id = $gene->id;
# #while (length($id)<16){
# # $id .=' ';
# #}
# $data .= $id." ".$gene->type."\t".$self->get_start($gene)." ".$self->get_end($gene)."\n";
# }
# foreach my $gene ( @{ $self->{'_prediction_genes'} } ){
# my $id = $gene->id;
# #while (length($id)<16){
# # $id .=' ';
# #}
# $data .= $id." ".$gene->type."\t".$self->get_start($gene)." ".$self->get_end($gene)."\n";
# }
return $data;
}
#########################################################################
=head2 _get_start()
=head2 get_start()
function to get the start position of a gene - it reads the gene object and it returns
the start position of the first exon
=cut
sub _get_start {
my $gene = shift @_;
sub get_start {
my ($self,$gene) = @_;
my @exons = $gene->each_unique_Exon;
my $st;
......@@ -142,21 +275,21 @@ sub _get_start {
} else {
@exons = sort {$b->start <=> $a->start} @exons; # they're read in opposite direction (from right to left)
$st = $exons[0]->end; # the start is the end coordinate of the right-most exon
} # which is here the first of the list @exons
} # which is here the first of the list of sorted @exons
return $st;
}
#########################################################################
=head2 _get_end()
=head2 get_end()
function to get the end position of a gene - it reads the gene object and it returns
the end position of the last exon
=cut
sub _get_end {
my $gene = shift @_;
sub get_end {
my ($self,$gene) = @_;
my @exons = $gene->each_unique_Exon;
my $end;
......@@ -170,6 +303,148 @@ sub _get_end {
return $end;
}
#########################################################################
#Adding new methods to calculate the prediction accuracies of the genes in this cluster
#########################################################################
=head2 nucleotide_level_accuracy()
function that calculates the difference between the annotated and predicted genes at a nucleotide level
returns the average sensitivity and specificity of the predictions
=cut
sub nucleotide_level_accuracy {
my ($self) = @_;
my @genes = $self->get_Genes;
my %statistics;
shift @genes; # the first gene in the array should be the yardstick gene
GENE: foreach my $gene (@genes){
my $count =0;
my ($sum_sn,$sum_sp) = (0,0);
my ($gene_sn,$gene_sp) ;
TRANS: foreach my $trans ($gene->each_Transcript) {
my @results=$self->evaluate_Transcripts($trans);
next TRANS unless @results;
$count ++; #provides a running counter of transcripts which overlap.
my ($trans_sn, $trans_sp) = @results;
$sum_sn += $trans_sn;
$sum_sp += $trans_sp;
}
$gene_sn = $sum_sn/$count;
$gene_sp = $sum_sp/$count;
my @gene_stats = ($gene_sn, $gene_sp);
$statistics{$gene->id} = [@gene_stats];
}
$self->statistics(%statistics);
}
#########################################################################
=head2 statistics()
returns a hash containing the statistics of each gene in this cluster
=cut
sub statistics {
my ($self,%stats) = @_;
if (%stats){
%{$self->{'_statistics'}}=%stats;
}
return %{$self->{'_statistics'}};
}
#########################################################################
=head2 evaluate_Transcripts()
function that compares a transcript with each transcript of the annotated gene in this cluster
=cut
sub evaluate_Transcripts {
my ($self,$trans) = @_;
my $yardstick = $self->get_first_Gene();
my $count=0;
my ($sum_sn, $sum_sp,$sn,$sp) = (0,0); # The sums and average sensitivity and specificity of this particular transcript WRT
# all transcripts in the yardstick gene in this cluster.
TRANS: foreach my $ys_trans($yardstick->each_Transcript) {
my $trans_tp = 0;
my @ys_exons = $ys_trans->translateable_exons;
foreach my $ys_exon (@ys_exons){
my @exons = $trans->translateable_exons;
EXON: while (@exons){
my $exon = shift @exons;
next EXON unless ($exon->overlaps($ys_exon) && ($exon->strand eq $ys_exon->strand));
my $overlap;
my ($exon_start,$exon_end) = ($exon->start,$exon->end);
my ($ys_start,$ys_end) = ($ys_exon->start,$ys_exon->end);
my ($start,$end);
if ($exon_start > $ys_start) { $start = $exon_start;}
else {$start = $ys_start;}
if ($exon_end < $ys_end) { $end = $exon_end;}
else {$end = $ys_end;}
$overlap = $end - $start;
$trans_tp += $overlap;
}
}
next TRANS unless ($trans_tp ne 0);
$count ++; #provides a running counter of transcripts which overlap.
my $trans_ap = _translateable_exon_length($ys_trans);
my $trans_pp = _translateable_exon_length($trans);
$sum_sn += sprintf("%.2f",($trans_tp)/($trans_ap)*100);
$sum_sp += sprintf("%.2f",($trans_tp)/($trans_pp)*100);
#$sum_sn += $trans_tp/$trans_ap; # sensitivity
#$sum_sp += $trans_tp/$trans_pp; #specificity
}
if ($count eq 0){
print STDERR "Count eq 0. is there an error?\n";
return 0 ;
}
$sn = $sum_sn/$count;
$sp = $sum_sp/$count;
my @statistics = ( $sn,$sp);
return @statistics;
}
#########################################################################
=head2 _translateable_exon_length()
internal function that returns the length of the translateable exons
=cut
sub _translateable_exon_length {
my ($trans)= @_;
my @exons = $trans->translateable_exons;
my $length = 0;
foreach my $ex (@exons) {
$length += $ex->length;
}
return $length;
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment