Commit 8b01156b authored by Elia Stupka's avatar Elia Stupka
Browse files

Further DBOLD modification

parent b58ad16c
#
# EnsEMBL module for Bio::EnsEMBL::DBOLD::Gene_Obj
#
# Cared for by Elia Stupka <elia@ebi.ac.uk>
#
# Copyright Elia Stupka
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
Bio::EnsEMBL::DBOLD::Gene_Obj - MySQL database adapter class for EnsEMBL genes, transcripts,
exons, etc.
=head1 SYNOPSIS
$gene = $gene_obj->get('HG45501');
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::DBOLD::Gene_Obj;
# Get a gene object from the database
my $gene = $gene_obj->get('HG45501', $db_obj);
=head1 DESCRIPTION
This is one of the objects contained in Bio:EnsEMBL::DBOLD::Obj,
dealing with Gene methods, such as writing and getting genes,
transcripts, translations, and exons.
The Obj object represents a database that is implemented somehow (you
shouldn\'t care much as long as you can get the object).
=head1 CONTACT
Elia Stupka: elia@ebi.ac.uk
=head1 APPENDIX
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::EnsEMBL::DBOLD::Gene_Obj;
use vars qw(@ISA);
use strict;
# Object preamble - inheriets from Bio::Root::Object
use Bio::Root::Object;
use Bio::EnsEMBL::DBOLD::Obj;
use Bio::EnsEMBL::DB::Gene_ObjI;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::DB::VirtualContig;
use DBI;
use Bio::EnsEMBL::StickyExon;
use Bio::EnsEMBL::DBOLD::DummyStatement;
use Bio::EnsEMBL::DB::Gene_ObjI;
@ISA = qw(Bio::EnsEMBL::DB::Gene_ObjI Bio::Root::Object);
# new() is inherited from Bio::Root::Object
# _initialize is where the heavy stuff will happen when new is called
sub _initialize {
my($self,$db_obj) = @_;
my $make = $self->SUPER::_initialize;
$db_obj || $self->throw("Database Gene object must be passed a db obj!");
$self->_db_obj($db_obj);
return $make; # success - we hope!
}
=head2 delete
Title : delete
Usage : $Gene_Obj->delete_Gene($gene_id)
Function: deletes a gene from the database, i.e. exons, transcripts, translations
Example : $geneobj->delete_Gene('ENSG00000019482')
Returns : nothing
Args : gene id
=cut
sub delete{
my ($self,$geneid) = @_;
my @trans;
my %exon;
my @translation;
# get out exons, transcripts for gene.
my $sth = $self->_db_obj->prepare("select id,translation from transcript where gene = '$geneid'");
$sth->execute;
while( my $rowhash = $sth->fetchrow_hashref) {
push(@trans,$rowhash->{'id'});
push(@translation,$rowhash->{'translation'});
}
foreach my $trans ( @trans ) {
my $sth = $self->_db_obj->prepare("select exon from exon_transcript where transcript = '$trans'");
$sth->execute;
while( my $rowhash = $sth->fetchrow_hashref) {
$exon{$rowhash->{'exon'}} =1;
}
}
foreach my $translation (@translation) {
my $sth2 = $self->_db_obj->prepare("delete from translation where id = '$translation'");
$sth2->execute;
}
# delete exons, transcripts, gene rows
foreach my $exon ( keys %exon ) {
my $sth = $self->_db_obj->prepare("delete from exon where id = '$exon'");
$sth->execute;
$sth = $self->_db_obj->prepare("delete from supporting_feature where exon = '$exon'");
$sth->execute;
}
foreach my $trans ( @trans ) {
my $sth= $self->_db_obj->prepare("delete from transcript where id = '$trans'");
$sth->execute;
$sth= $self->_db_obj->prepare("delete from exon_transcript where transcript = '$trans'");
$sth->execute;
}
$sth = $self->_db_obj->prepare("delete from gene where id = '$geneid'");
$sth->execute;
$sth = $self->_db_obj->prepare("delete from genetype where gene_id = '$geneid'");
$sth->execute;
}
=head2 delete_Exon
Title : delete_Exon
Usage : $obj->delete_Exon($exon_id)
Function: Deletes exon, including exon_transcript rows
Example : $obj->delete_Exon(ENSE000034)
Returns : nothing
Args : $exon_id
=cut
sub delete_Exon{
my ($self,$exon_id) = @_;
$exon_id || $self->throw ("Trying to delete an exon without an exon_id\n");
#Delete exon_transcript rows
my $sth = $self->_db_obj->prepare("delete from exon_transcript where transcript = '".$exon_id."'");
my $res = $sth ->execute;
#Delete exon rows
$sth = $self->_db_obj->prepare("delete from exon where id = '".$exon_id."'");
$res = $sth->execute;
$self->delete_Supporting_Evidence($exon_id);
}
=head2 delete_Supporting_Evidence
Title : delete_Supporting_Evidence
Usage : $obj->delete_Supporting_Evidence($exon_id)
Function: Deletes exon\'s supporting evidence entries
Example : $obj->delete_Supporting_Evidence(ENSE000034)
Returns : nothing
Args : $exon_id
=cut
sub delete_Supporting_Evidence {
my ($self,$exon_id) = @_;
$exon_id || $self->throw ("Trying to delete supporting_evidence without an exon_id\n");
my $sth = $self->_db_obj->prepare("delete from supporting_feature where exon = '" . $exon_id . "'");
my $res = $sth->execute;
}
=head2 get_all_Gene_id
Title : get_all_Gene_id
Usage : $geneobj->get_all_Gene_id
Function: Gets an array of ids for all genes in the current db
Example : $geneobj->get_all_Gene_id
Returns : array of ids
Args : none
=cut
sub get_all_Gene_id{
my ($self) = @_;
my @out;
my $sth = $self->_db_obj->prepare("select id from gene");
my $res = $sth->execute;
while( my $rowhash = $sth->fetchrow_hashref) {
push(@out,$rowhash->{'id'});
}
return @out;
}
=head2 get_geneids_by_hids
Title : get_geneids_by_hids
Usage : @geneids = $obj->get_geneids_by_hids(@hids)
Function: gives back geneids with these hids as supporting evidence
Example :
Returns :
Args :
=cut
sub get_geneids_by_hids{
my ($self,@hids) = @_;
my $inlist = join(',',map "'$_'", @hids);
$inlist = "($inlist)";
my $sth = $self->_db_obj->prepare("select transcript.gene from transcript as transcript, exon_transcript as exon_transcript, exon as exon, supporting_feature as supporting_feature where exon.id = supporting_feature.exon and exon_transcript.exon = exon.id and exon_transcript.transcript = transcript.id and supporting_feature.hid in $inlist");
$sth->execute();
my %gene;
while( (my $arr = $sth->fetchrow_arrayref()) ) {
my ($geneid) = @{$arr};
$gene{$geneid} =1;
}
return keys %gene;
}
=head2 get_Interpro_by_geneid
Title : get_Interpro_by_geneid
Usage : @interproid = $gene_obj->get_Interpro_by_geneid($gene->id);
Function: gets interpro accession numbers by geneid. A hack really -
we should have a much more structured system than this
Example :
Returns :
Args :
=cut
sub get_Interpro_by_geneid{
my ($self,$gene) = @_;
my $sth = $self->_db_obj->prepare("select i.interpro_ac,idesc.description from transcript t, protein_feature pf, interpro i, interpro_description idesc where t.gene = '$gene' and t.translation = pf.translation and i.id = pf.hid and i.interpro_ac = idesc.interpro_ac");
$sth->execute;
my @out;
my %h;
while( (my $arr = $sth->fetchrow_arrayref()) ) {
if( $h{$arr->[0]} ) { next; }
$h{$arr->[0]}=1;
my $string = $arr->[0] .":".$arr->[1];
push(@out,$string);
}
return @out;
}
=head2 get_Interpro_by_keyword
Title : get_Interpro_by_keyword
Usage : @interproid = $gene_obj->get_Interpro_by_keyword('keyword');
Function: gets interpro accession numbers by keyword. Another really stink hack - we should have a much more structured system than this
Example :
Returns :
Args :
=cut
sub get_Interpro_by_keyword{
my ($self,$keyword) = @_;
my $sth = $self->_db_obj->prepare("select idesc.interpro_ac,idesc.description from interpro_description idesc where idesc.description like '%$keyword%'");
$sth->execute;
my @out;
my %h;
while( (my $arr = $sth->fetchrow_arrayref()) ) {
if( $h{$arr->[0]} ) { next; }
$h{$arr->[0]}=1;
my $string = $arr->[0] .":".$arr->[1];
push(@out,$string);
}
return @out;
}
=head2 get
Title : get
Usage : $geneobj->get($geneid, $supporting)
Function: gets one gene out of the db with or without supporting evidence
Example : $obj->get('ENSG00000009151','evidence')
Returns : gene object (with transcripts, exons and supp.evidence if wanted)
Args : gene id and supporting tag (if latter not specified, assumes without
Note that it is much faster to get genes without supp.evidence!
=cut
sub get {
my ($self,$geneid, $supporting) = @_;
my @out;
if (!$supporting) {
@out = $self->get_array_supporting('without', $geneid);
}
else {
@out = $self->get_array_supporting($supporting, $geneid);
}
$self->throw("Error retrieving gene with ID: $geneid") unless $out[0];
return $out[0];
}
=head2 get_array_supporting
Title : get_Gene_array_supporting
Usage : $obj->get_Gene_array_supporting($supporting,@geneid)
Function: Gets an array of genes, with transcripts and exons. If $supporting
equal to 'evidence' the supporting evidence for each exon is also read
from the supporting evidence table
Example : $obj->get_Gene_array_supporting ('evidence',@geneid)
Returns : an array of gene objects
Args : 'evidence' and gene id array
=cut
sub get_array_supporting {
my ($self,$supporting,@geneid) = @_;
defined($supporting) || $self->throw("You need to specify whether to retrieve supporting evidence or not!");
if( @geneid == 0 ) {
$self->throw("Attempting to create gene with no id");
}
my (@out, @sup_exons);
my @inlist;
my $count = 0;
my $count2 = 0;
# The gene list is split into chunks of 11 as mysql grinds
# to a halt over this number
foreach my $in (@geneid) {
if (!defined($inlist[$count])) {
$inlist[$count] = [];
}
push(@{$inlist[$count]},$in);
$count2++;
if ($count2 > 5) {
$count++;
$count2 = 0;
}
}
# my $inlist = join(',', map "'$_'", @geneid);
foreach my $inarray (@inlist) {
my $inlist = join(',', map "'$_'", @$inarray);
# I know this SQL statement is silly.
#
#print STDERR "Inlist = $inlist\n";
my $query = qq{
SELECT tscript.gene
, con.id
, tscript.id
, e_t.exon, e_t.rank
, exon.seq_start, exon.seq_end
, UNIX_TIMESTAMP(exon.created)
, UNIX_TIMESTAMP(exon.modified)
, exon.strand
, exon.phase
, exon.rank
, transl.seq_start, transl.start_exon
, transl.seq_end, transl.end_exon
, transl.id
, gene.version
, tscript.version
, exon.version
, transl.version
, con.clone
, genetype.type
FROM contig con
, gene
, transcript tscript
, exon_transcript e_t
, exon
, translation transl
, genetype genetype
WHERE gene.id = tscript.gene
AND tscript.id = e_t.transcript
AND e_t.exon = exon.id
AND exon.contig = con.internal_id
AND tscript.translation = transl.id
AND genetype.gene_id = gene.id
AND gene.id IN ($inlist)
ORDER BY tscript.gene
, tscript.id
, e_t.rank
, exon.rank
LIMIT 2500
};
# print STDERR "Query is " . $query . "\n";
my $sth = $self->_db_obj->prepare($query);
my $res = $sth ->execute();
my $current_gene_id = '';
my $current_transcript_id = '';
my $previous_exon = undef;
my $sticky_exon = 0;
my ($gene,$trans);
my @transcript_exons;
while( (my $arr = $sth->fetchrow_arrayref()) ) {
my ($geneid,$contigid,$transcriptid,$exonid,$rank,$start,$end,
$exoncreated,$exonmodified,$strand,$phase,$exon_rank,$trans_start,
$trans_exon_start,$trans_end,$trans_exon_end,$translationid,
$geneversion,$transcriptversion,$exonversion,$translationversion,$cloneid,$genetype) = @{$arr};
if( ! defined $phase ) {
$self->throw("Bad internal error! Have not got all the elements in gene array retrieval");
}
# I think this is a dirty hack
#if( exists $seen{"$exonid-$rank"} ) {
# next;
#}
# Create new gene if the id has changed
if( $geneid ne $current_gene_id ) {
if( $transcriptid eq $current_transcript_id ) {
$self->throw("Bad internal error. Switching genes without switching transcripts");
}
$gene = Bio::EnsEMBL::Gene->new();
$gene->id ($geneid);
$gene->version ($geneversion);
$gene->type ($genetype);
$gene->add_cloneid_neighbourhood($cloneid);
$current_gene_id = $geneid;
push(@out,$gene);
}
# Create new transcript if the id has changed
if( $transcriptid ne $current_transcript_id ) {
# put away old exons
if( defined $trans ) {
$self->_store_exons_in_transcript($trans,@transcript_exons);
}
@transcript_exons = ();
# put in new exons
$trans = Bio::EnsEMBL::Transcript->new();
$trans->id ($transcriptid);
$trans->version($transcriptversion);
$current_transcript_id = $transcriptid;
my $translation = Bio::EnsEMBL::Translation->new();
$translation->start ($trans_start);
$translation->end ($trans_end);
$translation->start_exon_id($trans_exon_start);
$translation->end_exon_id ($trans_exon_end);
$translation->id ($translationid);
$translation->version ($translationversion);
$trans->translation ($translation);
$gene ->add_Transcript ($trans);
}
my $exon = Bio::EnsEMBL::Exon->new();
#print(STDERR "Creating exon - contig id $contigid\n");
$exon->clone_id ($cloneid);
$exon->contig_id($contigid);
$exon->id ($exonid);
$exon->created ($exoncreated);
$exon->modified ($exonmodified);
$exon->start ($start);
$exon->end ($end);
$exon->phase ($phase);
$exon->strand ($strand);
$exon->version ($exonversion);
$exon->seqname ($contigid);
$exon->sticky_rank($exon_rank);
#
# Attach the sequence, cached if necessary...
#
if ($supporting && $supporting eq 'evidence') {
push @sup_exons, $exon;
}
my $seq;
if( $self->_db_obj->_contig_seq_cache($exon->contig_id) ) {
$seq = $self->_db_obj->_contig_seq_cache($exon->contig_id);
} else {
my $contig = $self->_db_obj->get_Contig($exon->contig_id);
$contig->fetch();
$seq = $contig->primary_seq();
$self->_db_obj->_contig_seq_cache($exon->contig_id,$seq);
}
$exon ->attach_seq($seq);
push(@transcript_exons,$exon);
}
if( $current_gene_id eq '' ) {
return ();
}
$self->_store_exons_in_transcript($trans,@transcript_exons);
if ($supporting && $supporting eq 'evidence') {
$self->get_supporting_evidence_direct(@sup_exons);
}
foreach my $g ( @out) {
$self->_get_dblinks($g);
}
}
return @out;
}
=head2 _store_exons_in_transcript
Title : _store_exons_in_transcript
Usage :
Function:
Example :
Returns :
Args :
=cut
sub _store_exons_in_transcript{
my ($self,$trans,@exons) = @_;
if( !ref $trans || !$trans->isa('Bio::EnsEMBL::Transcript') ) {
$self->throw(" $trans is not a transcript");
}
#print STDERR "Got ",scalar(@exons),"to store...\n";
my $exon;
while ( ($exon = shift @exons)) {
#print STDERR "Handling exon",$exon->id,":",$exon->sticky_rank,"\n";
if( $#exons >= 0 && $exons[0]->id eq $exon->id ) {
# sticky exons.