Commit 68f0ec46 authored by Ewan Birney's avatar Ewan Birney
Browse files

fixed EMBLLOAD to work; committed tests against EMBLLOAD

parent d5ad4a4a
......@@ -83,7 +83,7 @@ sub _initialize {
sub id {
my ($self) = @_;
my $id=$self->_get_Seq->seq->id;
my $id=$self->_get_Seq->id;
return $id;
}
......@@ -104,7 +104,7 @@ sub id {
sub embl_id {
my ($self) = @_;
my $id=$self->_get_Seq->seq->id;
my $id=$self->_get_Seq->id;
return $id;
}
......@@ -125,56 +125,9 @@ sub embl_id {
sub get_all_Genes {
my ($self)=@_;
my @genes;
my @exons;
my $exon_counter;
foreach my $ft($self->_get_Seq->all_SeqFeatures){
if($ft->primary_tag eq 'CDS'){
my $exon = Bio::EnsEMBL::Exon->new($ft->start,$ft->end,$ft->strand);
$exon->phase("1");
$exon->end_phase("1");
$exon_counter++;
$exon->id($exon_counter);
$exon->contig_id($self->id);
$exon->version("1");
$exon->created("2000");
$exon->modified("2000");
$exon->attach_seq($self->_get_Seq->seq);
push @exons,$exon;
}
}
unless (scalar @exons ==0){
my $transcript = Bio::EnsEMBL::Transcript->new(@exons);
$transcript->id("transcript_id");
my $translation=Bio::EnsEMBL::Translation->new();
$translation->id ("some_id");
$translation->version (2);
$translation->start (55);
$translation->start_exon_id (1);
$translation->end (55);
$translation->end_exon_id (3);
$transcript->translation($translation);
$transcript->version(1);
#$transcript->gene("new_gene_id");
my $gene = Bio::EnsEMBL::Gene->new();
my $gene_id=$self->_get_Seq->seq->id;
$gene_id="EMBLG" . "0000" . $gene_id;
$gene->id($gene_id);
$gene->add_Transcript($transcript);
push @genes,$gene;
}
return @genes;
my ($contig) = $self->get_all_Contigs;
return $contig->get_all_Genes;
}
......@@ -404,6 +357,24 @@ sub get_all_Contigs {
}
=head2 get_all_ContigOverlaps

 Title   : get_all_ContigOverlaps
 Usage   : my @overlaps = $obj->get_all_ContigOverlaps();
 Function: Stub implementation that never reports any contig overlaps.
 Example :
 Returns : an empty list
 Args    : one optional value, which is accepted but ignored

=cut

sub get_all_ContigOverlaps {
    my $obj   = shift;
    my $value = shift;    # accepted but not used

    # Nothing is ever computed here: the result is always empty.
    return ();
}
1;
......
......@@ -40,11 +40,11 @@ package Bio::EnsEMBL::EMBLLOAD::Contig;
use vars qw(@ISA);
use strict;
use Bio::Root::Object;
use Bio::AnnSeq;
@ISA = qw(Bio::Root::Object Bio::EnsEMBL::DB::ContigI);
use Bio::EnsEMBL::EMBLLOAD::Obj;
use Bio::EnsEMBL::Analysis::Analysis;
use Bio::EnsEMBL::SeqFeature;
use Bio::EnsEMBL::Gene;
sub _initialize {
my($self,@args) = @_;
......@@ -74,35 +74,76 @@ sub _initialize {
sub id {
my ($self) = @_;
my $id=$self->_get_Seq->seq->id . "00001";
my $id=$self->_get_Seq->id . ".00001";
return $id;
}
=head2 internal_id

 Title   : internal_id
 Usage   : $obj->internal_id($newval);
           my $current = $obj->internal_id;
 Function: Combined getter/setter for the value stored under the
           'internal_id' key of the object hash.
 Example :
 Returns : the stored value (undef if nothing has been stored yet)
 Args    : new value (optional); an undefined argument leaves the
           stored value untouched

=cut

sub internal_id {
    my ($obj, $value) = @_;

    # Only a defined argument overwrites what is stored.
    $obj->{'internal_id'} = $value if defined $value;

    return $obj->{'internal_id'};
}
=head2 seq
Title : seq
Usage : $seq = $contig->seq();
Function: Gets a Bio::Seq object out from the contig
Usage :
Function:
Example :
Returns : Bio::Seq object
Returns :
Args :
=cut
sub seq {
sub seq{
my ($self,@args) = @_;
return $self->primary_seq->seq;
}
=head2 primary_seq
Title : primary_seq
Usage :
Function:
Example :
Returns :
Args :
=cut
sub primary_seq{
my ($self,@args) = @_;
my ($self) = @_;
my $seq=$self->_get_Seq;
return $seq;
}
=head2 get_all_SeqFeatures
Title : get_all_SeqFeatures
......@@ -117,35 +158,7 @@ sub seq {
sub get_all_SeqFeatures {
my ($self) = @_;
my @features=$self->_get_Seq->all_SeqFeatures;
my @ensembl_features;
foreach my $feature(@features){
# Ewan to explain why do I have to copy one object to another
my $analysis = new Bio::EnsEMBL::Analysis::Analysis(-db => 'EMBL',
-db_version => 'NULL',
-program => 'NULL',
-program_version => 'NULL',
-gff_source => 'NULL',
-gff_feature => 'EMBL ann',
);
my $ensembl_feature=new Bio::EnsEMBL::SeqFeature(-seqname => $self->id,
-start => $feature->start,
-end => $feature->end,
-strand => $feature->strand,
-frame => $feature->frame,
-source_tag => $feature->source_tag,
-primary_tag => $feature->primary_tag,
-analysis => $analysis,
-score => $feature->score
);
push @ensembl_features,$ensembl_feature;
}
return @ensembl_features;
return ();
}
......@@ -164,13 +177,8 @@ sub get_all_SeqFeatures {
sub get_all_SimilarityFeatures{
my ($self) = @_;
my @sim_features;
my @features=$self->get_all_SeqFeatures;
foreach my $feature (@features){
if ($feature->analysis->gff_feature eq 'similarity'){
push @sim_features,$feature;}}
return @sim_features;
return ();
}
......@@ -194,55 +202,102 @@ sub get_all_Genes {
my ($self)=@_;
my @genes;
my @exons;
my $exon_counter;
foreach my $ft($self->_get_Seq->all_SeqFeatures){
if($ft->primary_tag eq 'CDS'){
my $exon = Bio::EnsEMBL::Exon->new($ft->start,$ft->end,$ft->strand);
$exon->phase("1");
$exon->end_phase("1");
$exon_counter++;
$exon->id($exon_counter);
$exon->contig_id($self->id);
$exon->version("1");
$exon->created("2000");
$exon->modified("2000");
$exon->attach_seq($self->_get_Seq->seq);
push @exons,$exon;
}
}
my $exoncounter = 1;
my $genecounter = 1;
unless (scalar @exons ==0){
my $transcript = Bio::EnsEMBL::Transcript->new(@exons);
$transcript->id("transcript_id");
my $translation=Bio::EnsEMBL::Translation->new();
$translation->id ("some_id");
$translation->version (2);
$translation->start (55);
$translation->start_exon_id (1);
$translation->end (55);
$translation->end_exon_id (3);
$transcript->version(1);
#$transcript->gene("new_gene_id");
$transcript->translation($translation);
my $gene = Bio::EnsEMBL::Gene->new();
my $gene_id=$self->_get_Seq->seq->id;
$gene_id="EMBLG" . "0000" . $gene_id;
$gene->id($gene_id);
$gene->add_Transcript($transcript);
push @genes,$gene;
my $id   = $self->_get_Seq->id;
my $time = time();    # one timestamp shared by every exon built in this call

# Build one gene, holding a single transcript and translation, for every
# top-level CDS_span or CDS feature.  All other feature types are ignored.
foreach my $ft ( $self->_get_Seq->top_SeqFeatures ) {
    if ( $ft->primary_tag eq 'CDS_span' ) {
        my $gene  = Bio::EnsEMBL::Gene->new();
        my $trans = Bio::EnsEMBL::Transcript->new();
        $gene->add_Transcript($trans);
        $gene->id( $id . ".gene." . $genecounter );
        $gene->version(1);
        $trans->id( $id . ".trans." . $genecounter );
        $trans->version(1);

        # Split the feature: one exon per sub-feature.  The first exon
        # starts in phase 0; each later exon starts in the end phase
        # reported by the exon before it.
        my $phase = 0;
        foreach my $sub ( $ft->sub_SeqFeature ) {
            my $exon = Bio::EnsEMBL::Exon->new();
            $exon->phase($phase);
            $exon->start( $sub->start );
            $exon->end( $sub->end );
            $exon->strand( $sub->strand );
            $exon->contig_id( $self->id );
            $exon->seqname( $self->id );
            $exon->version(1);
            $exon->created($time);
            $exon->modified($time);
            $exon->id( $id . ".exon." . $exoncounter++ );
            $trans->add_Exon($exon);
            $phase = $exon->end_phase();
        }

        # The translation runs from the first exon to the last one.
        # Index the list directly: the earlier shift/pop code tested
        # $#exons == 0 after the shift, which reused the first exon as
        # the last one for two-exon transcripts and left $last undefined
        # for single-exon transcripts.
        # NOTE(review): assumes each_Exon returns the exons in the order
        # they were added above -- confirm against Transcript.
        my @exons = $trans->each_Exon;
        my $first = $exons[0];
        my $last  = $exons[-1];

        my $tranl = Bio::EnsEMBL::Translation->new();
        $tranl->id( $id . ".transl." . $genecounter );
        $tranl->start_exon_id( $first->id );
        $tranl->end_exon_id( $last->id );
        $tranl->start(1);
        $tranl->end( $last->length );
        $tranl->version(1);
        $trans->translation($tranl);

        $genecounter++;
        push( @genes, $gene );
    }
    elsif ( $ft->primary_tag eq 'CDS' ) {
        my $gene  = Bio::EnsEMBL::Gene->new();
        my $trans = Bio::EnsEMBL::Transcript->new();
        $gene->add_Transcript($trans);
        $gene->version(1);
        $gene->id( $id . ".gene." . $genecounter );
        $trans->id( $id . ".trans." . $genecounter );
        $trans->version(1);

        # An unsplit CDS becomes a single exon covering the whole feature.
        my $exon = Bio::EnsEMBL::Exon->new();
        $exon->phase(0);
        $exon->start( $ft->start );
        $exon->end( $ft->end );
        $exon->strand( $ft->strand );
        $exon->contig_id( $self->id );
        $exon->seqname( $self->id );
        $exon->version(1);
        $exon->created($time);
        $exon->modified($time);
        $exon->id( $id . ".exon." . $exoncounter++ );
        $trans->add_Exon($exon);

        # The translation starts and ends in that one exon.
        my $tranl = Bio::EnsEMBL::Translation->new();
        $tranl->id( $id . ".transl." . $genecounter );
        $tranl->start_exon_id( $exon->id );
        $tranl->end_exon_id( $exon->id );
        $tranl->start(1);
        $tranl->end( $exon->length );
        $trans->translation($tranl);
        $tranl->version(1);

        $genecounter++;
        push( @genes, $gene );
    }
    else {
        # do nothing!
    }
}
return @genes;
}
=head2 length
Title : length
......@@ -267,7 +322,7 @@ sub length {
sub _get_AnnSeq {
sub _get_Seq {
my ($self,$value) = @_;
if (defined $value){$self->{'annseq'}=$value;}
return $self->{'annseq'};
......@@ -349,6 +404,42 @@ sub seq_date{
}
=head2 embl_offset

 Title   : embl_offset
 Usage   : my $offset = $contig->embl_offset;
 Function: Reports a fixed offset of 1 for this contig.
 Example :
 Returns : 1 (constant)
 Args    : none are used; any that are passed are ignored

=cut

sub embl_offset {
    my $self = shift;
    my @args = @_;    # accepted but not used

    return 1;
}
=head2 embl_order

 Title   : embl_order
 Usage   : my $order = $contig->embl_order;
 Function: Reports a fixed order of 1 for this contig.
 Example :
 Returns : 1 (constant)
 Args    : none are used; any that are passed are ignored

=cut

sub embl_order {
    my $self = shift;
    my @args = @_;    # accepted but not used

    return 1;
}
=head2 orientation
......
......@@ -40,7 +40,7 @@ use vars qw(@ISA);
use strict;
use Bio::Root::Object;
@ISA = qw(Bio::Root::Object);
use Bio::AnnSeqIO;
use Bio::SeqIO;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::EMBLLOAD::Clone;
......
# $Id$
# Tests loading an EMBL flat file through Bio::EnsEMBL::EMBLLOAD::Obj:
# every entry in t/roa1.dat is turned into a clone, the clone and its genes
# are written to the test database, and both are then fetched back by id.

## We start with some black magic to print on failure.
BEGIN { $| = 1; print "1..4\n";
        use vars qw($loaded); }
END { print "not ok 1\n" unless $loaded; }

use strict;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader;
use Bio::EnsEMBL::EMBLLOAD::Obj;
use Bio::SeqIO;
use lib 't';
use EnsTestDB;

$loaded = 1;
print "ok 1\n";    # 1st test passes.

$" = ", ";         # for easier list-printing

my $ens_test = EnsTestDB->new();
my $db       = $ens_test->get_DBSQL_Obj;

# Test 2: load every sequence in the EMBL file and store its clone and genes.
my $file  = "t/roa1.dat";
my $seqio = Bio::SeqIO->new( -format => 'EMBL', -file => $file );
while ( my $seq = $seqio->next_seq ) {
    my $obj = Bio::EnsEMBL::EMBLLOAD::Obj->new( -seq => $seq );
    my ($loaded_clone) = $obj->get_Clone();
    $db->write_Clone($loaded_clone);
    foreach my $loaded_gene ( $loaded_clone->get_all_Genes() ) {
        $db->write_Gene($loaded_gene);
    }
}
print "ok 2\n";

# Tests 3 and 4: the stored clone and gene can be fetched back.  Report a
# failure when nothing comes back instead of printing "ok" unconditionally.
my $clone = $db->get_Clone('HSHNRNPA');
print( defined($clone) ? "ok 3\n" : "not ok 3\n" );

my $gene = $db->get_Gene('HSHNRNPA.gene.1');
print( defined($gene) ? "ok 4\n" : "not ok 4\n" );
ID HSHNRNPA standard; DNA; HUM; 5368 BP.
XX
AC X12671;
XX
SV X12671.1
XX
DT 23-NOV-1989 (Rel. 21, Created)
DT 24-APR-1993 (Rel. 35, Last updated, Version 3)
XX
DE Human gene for heterogeneous nuclear ribonucleoprotein (hnRNP) core protein
DE A1
XX
KW hnRNP A1 proten; ribonucleoprotein; RNA binding protein.
XX
OS Homo sapiens (human)
OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia;
OC Eutheria; Primates; Catarrhini; Hominidae; Homo.
XX
RN [1]
RP 1-5368
RA Riva S.;
RT ;
RL Submitted (23-AUG-1988) to the EMBL/GenBank/DDBJ databases.
RL Riva S., Consiglio Nazionale Delle Ricerche, Istituto Di Genetica
RL Biochimica Ed Evoluzionistica CNR, Via Abbiategrasso 2D7, 27100 Pavia,
RL Italy.
XX
RN [2]
RP 1-5368
RA Biamonti G., Buvoli M., Bassi M.T., Morandi C., Cobianchi F., Riva S.;
RT "Isolation of an active gene encoding human hnRNP protein A1";
RL J. Mol. Biol. 207:491-503(1988).
XX
DR SWISS-PROT; P09651; ROA1_HUMAN.
XX
FH Key Location/Qualifiers
FH
FT source 1..5368
FT /db_xref="taxon:9606"
FT /organism="Homo sapiens"
FT /clone="pES5"
FT /tissue_type="liver"
FT /clone_lib="lambdaCh4A."
FT mRNA join(695..813,1377..1493,1789..1935,2084..2294,2388..2480,
FT 2567..2659,2794..2868,3806..3961,4252..4311,4543..5240)
FT CDS join(799..813,1377..1493,1789..1935,2084..2294,2388..2480,
FT 2567..2659,2794..2868,3806..3961,4252..4307)
FT /db_xref="SWISS-PROT:P09651"
FT /product="hnrnp a1 protein"
FT /protein_id="CAA31191.1"
FT /translation="MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVV
FT MRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDSQRPGAHLTV
FT KKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDSVDKIV
FT IQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNF
FT SGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNF
FT GGRSSGPYGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF"
FT exon 695..813
FT /number=1
FT misc_feature 695..695
FT /note="mRNA initiation site"
FT misc_feature 715..715
FT /note="mRNA initiation site"
FT misc_feature 735..735
FT /note="mRNA initiation site"
FT intron 814..1376
FT /number=1
FT exon 1377..1493
FT /number=2
FT intron 1494..1788
FT /number=2
FT exon 1789..1935
FT /number=3
FT intron 1936..2083
FT /number=3
FT exon 2084..2294
FT /number=4
FT intron 2295..2387
FT /number=4
FT exon 2388..2480
FT /number=5
FT intron 2481..2566
FT /number=5
FT exon 2567..2659
FT /number=6
FT intron 2660..2793
FT /number=6
FT exon 2794..2868
FT /number=7
FT intron 2869..3805
FT /number=7
FT exon 3806..3961
FT /number=8
FT intron 3962..4251
FT /number=8
FT exon 4252..4311
FT /number=9
FT intron 4312..4542
FT /number=9
FT exon 4543..5240
FT /number=10
XX
SQ Sequence 5368 BP; 1476 A; 1052 C; 1270 G; 1570 T; 0 other;
gggattgaga gtgatcactc acgctaacgt ctgccctgtt cctgtatggt gaggccgcac 60
cacaagccac caccgccgcc gccttctgcg caacgccaac cgcccgccaa aacggatcct 120
tccctgcgcc tgcgcaacca atcttgggac cggacctttt ttctccgccc actacgcatg 180