Commit 6b4d4460 authored by Monika Komorowska's avatar Monika Komorowska
Browse files

get mapping between vega stable ids and ensembl gene ids from the core db

parent de4242fe
...@@ -38,44 +38,75 @@ if(!defined($cdbname)){ ...@@ -38,44 +38,75 @@ if(!defined($cdbname)){
$cdbname = "homo_sapiens_core_".$api_version."_37"; $cdbname = "homo_sapiens_core_".$api_version."_37";
} }
#
# Connect to the core database
#
my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $chost||'ens-staging1',
-user => $cuser||'ensadmin',
-pass => $cpass,
-species => "test",
-dbname => $cdbname||"homo_sapiens_core_63_37");
#
# get ensembl gene ids and vega stable ids from the core database
#
my %vega_to_ens_id;
my ($vega_stable_id, $gene_id);
my $sth = $core_dba->dbc->prepare("select ensembl_id, display_label from object_xref join xref using(xref_id) join external_db using(external_db_id) where db_name = 'OTTG' and ensembl_object_type = 'Gene'");
$sth->execute;
$sth->bind_columns(\$gene_id, \$vega_stable_id);
while ($sth->fetch){
$vega_to_ens_id{$vega_stable_id} = $gene_id;
}
$sth->finish;
my $vega_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $vhost||'ens-staging1', my $vega_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $vhost||'ens-staging1',
-user => $vuser||'ensro', -user => $vuser||'ensro',
-port => $vport||3306, -port => $vport||3306,
-dbname => $vdbname||"homo_sapiens_vega_63_37"); -dbname => $vdbname||"homo_sapiens_vega_63_37");
# #
# the magic SQL to get the data from vega # SQL to get alt_allele data from vega
# #
my $sql =(<<EOS); my $sql =(<<EOS);
select aa.alt_allele_id, g.stable_id, x2.display_label select aa.alt_allele_id, g.stable_id
from alt_allele aa, xref x1, gene g from alt_allele aa, gene g
left join object_xref ox on g.gene_id = ox.ensembl_id
left join xref x2 on ox.xref_id = x2.xref_id
left join external_db edb on x2.external_db_id = edb.external_db_id
where aa.gene_id = g.gene_id where aa.gene_id = g.gene_id
and g.display_xref_id = x1.xref_id
and ox.ensembl_object_type = 'Gene'
and edb.db_name = 'ENSG';
EOS EOS
# #
# Store data in a hash where the key is the alt_id and the stable ids # Store data in a hash where the key is the alt_id and the ensembl gene ids
# stored in an anonymous array (value of the hash). # stored in an anonymous array (value of the hash).
# #
my $sth = $vega_dba->dbc->prepare($sql); my $sth = $vega_dba->dbc->prepare($sql);
$sth->execute; $sth->execute;
my ($alt_id, $vega_stable_id, $core_stable_id); my ($alt_id, $vega_stable_id);
my %alt_to_stable; my %alt_to_gene;
$sth->bind_columns(\$alt_id, \$vega_stable_id, \$core_stable_id); $sth->bind_columns(\$alt_id, \$vega_stable_id);
my $max_alt_id = 0; my $max_alt_id = 0;
my %no_gene_id;
while($sth->fetch()){ while($sth->fetch()){
push @{$alt_to_stable{$alt_id}}, $core_stable_id; my $gene_id = $vega_to_ens_id{$vega_stable_id};
if ( defined($gene_id) ) {
push @{$alt_to_gene{$alt_id}}, $gene_id ;
} else {
push @{$no_gene_id{$alt_id}}, $vega_stable_id;
print STDERR "no ensembl gene_id found for vega stable id $vega_stable_id in core\n";
}
if($alt_id > $max_alt_id){ if($alt_id > $max_alt_id){
$max_alt_id = $alt_id; $max_alt_id = $alt_id;
} }
...@@ -84,62 +115,6 @@ $sth->finish; ...@@ -84,62 +115,6 @@ $sth->finish;
$max_alt_id += 1000; $max_alt_id += 1000;
#
# Test case for invalid stable id. (Delete after testing
#
push @{$alt_to_stable{9999}}, "INVALID1";
# initial testing to make sure sensible info was obtained.
#foreach my $key (keys %alt_to_stable){
# my @arr = @{$alt_to_stable{$key}};
# if(scalar(@arr) > 1){
# print $key;
# foreach my $stable_id (@arr){
# print "\t".$stable_id;
# }
# print "\n";
# }
# else{
# print "$key has only one ensembl stable id ".$arr[0]." so ignoring\n";
# }
#}
#EXAMPLE:
#54 ENSG00000206285 ENSG00000236802 ENSG00000226936 ENSG00000235155 ENSG00000235863
#
# Connect to the core database to store the data in.
#
my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $chost||'ens-staging1',
-user => $cuser||'ensadmin',
-pass => $cpass,
-species => "test",
-dbname => $cdbname||"homo_sapiens_core_63_37");
#
# for each stable_id look it up in the gene_stable_id table and get the gene_id
# store this in a hash.
my %stable_id_to_gene_id;
$sth = $core_dba->dbc->prepare("Select stable_id, gene_id from gene");
$sth->execute;
my ($gene_id);
$sth->bind_columns(\$core_stable_id, \$gene_id);
while ($sth->fetch){
$stable_id_to_gene_id{$core_stable_id} = $gene_id;
}
$sth->finish;
my %non_reference; # $non_reference{$gene_id} = 1; my %non_reference; # $non_reference{$gene_id} = 1;
...@@ -160,7 +135,6 @@ while($sth->fetch()){ ...@@ -160,7 +135,6 @@ while($sth->fetch()){
} }
# #
# Delete the old data # Delete the old data
# #
...@@ -170,28 +144,17 @@ $sth->execute; ...@@ -170,28 +144,17 @@ $sth->execute;
# #
# Check that all stable_ids are valid. # Check that all alt_alleles are valid.
# NOTE: if one is invalid then if the alt_allele only had two members # NOTE: if an alt allele has only one gene_id delete it from the hash
# then with the reduction of one member will invalidate the alt_allele.
# #
foreach my $key (keys %alt_to_stable){ foreach my $key (keys %alt_to_gene){
my @arr = @{$alt_to_stable{$key}}; my @arr = @{$alt_to_gene{$key}};
my @arr2; if(scalar(@arr) <= 1){
if(scalar(@arr) > 1){ delete $alt_to_gene{$key};
foreach my $stable_id (@arr){ my @unmapped_vega = $no_gene_id{$key};
if(defined($stable_id_to_gene_id{$stable_id})){ print STDERR "ignoring alt allele $key: unmapped vega stable id(s): ". join(", ",@unmapped_vega) ."\n";
push @arr2, $stable_id;
}
else{
print STDERR "$stable_id in vega database but not in core\n";
}
}
}
else{
print "$key has only one ensembl stable id ".join(", ",@arr)." so ignoring\n";
} }
$alt_to_stable{$key} = \@arr2;
} }
# #
...@@ -210,20 +173,20 @@ my $alt_id_count = 0; ...@@ -210,20 +173,20 @@ my $alt_id_count = 0;
# #
#my %gene_id_to_alt_id; #my %gene_id_to_alt_id;
foreach my $key (keys %alt_to_stable){ foreach my $key (keys %alt_to_gene){
my @arr = @{$alt_to_stable{$key}}; my @arr = @{$alt_to_gene{$key}};
if(scalar(@arr) > 1){ if(scalar(@arr) > 1){
my $sanity_check =0; # should be 1 refernce gene only if 0 or more than 1 we may have a problem my $sanity_check =0; # should be 1 refernce gene only if 0 or more than 1 we may have a problem
foreach my $stable_id (@arr){ foreach my $gene_id (@arr){
my $ref = 1; my $ref = 1;
if(defined( $non_reference{$stable_id_to_gene_id{$stable_id}})){ if(defined( $non_reference{$gene_id})){
$ref = 0; $ref = 0;
} }
else{ else{
$sanity_check++; $sanity_check++;
} }
$insert_sth->execute($key, $stable_id_to_gene_id{$stable_id}, $ref); $insert_sth->execute($key, $gene_id, $ref);
# $gene_id_to_alt_id{$stable_id_to_gene_id{$stable_id}}= $key; # $gene_id_to_alt_id{$gene_id}= $key;
$count++; $count++;
} }
if($sanity_check !=1){ if($sanity_check !=1){
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment