Commit 6b4d4460 authored by Monika Komorowska's avatar Monika Komorowska
Browse files

get mapping between vega stable ids and ensembl gene ids from the core db

parent de4242fe
......@@ -38,44 +38,75 @@ if(!defined($cdbname)){
$cdbname = "homo_sapiens_core_".$api_version."_37";
}
#
# Connect to the core database
#
my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $chost||'ens-staging1',
-user => $cuser||'ensadmin',
-pass => $cpass,
-species => "test",
-dbname => $cdbname||"homo_sapiens_core_63_37");
#
# get ensembl gene ids and vega stable ids from the core database
#
my %vega_to_ens_id;
my ($vega_stable_id, $gene_id);
my $sth = $core_dba->dbc->prepare("select ensembl_id, display_label from object_xref join xref using(xref_id) join external_db using(external_db_id) where db_name = 'OTTG' and ensembl_object_type = 'Gene'");
$sth->execute;
$sth->bind_columns(\$gene_id, \$vega_stable_id);
while ($sth->fetch){
$vega_to_ens_id{$vega_stable_id} = $gene_id;
}
$sth->finish;
my $vega_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $vhost||'ens-staging1',
-user => $vuser||'ensro',
-port => $vport||3306,
-dbname => $vdbname||"homo_sapiens_vega_63_37");
#
# the magic SQL to get the data from vega
# SQL to get alt_allele data from vega
#
my $sql =(<<EOS);
select aa.alt_allele_id, g.stable_id, x2.display_label
from alt_allele aa, xref x1, gene g
left join object_xref ox on g.gene_id = ox.ensembl_id
left join xref x2 on ox.xref_id = x2.xref_id
left join external_db edb on x2.external_db_id = edb.external_db_id
select aa.alt_allele_id, g.stable_id
from alt_allele aa, gene g
where aa.gene_id = g.gene_id
and g.display_xref_id = x1.xref_id
and ox.ensembl_object_type = 'Gene'
and edb.db_name = 'ENSG';
EOS
#
# Store data in a hash where the key is the alt_id and the stable ids
# Store data in a hash where the key is the alt_id and the ensembl gene ids
# stored in an anonymous array (value of the hash).
#
my $sth = $vega_dba->dbc->prepare($sql);
$sth->execute;
my ($alt_id, $vega_stable_id, $core_stable_id);
my %alt_to_stable;
$sth->bind_columns(\$alt_id, \$vega_stable_id, \$core_stable_id);
my ($alt_id, $vega_stable_id);
my %alt_to_gene;
$sth->bind_columns(\$alt_id, \$vega_stable_id);
my $max_alt_id = 0;
my %no_gene_id;
while($sth->fetch()){
push @{$alt_to_stable{$alt_id}}, $core_stable_id;
my $gene_id = $vega_to_ens_id{$vega_stable_id};
if ( defined($gene_id) ) {
push @{$alt_to_gene{$alt_id}}, $gene_id ;
} else {
push @{$no_gene_id{$alt_id}}, $vega_stable_id;
print STDERR "no ensembl gene_id found for vega stable id $vega_stable_id in core\n";
}
if($alt_id > $max_alt_id){
$max_alt_id = $alt_id;
}
......@@ -84,62 +115,6 @@ $sth->finish;
$max_alt_id += 1000;
#
# Test case for invalid stable id. (Delete after testing
#
push @{$alt_to_stable{9999}}, "INVALID1";
# initial testing to make sure sensible info was obtained.
#foreach my $key (keys %alt_to_stable){
# my @arr = @{$alt_to_stable{$key}};
# if(scalar(@arr) > 1){
# print $key;
# foreach my $stable_id (@arr){
# print "\t".$stable_id;
# }
# print "\n";
# }
# else{
# print "$key has only one ensembl stable id ".$arr[0]." so ignoring\n";
# }
#}
#EXAMPLE:
#54 ENSG00000206285 ENSG00000236802 ENSG00000226936 ENSG00000235155 ENSG00000235863
#
# Connect to the core database to store the data in.
#
my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $chost||'ens-staging1',
-user => $cuser||'ensadmin',
-pass => $cpass,
-species => "test",
-dbname => $cdbname||"homo_sapiens_core_63_37");
#
# for each stable_id look it up in the gene_stable_id table and get the gene_id
# store this in a hash.
my %stable_id_to_gene_id;
$sth = $core_dba->dbc->prepare("Select stable_id, gene_id from gene");
$sth->execute;
my ($gene_id);
$sth->bind_columns(\$core_stable_id, \$gene_id);
while ($sth->fetch){
$stable_id_to_gene_id{$core_stable_id} = $gene_id;
}
$sth->finish;
my %non_reference; # $non_reference{$gene_id} = 1;
......@@ -160,7 +135,6 @@ while($sth->fetch()){
}
#
# Delete the old data
#
......@@ -170,28 +144,17 @@ $sth->execute;
#
# Check that all stable_ids are valid.
# NOTE: if one is invalid then if the alt_allele only had two members
# then with the reduction of one member will invalidate the alt_allele.
# Check that all alt_alleles are valid.
# NOTE: if an alt allele has only one gene_id delete it from the hash
#
foreach my $key (keys %alt_to_stable){
my @arr = @{$alt_to_stable{$key}};
my @arr2;
if(scalar(@arr) > 1){
foreach my $stable_id (@arr){
if(defined($stable_id_to_gene_id{$stable_id})){
push @arr2, $stable_id;
}
else{
print STDERR "$stable_id in vega database but not in core\n";
}
}
}
else{
print "$key has only one ensembl stable id ".join(", ",@arr)." so ignoring\n";
foreach my $key (keys %alt_to_gene){
my @arr = @{$alt_to_gene{$key}};
if(scalar(@arr) <= 1){
delete $alt_to_gene{$key};
my @unmapped_vega = $no_gene_id{$key};
print STDERR "ignoring alt allele $key: unmapped vega stable id(s): ". join(", ",@unmapped_vega) ."\n";
}
$alt_to_stable{$key} = \@arr2;
}
#
......@@ -210,20 +173,20 @@ my $alt_id_count = 0;
#
#my %gene_id_to_alt_id;
foreach my $key (keys %alt_to_stable){
my @arr = @{$alt_to_stable{$key}};
foreach my $key (keys %alt_to_gene){
my @arr = @{$alt_to_gene{$key}};
if(scalar(@arr) > 1){
my $sanity_check =0; # should be 1 refernce gene only if 0 or more than 1 we may have a problem
foreach my $stable_id (@arr){
foreach my $gene_id (@arr){
my $ref = 1;
if(defined( $non_reference{$stable_id_to_gene_id{$stable_id}})){
if(defined( $non_reference{$gene_id})){
$ref = 0;
}
else{
$sanity_check++;
}
$insert_sth->execute($key, $stable_id_to_gene_id{$stable_id}, $ref);
# $gene_id_to_alt_id{$stable_id_to_gene_id{$stable_id}}= $key;
$insert_sth->execute($key, $gene_id, $ref);
# $gene_id_to_alt_id{$gene_id}= $key;
$count++;
}
if($sanity_check !=1){
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment