Skip to content
Snippets Groups Projects
Commit d0e556a2 authored by Monika Komorowska's avatar Monika Komorowska
Browse files

use gene names from core instead if vega to populate alt_allele table

parent 2a4fda7b
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env perl
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Getopt::Long qw(:config pass_through);
# (make sure api version is correct
# Usage:
# perl alt_alleles.pl -cpass XXXX > & human_release_63_alt_alleles
# perl alt_alleles.pl -pass XXXX > & human_release_63_alt_alleles
#
#
# long way
# perl alt_alleles.pl -vhost ens-staging1 -vport 3306 -vdbname homo_sapiens_vega_63_37 -cdbname homo_sapiens_core_63_37 -chost ens-staging1 -cpass XXXX > & human_release_63_alt_alleles
# perl alt_alleles.pl -dbname homo_sapiens_core_63_37 -host ens-staging1 -pass XXXX > & human_release_63_alt_alleles
#
my ($vhost, $vpass, $vport, $vdbname, $vuser, $chost, $cpass, $cport, $cdbname, $cuser);
my ($host, $pass, $port, $dbname, $user);
GetOptions(
'vuser=s' => \$vuser,
'vpass=s' => \$vpass,
'vhost=s' => \$vhost,
'vport=i' => \$vport,
'vdbname=s' => \$vdbname,
'cuser=s' => \$cuser,
'cpass=s' => \$cpass,
'chost=s' => \$chost,
'cport=i' => \$cport,
'cdbname=s' => \$cdbname);
#
# Connect to the vgea databse to get the alt allele data.
#
'user=s' => \$user,
'pass=s' => \$pass,
'host=s' => \$host,
'port=i' => \$port,
'dbname=s' => \$dbname);
my $api_version = Bio::EnsEMBL::ApiVersion->software_version();
if(!defined($vdbname)){
$vdbname = "homo_sapiens_vega_".$api_version."_37";
}
if(!defined($cdbname)){
$cdbname = "homo_sapiens_core_".$api_version."_37";
}
my $api_version = Bio::EnsEMBL::ApiVersion->software_version();
if(!defined($dbname)){
$dbname = "homo_sapiens_core_".$api_version."_37";
}
#
# Connect to the core database
#
my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $chost||'ens-staging1',
-user => $cuser||'ensadmin',
-pass => $cpass,
-species => "test",
-dbname => $cdbname||"homo_sapiens_core_63_37");
#
# get ensembl gene ids and vega stable ids from the core database
#
my %vega_to_ens_id;
my ($vega_stable_id, $gene_id);
my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $host||'ens-staging1',
-port => $port || '3306',
-user => $user||'ensadmin',
-pass => $pass,
-dbname => $dbname);
my $sth = $core_dba->dbc->prepare("select ensembl_id, display_label from object_xref join xref using(xref_id) join external_db using(external_db_id) where db_name = 'OTTG' and ensembl_object_type = 'Gene'");
$sth->execute;
my $sa = $core_dba->get_adaptor("slice");
my $ga = $core_dba->get_adaptor("gene");
$sth->bind_columns(\$gene_id, \$vega_stable_id);
my @top_slices = @{$sa->fetch_all("toplevel",undef,1)}; #returns all slices including non-reference ("toplevel",undef,1)
print "Looping over chromosomes...\n";
while ($sth->fetch){
$vega_to_ens_id{$vega_stable_id} = $gene_id;
}
$sth->finish;
my %alt_alleles;
my %reference_genes;
my %logic_names = ("havana" => 1,
"ensembl_havana_gene" => 1,
"ensembl_havana_lincrna" => 1,);
my %skip_sources = ("Clone_based_vega_gene" => 1,
"Clone_based_ensembl_gene" => 1);
my $vega_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $vhost||'ens-staging1',
-user => $vuser||'ensro',
-port => $vport||3306,
-dbname => $vdbname||"homo_sapiens_vega_63_37");
foreach my $slice (@top_slices) {
my $slice_name = $slice->seq_region_name;
GENE:
foreach my $gene (@{$ga->fetch_all_by_Slice($slice)}) {
next GENE if (exists($skip_sources{$gene->external_db}) );
#
# SQL to get alt_allele data from vega
#
next GENE unless (exists($logic_names{$gene->analysis->logic_name}) );
my $sql =(<<EOS);
select aa.alt_allele_id, g.stable_id
from alt_allele aa, gene g
where aa.gene_id = g.gene_id
EOS
my $gene_id = $gene->dbID;
#get name and synonyms
my ($syns,$display_id);
eval { $display_id = $gene->display_xref->display_id };
if ($@) {
print "Can't get name for gene $gene_id on slice $slice_name\n";
} else {
push @{$alt_alleles{$display_id}}, $gene_id ;
if ($slice->is_reference) {
$reference_genes{$gene_id} = 1;
}
#
# Store data in a hash where the key is the alt_id and the ensembl gene ids
# stored in an anonymous array (value of the hash).
#
my $sth = $vega_dba->dbc->prepare($sql);
$sth->execute;
my ($alt_id, $vega_stable_id);
my %alt_to_gene;
$sth->bind_columns(\$alt_id, \$vega_stable_id);
my $max_alt_id = 0;
my %no_gene_id;
}
while($sth->fetch()){
my $gene_id = $vega_to_ens_id{$vega_stable_id};
if ( defined($gene_id) ) {
push @{$alt_to_gene{$alt_id}}, $gene_id ;
} else {
push @{$no_gene_id{$alt_id}}, $vega_stable_id;
print STDERR "no ensembl gene_id found for vega stable id $vega_stable_id in core\n";
}
if($alt_id > $max_alt_id){
$max_alt_id = $alt_id;
}
}
$sth->finish;
$max_alt_id += 1000;
my %non_reference; # $non_reference{$gene_id} = 1;
$sql = (<<SEQ);
SELECT g.gene_id
FROM gene g, seq_region_attrib sra, attrib_type at
WHERE g.seq_region_id = sra.seq_region_id AND
at.attrib_type_id = sra.attrib_type_id AND
at.code = 'non_ref'
SEQ
$sth = $core_dba->dbc->prepare($sql);
$sth->execute;
$sth->bind_columns(\$gene_id);
while($sth->fetch()){
$non_reference{$gene_id} = 1;
}
}
#
# Delete the old data
#
$sth = $core_dba->dbc->prepare("delete from alt_allele");
my $sth = $core_dba->dbc->prepare("delete from alt_allele");
$sth->execute;
......@@ -148,12 +95,10 @@ $sth->execute;
# NOTE: if an alt allele has only one gene_id delete it from the hash
#
foreach my $key (keys %alt_to_gene){
my @arr = @{$alt_to_gene{$key}};
foreach my $key (keys %alt_alleles){
my @arr = @{$alt_alleles{$key}};
if(scalar(@arr) <= 1){
delete $alt_to_gene{$key};
my @unmapped_vega = $no_gene_id{$key};
print STDERR "ignoring alt allele $key: unmapped vega stable id(s): ". join(", ",@unmapped_vega) ."\n";
delete $alt_alleles{$key};
}
}
......@@ -161,116 +106,34 @@ foreach my $key (keys %alt_to_gene){
# Then store the alt_allele and gene_id in the alt_allele table.
#
$sql = "insert into alt_allele (alt_allele_id, gene_id, is_ref) values(?, ?, ?)";
my $sql = "insert into alt_allele (alt_allele_id, gene_id, is_ref) values(?, ?, ?)";
my $insert_sth = $core_dba->dbc->prepare($sql);
my $count=0;
my $alt_id_count = 0;
#
# gene_id_to_alt_id needed for LRG stuff
#
#my %gene_id_to_alt_id;
foreach my $key (keys %alt_to_gene){
my @arr = @{$alt_to_gene{$key}};
foreach my $key (keys %alt_alleles){
my @arr = @{$alt_alleles{$key}};
if(scalar(@arr) > 1){
my $sanity_check =0; # should be 1 refernce gene only if 0 or more than 1 we may have a problem
my $sanity_check =0; # should be 1 reference gene only if 0 or more than 1 we may have a problem
$alt_id_count++;
foreach my $gene_id (@arr){
my $ref = 1;
if(defined( $non_reference{$gene_id})){
$ref = 0;
}
else{
my $ref = 0;
if(exists($reference_genes{$gene_id})){
$ref = 1;
$sanity_check++;
}
$insert_sth->execute($key, $gene_id, $ref);
# $gene_id_to_alt_id{$gene_id}= $key;
$insert_sth->execute($alt_id_count, $gene_id, $ref);
$count++;
}
if($sanity_check !=1){
print STDERR "Problem we have $sanity_check genes on the reference sequence for alt_id $key\n";
print STDERR "Problem we have $sanity_check genes on the reference sequence for alt_id $alt_id_count gene display_id $key\n";
}
$alt_id_count++;
}
elsif(scalar(@arr) == 0){ # been removed due to stable id error
}
}
}
print "Added $alt_id_count alt_allele ids for $count genes\n";
#
# useful sql to look at sanity check problems
#
#select g.stable_id,g.seq_region_start, g.seq_region_end, s.name, x.display_label from gene g, seq_region s, alt_allele aa, xref x where g.display_xref_id = x.xref_id and g.gene_id = aa.gene_id and g.seq_region_id = s.seq_region_id and aa.alt_allele_id =37;
exit;
## LRG SQL. How to fit this in?
##select ox.ensembl_id, g.gene_id from xref x, object_xref ox, external_db e, gene g where x.xref_id = ox.xref_id and e.external_db_id = x.external_db_id and e.db_name like "Ens_Hs_gene" and ox.ensembl_object_type = "Gene" and x.display_label = g.stable_id ;
##
## Use $max_alt_id for new ones.
##
#$sql =(<<LRG);
#SELECT ox.ensembl_id, g.gene_id
# FROM xref x, object_xref ox, external_db e, gene g
# WHERE x.xref_id = ox.xref_id AND
# e.external_db_id = x.external_db_id AND
# e.db_name like "Ens_Hs_gene" AND
# ox.ensembl_object_type = "Gene" AND
# x.display_label = g.stable_id
#LRG
#$sth = $core_dba->dbc->prepare($sql);
#my ($core_gene_id, $lrg_gene_id);
#$sth->execute();
#$sth->bind_columns(\$lrg_gene_id, \$core_gene_id);
#$count =0;
#my $old_count = 0;
#my $new_count = 0;
#my $lrg_count = 0;
##
## If the core gene is already in an alt_allele set then use that alt_id for the LRG gene only.
## Else use a new one and add both core and LRG.
##
#while ($sth->fetch()){
# if(defined($gene_id_to_alt_id{$core_gene_id})){
# $insert_sth->execute($gene_id_to_alt_id{$core_gene_id}, $lrg_gene_id);
# $old_count++;
# }
# elsif(defined($gene_id_to_alt_id{$lrg_gene_id})){
# $insert_sth->execute($gene_id_to_alt_id{$lrg_gene_id}, $core_gene_id);
# print "LRG perculiarity\t$core_gene_id\t$lrg_gene_id\n";
# $lrg_count++;
# }
# else{ # new one.
# $max_alt_id++;
# $insert_sth->execute($max_alt_id, $lrg_gene_id);
# $insert_sth->execute($max_alt_id, $core_gene_id);
# $new_count++;
# }
# $count++;
#}
#print "Added $count alt_allels for the lrgs. $old_count added to previous alt_alleles and $new_count new ones\n";
#print "LRG problem count = $lrg_count\n";
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment