alt_alleles.pl 4.77 KB
Newer Older
1
#!/usr/bin/env perl
Magali Ruffier's avatar
Magali Ruffier committed
2
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Tiago Grego's avatar
Tiago Grego committed
3
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
4 5 6 7 8 9 10 11 12 13 14 15 16
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

17

18
use strict;
19
use warnings;
20
use Bio::EnsEMBL::DBSQL::DBAdaptor;
21
use Bio::EnsEMBL::AltAlleleGroup;
22 23
use Getopt::Long qw(:config pass_through);

24 25
# (make sure api version is correct
# Usage:
26
# perl alt_alleles.pl -cpass XXXX > & human_release_63_alt_alleles
27 28 29
#
#
# long way
30
# perl alt_alleles.pl -vhost ens-staging1 -vport 3306 -vdbname homo_sapiens_vega_63_37 -cdbname homo_sapiens_core_63_37 -chost ens-staging1 -cpass XXXX > & human_release_63_alt_alleles
31 32
#

33
my ($vhost, $vpass, $vport, $vdbname, $vuser, $chost, $cpass, $cport, $cdbname, $cuser);
34 35

GetOptions(
36 37 38 39 40 41 42 43 44 45 46 47 48
    'vuser=s'  => \$vuser,
    'vpass=s'  => \$vpass,
    'vhost=s'  => \$vhost,
    'vport=i'  => \$vport,
    'vdbname=s'       => \$vdbname,
    'cuser=s'  => \$cuser,
    'cpass=s'  => \$cpass,
    'chost=s'  => \$chost,
    'cport=i'  => \$cport,
    'cdbname=s'       => \$cdbname);
#
# Connect to the vgea databse to get the alt allele data.
#
49

50
my $api_version = Bio::EnsEMBL::ApiVersion->software_version();
51

52 53
if(!defined($vdbname)){
  $vdbname = "homo_sapiens_vega_".$api_version."_37";
54
}
55

56 57 58 59 60
if(!defined($cdbname)){
  $cdbname = "homo_sapiens_core_".$api_version."_37";
}

#
61
# Connect to the core & vega database 
62
#
63

64
my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
65
  -host => $chost||'ens-staging1',
66 67 68
  -user => $cuser||'ensadmin',
  -pass => $cpass,
  -group => 'core',
69 70
  -dbname => $cdbname,
  -port => $cport
71
);
72

73
my $vega_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
74
  -host => $vhost||'ens-staging1',
75 76 77
  -user => $vuser||'ensadmin',
  -pass => $vpass,
  -group => 'vega',
78 79
  -dbname => $vdbname,
  -port => $vport
80
);
81 82


83
#
84
# get ensembl gene ids and vega stable ids from the *core* database
85
# 
86
my $vega_core_sql = <<'SQL';
87
select display_label, ensembl_id
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
from object_xref 
join xref using(xref_id) 
join external_db using(external_db_id) 
where db_name = 'OTTG' 
and ensembl_object_type = 'Gene'
SQL

# sometimes we will see more than one gene associated with an OTTG
# this happens when an OTTG on the primary assemby has been projected to a patch.
my %vega_to_ensembl_core_gene_id;
$core_dba->dbc->sql_helper()->execute_no_return(-SQL => $vega_core_sql, -CALLBACK => sub {
  my ($row) = @_;
  my ($vega_stable_id, $gene_id) = @{$row};
  $vega_to_ensembl_core_gene_id{$vega_stable_id}{$gene_id} = $gene_id;
});

print "\nFetched ".(scalar(keys %vega_to_ensembl_core_gene_id))." Vega Stable IDs\n";

#
# Get AltAlleles from vega
#
my $vega_aaga = $vega_dba->get_AltAlleleGroupAdaptor();
Andy Yates's avatar
Andy Yates committed
110
my $vega_groups = $vega_aaga->fetch_all();
111 112 113

my $cnt_vega_rows = @{$vega_groups};
print STDERR "Fetched $cnt_vega_rows rows from the vega db alt_allele table\n";
114

115
my %no_gene_id;
116 117 118
my @new_groups;
foreach my $group (@{$vega_groups}) {
  my $members = $group->get_all_Genes_types();
119
  my $new_core_group = Bio::EnsEMBL::AltAlleleGroup->new();
120 121
  foreach my $member (@{$members}) {
    my ($vega_gene, $attribs_hash) = @{$member};
122 123 124
    my $vega_stable_id = $vega_gene->stable_id();
    if(exists $vega_to_ensembl_core_gene_id{$vega_stable_id}) {
      foreach my $gene_id (keys %{$vega_to_ensembl_core_gene_id{$vega_stable_id}} ) {
125 126 127 128 129 130 131 132
        #Add each gene in. If we had a 1:m relationship then we copy the attribute already assigned
        #across
        $new_core_group->add_member($gene_id, $attribs_hash);
      }
    }
    else {
      push @{$no_gene_id{$group->dbID()}}, $vega_stable_id;
      print STDERR "no ensembl gene_id found for vega stable id $vega_stable_id in core\n";
133
    }
134
  }
135 136 137
  if($new_core_group->size() > 0) {
    push(@new_groups, $new_core_group);
  }
138 139 140
}

#
141
# Delete the old data
142
#
143
print STDERR "\n\nDeleting all alt_alleles...\n\n";
144
$core_dba->dbc->do("delete from alt_allele");
145 146
$core_dba->dbc->do("delete from alt_allele_attrib");
$core_dba->dbc->do("delete from alt_allele_group");
147

148 149 150
#
# Store alt_alleles.
#
151
print STDERR "Storing new alt alleles...\n\n";
152 153
my $alt_allele_count=0;
my $gene_count = 0;
154

155 156 157 158 159
my $core_aaga = $core_dba->get_AltAlleleGroupAdaptor();
foreach my $group (@new_groups) {
  my $alt_allele_id = $core_aaga->store($group);
  $alt_allele_count++;
  $gene_count += $group->size()
160
}
161

162
print "Added $alt_allele_count alt_allele ids for $gene_count genes\nDONE\n";