diff --git a/misc-scripts/xref_mapping/XrefParser/ArrayExpressParser.pm b/misc-scripts/xref_mapping/XrefParser/ArrayExpressParser.pm index c16dbf5ef16fd3d2dba49ad0f7eaf5b3c41ed271..8d7753ba8b47130d4b00b56513f04ecda8852a00 100644 --- a/misc-scripts/xref_mapping/XrefParser/ArrayExpressParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/ArrayExpressParser.pm @@ -13,51 +13,133 @@ use strict; use warnings; use Carp; use base qw( XrefParser::BaseParser ); +use Bio::EnsEMBL::Registry; -sub meta_key { - my ($self) = @_; - return "array_express.exported"; -} -sub run { +sub run_script { + my ($self, $ref_arg) = @_; - my $source_id = $ref_arg->{source_id}; my $species_id = $ref_arg->{species_id}; - my $files = $ref_arg->{files}; + my $file = $ref_arg->{file}; my $verbose = $ref_arg->{verbose}; - - if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){ - croak "Need to pass source_id, species_id and files as pairs"; + + if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){ + croak "Need to pass source_id, species_id and file as pairs"; + } + $verbose |=0; + + my $project; + my $wget = ""; + + if($file =~ /project[=][>](\S+?)[,]/){ + $project = $1; + } + if($file =~ /wget[=][>](\S+?)[,]/){ + $wget = $1; + } + + + my $ua = LWP::UserAgent->new(); + $ua->timeout(10); + $ua->env_proxy(); + + my $response = $ua->get($wget); + + if ( !$response->is_success() ) { + warn($response->status_line); + return 1; } - $verbose ||=0; + my @lines = split(/\n/,$response->content); my %species_id_to_names = $self->species_id2name(); my $species_id_to_names = \%species_id_to_names; my $names = $species_id_to_names->{$species_id}; - my $contents_lookup = $self->_get_contents($files, $verbose); + my $contents_lookup = $self->_get_contents(\@lines, $verbose); my $active = $self->_is_active($contents_lookup, $names, $verbose); + + if (!$active) { + return; + } - if ($active && $verbose) { - print "ArrayExpress xrefs will be created when running 
xref_mapper.pl/DirectXrefs.pm as gene stable ids are required to create the xrefs\n"; + #get stable_ids from core and create xrefs + + my $registry = "Bio::EnsEMBL::Registry"; + + if ($project eq 'ensembl') { + $registry->load_registry_from_multiple_dbs( + { + '-host' => 'ens-staging1', + '-user' => 'ensro', + }, + { + '-host' => 'ens-staging2', + '-user' => 'ensro', + }, + ); + } elsif ($project eq 'ensemblgenomes') { + + $registry->load_registry_from_multiple_dbs( + { + '-host' => 'mysql-eg-staging-1.ebi.ac.uk', + '-port' => 4160, + '-user' => 'ensro', + }, + { + '-host' => 'mysql-eg-staging-2.ebi.ac.uk', + '-port' => 4275, + '-user' => 'ensro', + }, + + ); + + } else { + die("Missing or unsupported project value. Supported values: ensembl, ensemblgenomes"); } - $self->add_meta_pair($self->meta_key(),$active); - $self->add_meta_pair('species_id',$species_id); - return; + #get the species name + + my $species_name = $species_id_to_names{$species_id}[0]; + my $gene_adaptor = $registry->get_adaptor($species_name, 'core', 'Gene'); + + my @stable_ids = map { $_->stable_id } @{$gene_adaptor->fetch_all()}; + + my $xref_count = 0; + foreach my $gene_stable_id (@stable_ids) { + + my $xref_id = $self->add_xref({ acc => $gene_stable_id, + label => $gene_stable_id, + source_id => $source_id, + species_id => $species_id, + info_type => "DIRECT"} ); + + $self->add_direct_xref( $xref_id, $gene_stable_id, 'gene', ''); + if ($xref_id) { + $xref_count++; + } + } + + print "Added $xref_count DIRECT xrefs\n" if($verbose); + if ( !$xref_count ) { + return 1; # 1 error + } + + return 0; # successful + } + + sub _get_contents { - my ($self, $files, $verbose) = @_; + my ($self, $lines, $verbose) = @_; + my @lines = @$lines; my %lookup; - my $fh = $self->get_filehandle($files->[0]); - while(my $line = <$fh>) { - chomp $line; + + foreach my $line (@lines) { my ($species, $remainder) = $line =~ /^([a-z|A-Z]+)_(.+)$/; croak "The line '$line' is not linked to a gene set. 
This is unexpected." if $remainder !~ /gene/; $lookup{$species} = 1; } - close ($fh); if($verbose) { printf("ArrayExpress is using the species [%s]\n", join(q{, }, keys %lookup)); } @@ -90,53 +172,5 @@ sub _is_active { return $active; } -#this method is called from XrefMapper/DirectXrefs.pm - -sub create_xrefs { - my $self = shift; - my $verbose = shift; - - my $array_xrefs_meta_key = $self->meta_key(); - - if ($array_xrefs_meta_key) { - my $active = $self->get_meta_value($array_xrefs_meta_key); - if ($active) { - #create ArrayExpress direct xrefs - my $source_name = 'ArrayExpress'; - my $source_id = $self->get_source_id_for_source_name($source_name); - - my $species_id = $self->get_meta_value('species_id'); - #get gene stable_ids - my $gene_id_sth = $self->dbi()->prepare("select stable_id from gene_stable_id order by stable_id"); - $gene_id_sth->execute(); - my $gene_stable_id; - $gene_id_sth->bind_columns(\$gene_stable_id); - my $xref_count = 0; - while ($gene_id_sth->fetch()) { - - my $xref_id = $self->add_xref({ acc => $gene_stable_id, - label => $gene_stable_id, - source_id => $source_id, - species_id => $species_id, - info_type => "DIRECT"} ); - - $self->add_direct_xref( $xref_id, $gene_stable_id, 'gene', ''); - if ($xref_id) { - $xref_count++; - } - } - $gene_id_sth->finish(); - - if ($xref_count > 0) { - print "Loaded $xref_count $source_name DIRECT xrefs\n" if $verbose; - } else { - - print "Warning: 0 $source_name DIRECT xrefs loaded even though $source_name is active for the species.\n"; - } - } - - } -} - 1;