Skip to content
Snippets Groups Projects
RFAMParser.pm 5.47 KiB
Newer Older
Magali Ruffier's avatar
Magali Ruffier committed
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Tiago Grego's avatar
Tiago Grego committed
Copyright [2016-2019] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut

package XrefParser::RFAMParser;

use strict;
use warnings;
use Carp;
use DBI;

use base qw( XrefParser::BaseParser );
use Bio::EnsEMBL::Registry;


  my ($self, $ref_arg) = @_;
  my $source_id    = $ref_arg->{source_id};
  my $species_id   = $ref_arg->{species_id};
  my $species_name = $ref_arg->{species};
  my $file         = $ref_arg->{file};
  my $verbose      = $ref_arg->{verbose};
  my $core_db      = $ref_arg->{dba};
  my $dbi          = $ref_arg->{dbi};
  $dbi = $self->dbi unless defined $dbi;
  if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
    croak "Need to pass source_id, species_id and file as pairs";
  }
  $verbose |=0;

  my $wget;
  my $user = "ensro";
  my $host;
  my $port = 3306;
  my $dbname;
  my $pass;

  if($file =~ /wget[=][>](\S+?)[,]/){
    $wget = $1;
  }
  if($file =~ /host[=][>](\S+?)[,]/){
    $host = $1;
  }
  if($file =~ /port[=][>](\S+?)[,]/){
    $port =  $1;
  }
  if($file =~ /dbname[=][>](\S+?)[,]/){
    $dbname = $1;
  }
  if($file =~ /pass[=][>](\S+?)[,]/){
    $pass = $1;
  }
  if($file =~ /user[=][>](\S+?)[,]/){
    $user = $1;
  }


  #get direct RFAM xrefs from core
  my $registry = "Bio::EnsEMBL::Registry";
  my %id2name = $self->species_id2name($dbi);
  if (defined $species_name) { push @{$id2name{$species_id}}, $species_name; }
  if (!defined $id2name{$species_id}) { next; }
  $species_name = $id2name{$species_id}[0];

  if ($host) {
      $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
         '-host'     => $host,
         '-user'     => $user,
         '-pass'     => $pass,
         '-dbname'   => $dbname,
Magali Ruffier's avatar
Magali Ruffier committed
         '-port'     => $port,
         '-species'  => $species_name,
         '-group'    => 'core',
       );
  } elsif (defined $core_db) {
    $dba = $core_db;
  } else {
      $registry->load_registry_from_multiple_dbs( 
Magali Ruffier's avatar
Magali Ruffier committed
        '-host'    => 'mysql-ens-sta-1',
	'-port'    => 4519,
        '-user'    => 'ensro',
      },
      );
      $dba = $registry->get_DBAdaptor($species_name, 'core');
  }
  my $rfam_sql = "select distinct t.stable_id, hit_name from analysis a join transcript t on (a.analysis_id = t.analysis_id and a.logic_name like 'ncrna%' and t.biotype != 'miRNA') join exon_transcript et on (t.transcript_id = et.transcript_id) join supporting_feature sf on (et.exon_id = sf.exon_id and sf.feature_type = 'dna_align_feature' ) join dna_align_feature df on (sf.feature_id = df.dna_align_feature_id) order by hit_name";

  my $sth = $dba->dbc->prepare($rfam_sql);
  $sth->execute();

  #hash keyed on RFAM accessions, value is an array of ensembl transcript stable_ids
  my %rfam_transcript_stable_ids;

  while (my ($stable_id, $hit_name) = $sth->fetchrow_array ) {

      my $rfam_id;
      if ($hit_name =~ /^(RF\d+)/) {
	  $rfam_id = $1;
      }
      if ($rfam_id) {
	  push @{$rfam_transcript_stable_ids{$rfam_id}}, $stable_id; 
      }
  }
  $sth->finish;     

  my @lines;
  if (defined $wget) {
    my $ua = LWP::UserAgent->new();
    $ua->timeout(10);
    $ua->env_proxy();
    my $request = HTTP::Request->new(GET => $wget);
    my $response = $ua->request($request);

    if ( !$response->is_success() ) {
      warn($response->status_line);
      return 1;
    }
    @lines = split(/\n\n\n/, $response->decoded_content);
  } else {
    my $file_io = $self->get_filehandle($file);
    if ( !defined $file_io ) {
      print "ERROR: Can't open RFAM file $file\n";
    while (my $line = $file_io->getline()) {
      if ($line =~ /\/\//) {
        push @lines, $temp_line;
        $temp_line = '';
      } else {
        $temp_line .= $line . "\n";
      }
  my $xref_count = 0;
  my $direct_count = 0;
  while (my $entry = shift @lines) {
    my ($accession) = $entry =~ /#=GF\sAC\s+(\w+)/ ;
    my ($label) = $entry =~ /#=GF\sID\s+([^\n]+)/;
    my ($description) = $entry =~ /#=GF\sDE\s+([^\n]+)/;
    if ($accession) {
      if (exists($rfam_transcript_stable_ids{$accession})){
      #add xref
        my $xref_id = $self->add_xref({ acc      => $accession,
  				      version    => 0,
				      label      => $label || $accession ,
				      desc       => $description,
				      source_id  => $source_id,
				      species_id => $species_id,
                                      dbi        => $dbi,
        my @transcript_stable_ids = @{$rfam_transcript_stable_ids{$accession}};
        foreach my $stable_id (@transcript_stable_ids){
           $self->add_direct_xref($xref_id, $stable_id, "Transcript", "", $dbi);
    }
  }

  print "Added $xref_count RFAM xrefs and $direct_count direct xrefs\n" if($verbose);

  return 0; # successfull
 

}

1;