Skip to content
Snippets Groups Projects
RFAMParser.pm 4.04 KiB
Newer Older
package XrefParser::RFAMParser;

use strict;
use warnings;
use Carp;
use DBI;

use base qw( XrefParser::BaseParser );
use Bio::EnsEMBL::Registry;


  my ($self, $ref_arg) = @_;
  my $source_id    = $ref_arg->{source_id};
  my $species_id   = $ref_arg->{species_id};
  my $file        = $ref_arg->{file};
  if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
    croak "Need to pass source_id, species_id and file as pairs";
  }
  $verbose |=0;

  my $wget = "";
  my $user = "ensro";
  my $host;
  my $port = 3306;
  my $dbname;
  my $pass;

  if($file =~ /wget[=][>](\S+?)[,]/){
    $wget = $1;
  }
  if($file =~ /host[=][>](\S+?)[,]/){
    $host = $1;
  }
  if($file =~ /port[=][>](\S+?)[,]/){
    $port =  $1;
  }
  if($file =~ /dbname[=][>](\S+?)[,]/){
    $dbname = $1;
  }
  if($file =~ /pass[=][>](\S+?)[,]/){
    $pass = $1;
  }
  if($file =~ /user[=][>](\S+?)[,]/){
    $user = $1;
  }


  #get direct RFAM xrefs from core
  my $registry = "Bio::EnsEMBL::Registry";
  #get the species name
  my %id2name = $self->species_id2name;
  my $species_name = $id2name{$species_id}[0];

  if ($host) {
      $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
         '-host'     => $host,
         '-user'     => $user,
         '-pass'     => $pass,
         '-dbname'   => $dbname,
         '-species'  => $species_name,
         '-group'    => 'core',
       );
  } else {
      $registry->load_registry_from_multiple_dbs( 
      {
        '-host'    => 'ens-staging1',
        '-user'    => 'ensro',
      },
      {
        '-host'     => 'ens-staging2',
        '-user'     => 'ensro',
      },
      );
      $dba = $registry->get_DBAdaptor($species_name, 'core');
  }

  my $rfam_sql = "select distinct t.stable_id, hit_name from analysis a join transcript t on (a.analysis_id = t.analysis_id and a.logic_name = 'ncRNA' and t.biotype != 'miRNA') join exon_transcript et on (t.transcript_id = et.transcript_id) join supporting_feature sf on (et.exon_id = sf.exon_id and sf.feature_type = 'dna_align_feature' ) join dna_align_feature df on (sf.feature_id = df.dna_align_feature_id) order by hit_name";

  my $sth = $dba->dbc->prepare($rfam_sql);
  $sth->execute();

  #hash keyed on RFAM accessions, value is an array of ensembl transcript stable_ids
  my %rfam_transcript_stable_ids;

  while (my ($stable_id, $hit_name) = $sth->fetchrow_array ) {

      my $rfam_id;
      if ($hit_name =~ /^(RF\d+)/) {
	  $rfam_id = $1;
      }
      if ($rfam_id) {
	  push @{$rfam_transcript_stable_ids{$rfam_id}}, $stable_id; 
      }
  }
  $sth->finish;     

  my $ua = LWP::UserAgent->new();
  $ua->timeout(10);
  $ua->env_proxy();
  my $request = HTTP::Request->new(GET => $wget);
  my $response = $ua->request($request);

  if ( !$response->is_success() ) {
    warn($response->status_line);
    return 1;
  my @lines = split(/\n\n/, $response->decoded_content);
  my $xref_count = 0;
  my $direct_count = 0;
  while (my $entry = shift @lines) {
    my ($accession) = $entry =~ /#=GF\sAC\s+(\w+)/ ;
    my ($label) = $entry =~ /\n#=GF\sID\s+([^\n]+)/; 
    my ($description) = $entry =~ /\n#=GF\sDE\s+([^\n]+)/;
    if ($accession) {
      if (exists($rfam_transcript_stable_ids{$accession})){
      #add xref
        my $xref_id = $self->add_xref({ acc      => $accession,
  				      version    => 0,
				      label      => $label || $accession ,
				      desc       => $description,
				      source_id  => $source_id,
				      species_id => $species_id,
				      info_type  => "DIRECT"} );

        my @transcript_stable_ids = @{$rfam_transcript_stable_ids{$accession}};
        foreach my $stable_id (@transcript_stable_ids){
           $self->add_direct_xref($xref_id, $stable_id, "Transcript", "");
           $direct_count++;
         }	
         $xref_count++;
      }
    }
  }

  print "Added $xref_count RFAM xrefs and $direct_count direct xrefs\n" if($verbose);
  if ( !$xref_count ) {
      return 1;    # 1 error
  }

  return 0; # successfull
 

}

1;