Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
ZFINParser.pm 4.49 KiB
package XrefParser::ZFINParser;

use strict;
use warnings;
use Carp;
use POSIX qw(strftime);
use File::Basename;
use File::Spec::Functions;

use base qw( XrefParser::BaseParser );

# Load ZFIN gene identifiers as dependent xrefs of already-loaded UniProt and
# RefSeq xrefs, then attach ZFIN aliases as synonyms.
#
# Three files are read from the directory that contains the first entry of
# `files`:
#   uniprot.txt  - ZFIN id, label, UniProt accession (whitespace separated)
#   refseq.txt   - ZFIN id, label, RefSeq accession  (whitespace separated)
#   aliases.txt  - ZFIN id, name, symbol, synonym    (tab separated)
#
# Arguments (single hashref):
#   source_id  - source id recorded on every dependent xref that is added
#   species_id - species id used for lookups and for the added xrefs
#   files      - arrayref; only the directory of its first element is used
#   verbose    - optional; when true a summary is printed to STDOUT
#
# Returns 0 on success and 1 when one of the three files cannot be opened
# (an error line is printed to STDERR). Croaks when source_id, species_id or
# files is missing.
sub run {
  my ($self, $ref_arg) = @_;
  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose} || 0;

  if ( !defined $source_id or !defined $species_id or !defined $files ) {
    croak "Need to pass source_id, species_id and files as pairs";
  }

  my $dir = dirname( $files->[0] );

  # Both lookups map an accession to an arrayref of master xref ids.
  my $swiss  = $self->get_valid_codes( "uniprot/", $species_id );
  my $refseq = $self->get_valid_codes( "refseq",   $species_id );

  my $swissprot_io = _open_zfin_file( $self, $dir, 'uniprot.txt' );
  return 1 if !defined $swissprot_io;    # 1 error

  my $dbi = $self->dbi();

  # Ids of every source whose name matches ZFIN_ID. They are needed twice:
  # to look up existing descriptions and, later, to attach the synonyms.
  my @zfin_source_ids;
  my $sth =
    $dbi->prepare('select source_id from source where name like "ZFIN_ID"');
  $sth->execute();
  while ( my ($id) = $sth->fetchrow_array() ) {
    push @zfin_source_ids, $id;
  }
  $sth->finish;

  my $description_for = _zfin_descriptions( $dbi, \@zfin_source_ids );

  my ( $spcount, $sp_ignored ) =
    _add_zfin_dependent_xrefs( $self, $swissprot_io, $swiss,
                               $description_for, $source_id, $species_id );

  my $refseq_io = _open_zfin_file( $self, $dir, 'refseq.txt' );
  return 1 if !defined $refseq_io;

  my ( $rscount, $rs_ignored ) =
    _add_zfin_dependent_xrefs( $self, $refseq_io, $refseq,
                               $description_for, $source_id, $species_id );

  my $mismatch = $sp_ignored + $rs_ignored;

  # NOTE(review): this lookup is deliberately kept after the dependent xrefs
  # have been added, as in the original ordering; presumably it has to see
  # the ZFIN xrefs created above - confirm against get_valid_codes.
  my %zfin = %{ $self->get_valid_codes( "zfin", $species_id ) };

  my $zfin_io = _open_zfin_file( $self, $dir, 'aliases.txt' );
  return 1 if !defined $zfin_io;

  #e.g.
  #ZDB-GENE-000125-4       deltaC  dlc     bea
  #ZDB-GENE-000125-4       deltaC  dlc     beamter

  my $syncount = 0;
  while ( defined( my $line = $zfin_io->getline() ) ) {
    chomp $line;
    my ( $acc, undef, undef, $syn ) = split /\t/, $line;

    # Blank lines and lines without a synonym column carry nothing to load.
    next if !defined $acc or !defined $syn or $syn eq '';
    next if !defined $zfin{$acc};

    $self->add_to_syn_for_mult_sources( $acc, \@zfin_source_ids, $syn,
                                        $species_id );
    $syncount++;
  }
  $zfin_io->close();

  if ($verbose) {
    print "\t$spcount xrefs from UniProt and\n";
    print "\t$rscount xrefs from RefSeq succesfully loaded\n";
    print "\t$syncount synonyms loaded\n";
    print "\t$mismatch xrefs ignored\n";
  }
  return 0;
}

# Open $name inside $dir through the parser's get_filehandle method.
# Returns the handle, or undef after printing an error line to STDERR.
sub _open_zfin_file {
  my ( $self, $dir, $name ) = @_;

  my $path = catfile( $dir, $name );
  my $io   = $self->get_filehandle($path);
  if ( !defined $io ) {
    print STDERR "ERROR: Could not open $path\n";
  }
  return $io;
}

# Return a hashref mapping accession => description for every xref that
# belongs to one of the given source ids and has a defined description.
# An empty id list yields an empty hashref without touching the database,
# because "in ()" is not valid SQL.
sub _zfin_descriptions {
  my ( $dbi, $source_ids ) = @_;

  my %description_for;
  return \%description_for if !@{$source_ids};

  my $placeholders = join ', ', ('?') x @{$source_ids};
  my $sth = $dbi->prepare(
    "select accession, description from xref where source_id in ($placeholders)"
  );
  $sth->execute( @{$source_ids} );
  while ( my ( $acc, $desc ) = $sth->fetchrow_array() ) {
    $description_for{$acc} = $desc if defined $desc;
  }
  $sth->finish;

  return \%description_for;
}

# Read "ZFIN id, label, accession" lines from $io and add one dependent xref
# per master xref id listed for the accession in $master_xrefs. The handle is
# closed once it is exhausted.
#
#e.g.
#ZDB-GENE-000112-30      couptf2 O42532
#ZDB-GENE-000125-12      igfbp2  NM_131458
#
# Returns (number of xrefs added, number of lines ignored). Lines whose
# accession is missing or unknown count as ignored.
sub _add_zfin_dependent_xrefs {
  my ( $self, $io, $master_xrefs, $description_for, $source_id, $species_id )
    = @_;

  my ( $loaded, $ignored ) = ( 0, 0 );

  while ( defined( my $line = $io->getline() ) ) {
    chomp $line;
    my ( $zfin, $label, $acc ) = split /\s+/, $line;

    # Check $acc first so a short or blank line never reaches the hash
    # lookup with an undefined key.
    if ( !defined $acc or !defined $master_xrefs->{$acc} ) {
      $ignored++;
      next;
    }

    foreach my $xref_id ( @{ $master_xrefs->{$acc} } ) {
      $self->add_dependent_xref({ master_xref_id => $xref_id,
                                  acc            => $zfin,
                                  label          => $label,
                                  desc           => $description_for->{$zfin},
                                  source_id      => $source_id,
                                  species_id     => $species_id });
      $loaded++;
    }
  }
  $io->close();

  return ( $loaded, $ignored );
}

1;