Skip to content
Snippets Groups Projects
Commit 74a69505 authored by Marek Szuba's avatar Marek Szuba
Browse files

RGDParser: backport further changes from ensembl-xref

Summary:
 - new "please see the NOTICE file" copyright note
 - add more POD blocks
 - direct-xref entries now use the same source ID as xrefs themselves
 - remove the final bit of custom SQL
 - better handling of RefSeq accessions and synonym lines
 - get rid of more occurrences of postfix conditionals and "unless"
 - quite a lot of reformatting

Issue: ENSCORESW-3219
parent f468be64
No related branches found
No related tags found
1 merge request!445Merge the output of the 2018 xref sprint
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2019] EMBL-European Bioinformatics Institute
=head1 LICENSE
See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
......@@ -18,7 +18,7 @@ limitations under the License.
=head1 DESCRIPTION
Designed to parse the Rat Genome Database download file, historically hosted at
ftp://ftp.rgd.mcw.edu/pub/data_release/GENES_RAT.txt . It comprises 40+ columns in a
ftp://ftp.rgd.mcw.edu/pub/data_release/GENES_RAT.txt . It comprises 40+ columns in a
tab-separated format
It contains RGD IDs (which are numeric), and associates them either with Ensembl genes or
......@@ -30,56 +30,64 @@ package XrefParser::RGDParser;
use strict;
use warnings;
use Carp;
use Text::CSV;
use parent qw( XrefParser::BaseParser );
sub run {
my ($self, $ref_arg) = @_;
my $source_id = $ref_arg->{source_id};
my $species_id = $ref_arg->{species_id};
my $files = $ref_arg->{files};
my $verbose = $ref_arg->{verbose};
my $dbi = $ref_arg->{dbi};
$dbi = $self->dbi unless defined $dbi;
=head2 run
if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
confess "Need to pass source_id, species_id and files as pairs";
}
$verbose //= 0;
Description: Triggers the parsing of the RGD file specified in files parameter
It uses Text::CSV to consume the source file.
=cut
my $source_sql = "select source_id from source where name = 'RGD' and priority_description = 'direct_xref'";
my $sth = $dbi->prepare($source_sql);
$sth->execute();
my ($direct_source_id);
$sth->bind_columns(\$direct_source_id);
$sth->fetch();
$sth->finish();
sub run {
my ( $self, $ref_arg ) = @_;
my $source_id = $ref_arg->{source_id};
my $species_id = $ref_arg->{species_id};
my $files = $ref_arg->{files};
my $verbose = $ref_arg->{verbose} // 0;
my $dbi = $ref_arg->{dbi} // $self->dbi;
if ( ( !defined $source_id ) or
( !defined $species_id ) or
( !defined $files ) )
{
confess 'Need to pass source_id, species_id and files as pairs';
}
my $file = @{$files}[0];
# Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs
my (%preloaded_refseq) = %{$self->get_valid_codes("refseq",$species_id, $dbi)};
# Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs
my (%preloaded_refseq) =
%{ $self->get_valid_codes( 'refseq', $species_id, $dbi ) };
my $rgd_io = $self->get_filehandle($file);
if ( !defined $rgd_io ) {
confess "Could not open $file when trying to parse RGD";
}
my $csv = Text::CSV->new({ sep => "\t", blank_is_undef => 1, strict => 0, auto_diag => 1, binary => 1, allow_loose_quotes => 1})
or confess "Cannot use CSV: ".Text::CSV->error_diag ();
# WARNING - Text::CSV does not like the GENES-RAT.txt file. It is improperly formatted and contains a non-ASCII character
# Make sure binary is turned on or it silently fails and you get 1/3rd of the records.
# strict is turned off to prevent failure on a blank line at the end
my $line = '#';
while (substr($line,0,1) eq '#') {
my $csv = Text::CSV->new({
sep => "\t",
blank_is_undef => 1,
auto_diag => 1,
binary => 1,
allow_loose_quotes => 1,
}) || confess 'Cannot use CSV: ' . Text::CSV->error_diag();
# WARNING - Text::CSV does not like the GENES-RAT.txt file. It is improperly formatted and contains a non-ASCII character
# Make sure binary is turned on or it silently fails and you get 1/3rd of the records.
# strict is turned off to prevent failure on a blank line at the end
my $line = q{#};
while ( substr( $line, 0, 1 ) eq q{#} ) {
$line = $rgd_io->getline;
}
$csv->parse($line);
$csv->column_names($csv->fields);
$csv->column_names( $csv->fields );
# Columns we want
# GENE_RGD_ID => 0,
# SYMBOL => 1,
......@@ -88,113 +96,148 @@ sub run {
# OLD_SYMBOL => 29,
# ENSEMBL_ID => 37
my $count= 0;
my $count = 0;
my $ensembl_count = 0;
my $mismatch = 0;
my $syn_count = 0;
my $cols; # Digested columns from CSV
my $mismatch = 0;
my $syn_count = 0;
my $cols; # Digested columns from CSV
while ( $cols = $csv->getline_hr($rgd_io) ) {
next if exists $cols->{GENE_RGD_ID} && ($cols->{GENE_RGD_ID} eq '' || !defined $cols->{GENE_RGD_ID});
next
if exists $cols->{GENE_RGD_ID} &&
( $cols->{GENE_RGD_ID} eq q{} || !defined $cols->{GENE_RGD_ID} );
my @nucs;
@nucs = split(/\;/,$cols->{GENBANK_NUCLEOTIDE}) if defined $cols->{GENBANK_NUCLEOTIDE};
if ( defined $cols->{GENBANK_NUCLEOTIDE} ) {
@nucs = split qr{ ; }msx, $cols->{GENBANK_NUCLEOTIDE};
}
my $done = 0;
# @nucs are sorted in the file in alphabetical order. Filter them down
# to a higher quality subset, then add dependent Xrefs where possible
foreach my $nuc ($self->sort_refseq_accessions(@nucs)){
if(!$done && exists $preloaded_refseq{$nuc}){
foreach my $xref (@{$preloaded_refseq{$nuc}}){
my $xref_id = $self->add_dependent_xref({
master_xref_id => $xref,
acc => $cols->{GENE_RGD_ID},
label => $cols->{SYMBOL},
desc => $cols->{NAME},
source_id => $source_id,
dbi => $dbi,
species_id => $species_id} );
# @nucs are sorted in the file in alphabetical order. Filter them down
# to a higher quality subset, then add dependent Xrefs where possible
foreach my $nuc ( $self->sort_refseq_accessions(@nucs) ) {
if ( !$done && exists $preloaded_refseq{$nuc} ) {
foreach my $xref ( @{ $preloaded_refseq{$nuc} } ) {
my $xref_id =
$self->add_dependent_xref({
master_xref_id => $xref,
acc => $cols->{GENE_RGD_ID},
label => $cols->{SYMBOL},
desc => $cols->{NAME},
source_id => $source_id,
dbi => $dbi,
species_id => $species_id,
});
$count++;
$syn_count += $self->add_synonyms($cols->{OLD_SYMBOL},$xref_id,$dbi);
$syn_count +=
$self->process_synonyms( $xref_id, $cols->{OLD_SYMBOL},
$dbi );
$done = 1;
}
}
}
if (defined $cols->{ENSEMBL_ID}) {
my @ensembl_ids = split(/\;/, $cols->{ENSEMBL_ID});
if ( defined $cols->{ENSEMBL_ID} ) {
my @ensembl_ids = split qr{ ; }msx, $cols->{ENSEMBL_ID};
foreach my $id (@ensembl_ids) {
$ensembl_count++;
$self->add_to_direct_xrefs({ stable_id => $id,
type => 'gene',
acc => $cols->{GENE_RGD_ID},
label => $cols->{SYMBOL},
desc => $cols->{NAME},
dbi => $dbi,
source_id => $direct_source_id,
species_id => $species_id} );
my $xref_id = $self->get_xref($cols->{GENE_RGD_ID}, $direct_source_id, $species_id, $dbi);
$syn_count += $self->add_synonyms($cols->{OLD_SYMBOL},$xref_id,$dbi);
$self->add_to_direct_xrefs({
stable_id => $id,
type => 'gene',
acc => $cols->{GENE_RGD_ID},
label => $cols->{SYMBOL},
desc => $cols->{NAME},
dbi => $dbi,
source_id => $source_id,
species_id => $species_id,
});
my $xref_id =
$self->get_xref( $cols->{GENE_RGD_ID}, $source_id,
$species_id, $dbi );
$syn_count +=
$self->process_synonyms( $xref_id, $cols->{OLD_SYMBOL},
$dbi );
$done = 1;
}
}
if(!$done){
$self->add_xref({
if ( !$done ) {
$self->add_xref({
acc => $cols->{GENE_RGD_ID},
label => $cols->{SYMBOL},
desc => $cols->{NAME},
source_id => $source_id,
species_id => $species_id,
dbi => $dbi,
info_type => "MISC"} );
info_type => 'MISC',
});
$mismatch++;
}
}
if (! $csv->eof) {
confess 'Failed to finish parsing RGD file: '.$csv->error_diag();
} ## end while ( $cols = $csv->getline_hr...)
if ( !$csv->eof ) {
confess 'Failed to finish parsing RGD file: ' . $csv->error_diag();
}
$rgd_io->close();
if($verbose){
print "\t$count xrefs succesfully loaded and dependent on refseq\n";
print "\t$mismatch xrefs added but with NO dependencies\n";
print "\t$ensembl_count direct xrefs successfully loaded\n";
print "\tTried to add $syn_count synonyms, including duplicates\n";
if ($verbose) {
print "$count xrefs succesfully loaded and dependent on refseq\n" .
"$mismatch xrefs added but with NO dependencies\n" .
"$ensembl_count direct xrefs successfully loaded\n" .
"Tried to add $syn_count synonyms, including duplicates\n";
}
return 0;
}
} ## end sub run
# Predefined importance levels for the most valued RefSeq accession types
my %refseq_priorities = (
NM => 1,
NP => 1,
NR => 1,
XM => 2,
XP => 2,
XR => 2,
);
# Filter out any accessions which are not in the "normal" set of genomic features
# The column in question contains EMBL accessions as well as other things, and we don't
# have the ability to make Xrefs to those. It wouldn't be useful to do so in any case
my %refseq_priorities =
( NM => 1, NP => 1, NR => 1, XM => 2, XP => 2, XR => 2, );
=head2 sort_refseq_accessions
Arg [1..n] : Original list of accessions
Description : Filter out any accessions which are not in the "normal" set of
genomic features. The column in question contains EMBL accessions
as well as other things, and we don't have the ability to make
Xrefs to all sources
Returntype : List of sorted and filtered accessions
=cut
sub sort_refseq_accessions {
my ($self,@accessions) = @_;
@accessions = sort { $refseq_priorities{substr $a, 0,2} <=> $refseq_priorities{substr $b, 0,2} }
grep { exists $refseq_priorities{substr $_, 0,2} } @accessions;
my ( $self, @accessions ) = @_;
@accessions = sort {
$refseq_priorities{ substr $a, 0, 2 }
<=> $refseq_priorities{ substr $b, 0, 2 } ||
$a cmp $b
} grep { exists $refseq_priorities{ substr $_, 0, 2 } } @accessions;
return @accessions;
}
# Process the synonym column into potentially many items, add them to the synonym table
sub add_synonyms {
my ($self,$synonym_string,$xref_id,$dbi) = @_;
=head2 process_synonyms
Arg [1] : Xref dbID to attach synonyms to
Arg [2] : Synonym string as read from file
Description : Process the synonym column into potentially many items and add
them to the synonym table. Synonyms are ';' separated
Returntype : Int - the count of synonyms added
=cut
sub process_synonyms {
my ( $self, $xref_id, $synonym_string, $dbi ) = @_;
my $syn_count = 0;
return $syn_count if (!defined $synonym_string || !defined $xref_id || !defined $dbi);
if ( ( !defined $synonym_string ) || ( !defined $xref_id ) ) {
return $syn_count;
}
my @syns = split(/\;/,$synonym_string);
foreach my $syn(@syns){
$self->add_synonym($xref_id,$syn,$dbi);
my @syns = split qr{ ; }msx, $synonym_string;
foreach my $syn (@syns) {
$self->add_synonym( $xref_id, $syn, $dbi );
$syn_count++;
}
return $syn_count;
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment