Skip to content
Snippets Groups Projects

Xref parser mgiparser

Merged Marek Szuba requested to merge xref_parser_mgiparser into feature/xref_sprint
1 file
+ 62
76
Compare changes
  • Side-by-side
  • Inline
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
@@ -23,101 +24,86 @@ use strict;
use warnings;
use Carp;
use DBI;
use Text::CSV;
use base qw(XrefParser::BaseParser);
use parent qw(XrefParser::BaseParser);
# Parse a tab-separated MGI report and store one direct gene xref per row,
# together with any externally supplied MGI synonyms for that accession.
#
# NOTE(review): the text this replaces interleaved two revisions of run()
# (duplicate 'my' declarations, unbalanced braces). This body keeps the
# newer, Text::CSV-based revision -- confirm against the merged branch.
#
# Arguments (a single hashref):
#   source_id  - source the xrefs are stored under (required)
#   species_id - species the xrefs belong to (required)
#   files      - arrayref of input paths; only the first entry is read (required)
#   verbose    - when true, print summary counts at the end (default 0)
#   dbi        - database handle (default: $self->dbi)
#
# Returns 0 on success. Croaks when a required argument is missing, when the
# input file cannot be opened, or when parsing stops before end of file.
sub run {
  my ( $self, $ref_arg ) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose} // 0;
  my $dbi        = $ref_arg->{dbi} // $self->dbi;

  if ( ( !defined $source_id )
    or ( !defined $species_id )
    or ( !defined $files ) )
  {
    croak "Need to pass source_id, species_id and files as pairs";
  }

  # Read the first entry by index rather than shift(): shifting would remove
  # the element from the caller's own array.
  my $file = $files->[0];

  my $file_io = $self->get_filehandle($file);
  if ( !defined $file_io ) {
    croak "Could not open $file\n";
  }

  # Synonyms, keyed by accession (each value is dereferenced as an array below).
  my $syn_hash = $self->get_ext_synonyms( "MGI", $dbi );

  # Init input file
  my $input_file = Text::CSV->new(
    {
      sep_char           => "\t",
      empty_is_undef     => 1,
      strict             => 1,
      allow_loose_quotes => 1,
    }
  ) or croak "Cannot use file $file: " . Text::CSV->error_diag();

  # Init headers. Example row:
  # MGI:1915941 1110028C15Rik RIKEN cDNA 1110028C15 gene 33.61 1 ENSMUSG00000026004 ENSMUST00000042389 ENSMUST00000068168 ENSMUST00000113987 ENSMUST00000129190 ENSMUST00000132960 ENSMUSP00000036975 ENSMUSP00000063843 ENSMUSP00000109620 ENSMUSP00000118603
  # Only the leading columns are named; the last two columns
  # (EnsemblTranscriptIDs and EnsemblProteinIDs) are ignored.
  $input_file->column_names(
    [qw(accession symbol name position chrom ens_gene_stableid)] );

  my $count     = 0;
  my $syn_count = 0;

  while ( my $data = $input_file->getline_hr($file_io) ) {
    my $acc   = $data->{'accession'};
    my $ensid = $data->{'ens_gene_stableid'};

    # Label and description come straight from the file's symbol and name
    # columns; version is fixed at 0.
    my $xref_id = $self->add_xref(
      {
        acc        => $acc,
        version    => 0,
        label      => $data->{'symbol'},
        desc       => $data->{'name'},
        source_id  => $source_id,
        dbi        => $dbi,
        species_id => $species_id,
        info_type  => "DIRECT"
      }
    );

    $self->add_direct_xref( $xref_id, $ensid, "Gene", undef, $dbi );

    if ( defined( $syn_hash->{$acc} ) ) {
      foreach my $syn ( @{ $syn_hash->{$acc} } ) {
        $self->add_to_syn( $acc, $source_id, $syn, $species_id, $dbi );
        $syn_count++;
      }
    }

    $count++;
  }

  # getline_hr() returns false both at end of file and on a parse error;
  # eof() tells the two apart.
  $input_file->eof
    or croak "Error parsing file $file: " . $input_file->error_diag();

  $file_io->close();

  if ($verbose) {
    print "$count direct MGI xrefs added\n";
    print $syn_count. " synonyms added\n";
  }

  return 0;
}