From 95fde0140ae51a5eb74875d3c4295f5d6469bec4 Mon Sep 17 00:00:00 2001
From: Ian Longden <ianl@sanger.ac.uk>
Date: Wed, 5 Nov 2008 15:44:37 +0000
Subject: [PATCH] new parsers

---
 .../XrefParser/MGI_CCDS_Parser.pm             | 148 +++++++++++
 .../XrefParser/MGI_Desc_Parser.pm             | 117 +++++++++
 .../XrefParser/MGI_Vega_Parser.pm             | 193 +++++++++++++++
 .../MGI_curated_transcriptParser.pm           | 166 +++++++++++++
 4 files changed, 624 insertions(+)
 create mode 100644 misc-scripts/xref_mapping/XrefParser/MGI_CCDS_Parser.pm
 create mode 100644 misc-scripts/xref_mapping/XrefParser/MGI_Desc_Parser.pm
 create mode 100644 misc-scripts/xref_mapping/XrefParser/MGI_Vega_Parser.pm
 create mode 100644 misc-scripts/xref_mapping/XrefParser/MGI_curated_transcriptParser.pm

diff --git a/misc-scripts/xref_mapping/XrefParser/MGI_CCDS_Parser.pm b/misc-scripts/xref_mapping/XrefParser/MGI_CCDS_Parser.pm
new file mode 100644
index 0000000000..e829934ff4
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/MGI_CCDS_Parser.pm
@@ -0,0 +1,148 @@
+package XrefParser::MGI_CCDS_Parser;
+
+use strict;
+use warnings;
+
+use DBI;
+use LWP::UserAgent;
+
+use base qw( XrefParser::BaseParser );
+
+# Fetch the CCDS report named by the "wget=>URL," argument and, for each
+# record whose CCDS id is a known CCDS xref and whose gene symbol is the
+# label of a described MGI xref, attach that MGI xref to the CCDS xref.
+
+# NOTE(review): this stand-alone stanza calls run(), which this package does
+# not define, and tests for one argument while advertising three - confirm
+# whether running the module directly is still meant to be supported.
+if (!defined(caller())) {
+
+  if (scalar(@ARGV) != 1) {
+    print STDERR "\nUsage: MGI_CCDS_Parser.pm file <source_id> <species_id>\n\n";
+    exit(1);
+  }
+
+  run(@ARGV);
+}
+
+
+# Add MGI xrefs via CCDS, matching the report's gene symbol to MGI labels.
+#
+#   $file       - argument string; the report URL is read from "wget=>URL,"
+#   $source_id  - source id passed on for every xref that is added
+#   $species_id - species id passed on for every xref that is added
+#   $verbose    - when true, print a summary of the counts at the end
+#
+# Returns 0 on success and 1 if no MGI source exists; dies with the HTTP
+# status line if the report cannot be fetched.
+sub run_script {
+
+  my ($self, $file, $source_id, $species_id, $verbose) = @_;
+
+  my $wget = "";
+  if ($file =~ /wget[=][>](\S+?)[,]/) {
+    $wget = $1;
+  }
+
+  my $dbi = $self->dbi();
+
+  #
+  # Collect the id of every source whose name matches MGI.
+  #
+  my @mgi_source_ids;
+  my $sth = $dbi->prepare('select source_id, priority_description from source where name like "MGI"');
+  $sth->execute();
+  while (my @row = $sth->fetchrow_array()) {
+    push @mgi_source_ids, $row[0];
+  }
+  $sth->finish;
+
+  # With no MGI source the "in (...)" list below would be empty, which is
+  # not valid SQL, so stop here with the error status instead.
+  if (!@mgi_source_ids) {
+    print STDERR "ERROR: no MGI source found in the source table\n";
+    return 1;
+  }
+
+  #
+  # Index the MGI xrefs that have a description, by label and by accession.
+  #
+  my %accession;      # label     -> accession
+  my %label;          # accession -> label
+  my %version;        # accession -> version
+  my %description;    # accession -> description
+
+  my $sql = "select accession, label, version, description from xref where source_id in (".join(", ",@mgi_source_ids).")";
+  $sth = $dbi->prepare($sql);
+  $sth->execute();
+  while (my ($acc, $lab, $ver, $desc) = $sth->fetchrow_array()) {
+    next if (!defined($desc));
+    $accession{$lab}   = $acc;
+    $label{$acc}       = $lab;
+    $version{$acc}     = $ver;
+    $description{$acc} = $desc;
+  }
+  $sth->finish;
+
+  #
+  # Get master xref ids via the ccds label.
+  #
+  my %ccds_label_to_xref_id;
+  $sth = $dbi->prepare('select x.label, x.xref_id from xref x, source s where x.source_id = s.source_id and s.name ="CCDS"');
+  $sth->execute();
+  while (my @row = $sth->fetchrow_array()) {
+    $ccds_label_to_xref_id{$row[0]} = $row[1];
+  }
+  $sth->finish;
+
+  my $ua = LWP::UserAgent->new();
+  $ua->timeout(10);
+  $ua->env_proxy();
+
+  my $response = $ua->get($wget);
+  if ( !$response->is_success() ) {
+    die $response->status_line;
+  }
+
+  my $count        = 0;
+  my $ccds_missing = 0;
+  my $mgi_missing  = 0;
+
+  #
+  # The report is tab separated, for example:
+  #
+  ##chromosome g_accession gene gene_id ccds_id ccds_status cds_strand cds_from cds_to cds_locations match_type
+  #1 NC_000067.5 Xkr4 497097 CCDS14803.1 Public - 3206102 3661428 [3206102-3207048, 3411782-3411981, 3660632-3661428] Identical
+  #1 NC_000067.5 Rp1h 19888 CCDS14804.1 Public - 4334680 4342905 [4334680-4340171, 4341990-4342161, 4342282-4342905] Identical
+  #
+  foreach my $line (split(/\n/, $response->content)) {
+
+    # The "#" header line and blank lines are not records; they used to be
+    # counted as unresolved CCDS ids.
+    next if ($line =~ /^#/ or $line !~ /\S/);
+
+    my ($chrom, $g_acc, $gene_name, $entrez_id, $ccds) = split(/\t/, $line);
+    if (!defined($ccds) or !defined($ccds_label_to_xref_id{$ccds})) {
+      $ccds_missing++;
+      next;
+    }
+
+    my $acc = $accession{$gene_name};
+    if (defined($acc) and defined($label{$acc})) {
+      $self->add_to_xrefs($ccds_label_to_xref_id{$ccds}, $acc, $version{$acc}, $label{$acc}, $description{$acc}, "", $source_id, $species_id);
+      $count++;
+    }
+    else {
+      $mgi_missing++;
+    }
+  }
+
+  print "$ccds_missing ccds not resolved, $mgi_missing mgi not found. Added $count MGI xrefs via CCDS\n" if($verbose);
+
+  # Explicit status, as in the other MGI parsers: without it the sub returned
+  # the value of the print above, i.e. a true value whenever $verbose was set.
+  return 0;
+}
+
+1;
+
diff --git a/misc-scripts/xref_mapping/XrefParser/MGI_Desc_Parser.pm b/misc-scripts/xref_mapping/XrefParser/MGI_Desc_Parser.pm
new file mode 100644
index 0000000000..5eeb670990
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/MGI_Desc_Parser.pm
@@ -0,0 +1,117 @@
+package XrefParser::MGI_Desc_Parser;
+
+use strict;
+use warnings;
+use File::Basename;
+
+use base qw( XrefParser::BaseParser );
+
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+
+# NOTE(review): run() is written as a method (it calls $self->get_filehandle)
+# and takes the source id before the files, so this stand-alone stanza does
+# not look usable as it is - confirm before relying on it.
+if (!defined(caller())) {
+
+  if (scalar(@ARGV) != 1) {
+    print STDERR "\nUsage: MGI_Desc_Parser.pm file <source_id> <species_id>\n\n";
+    exit(1);
+  }
+
+  run(@ARGV);
+}
+
+# Add one xref per MGI record of the first file, then attach the synonyms
+# listed in the second file to those xrefs.
+#
+#   $source_id    - source id for the xrefs; looked up from the file name
+#                   when undefined
+#   $species_id   - species id for the xrefs; looked up likewise
+#   $files        - array ref: [0] description file, [1] synonym file
+#   $release_file - accepted but not used by this parser
+#   $verbose      - when true, print counts and missing descriptions
+#
+# Returns 0 on success and 1 if either file cannot be opened.
+sub run {
+
+  # Unpacked in one go: the former "my $self = shift if (...)" relied on a
+  # conditional "my", whose behaviour perlsyn documents as undefined.
+  my ($self, $source_id, $species_id, $files, $release_file, $verbose) = @_;
+
+  my $file     = $files->[0];
+  my $syn_file = $files->[1];
+
+  if(!defined($source_id)){
+    $source_id = XrefParser::BaseParser->get_source_id_for_filename($file);
+  }
+  if(!defined($species_id)){
+    $species_id = XrefParser::BaseParser->get_species_id_for_filename($file);
+  }
+
+  my $mgi_io = $self->get_filehandle($file);
+  if ( !defined $mgi_io ) {
+    print STDERR "ERROR: Could not open $file\n";
+    return 1; # 1 is an error
+  }
+
+  my $xref_count = 0;
+  my %acc_to_xref;
+
+  # Records are the lines starting with " MGI:".  Split on whitespace they
+  # are read as: (empty), accession, chromosome, position, symbol, status
+  # and then the words of the description.
+  while ( defined( my $line = $mgi_io->getline() ) ) {
+    chomp $line;
+    next if ($line !~ /^ MGI:/);
+
+    my ($junk, $acc, $chr, $pos, $label, $status, @part_desc) = split(/\s+/, $line);
+    my $desc = join(" ", @part_desc);
+
+    $acc_to_xref{$acc} = $self->add_xref($acc, "", $label, $desc, $source_id, $species_id);
+    if ($verbose and $desc eq "") {
+      print "$acc has no description\n";
+    }
+    $xref_count++;
+  }
+  $mgi_io->close();
+
+  print $xref_count." MGI Description Xrefs added\n" if($verbose);
+
+  #
+  # Now process the synonyms
+  #
+  if ( !defined $syn_file ) {
+    print STDERR "ERROR: No synonym file given\n";
+    return 1; # 1 is an error
+  }
+
+  my $syn_io = $self->get_filehandle($syn_file);
+  if ( !defined $syn_io ) {
+    # Name the file that actually failed (this used to report $file).
+    print STDERR "ERROR: Could not open $syn_file\n";
+    return 1; # 1 is an error
+  }
+
+  my $syn_count = 0;
+  while ( defined( my $line = $syn_io->getline() ) ) {
+    chomp $line;
+    next if ($line !~ /^ MGI:/);
+
+    my ($junk, $acc, $chr, $pos, $symbol, @part_synonym) = split(/\s+/, $line);
+    my $syn = join(" ", @part_synonym);
+
+    # Lots of withdrawn entries have no xref; they are skipped silently.
+    if (defined($acc_to_xref{$acc})) {
+      $self->add_synonym($acc_to_xref{$acc}, $syn);
+      $syn_count++;
+    }
+  }
+  $syn_io->close();
+
+  print $syn_count." synonyms added\n" if($verbose);
+
+  return 0; #successful
+}
+
+1;
+
diff --git a/misc-scripts/xref_mapping/XrefParser/MGI_Vega_Parser.pm b/misc-scripts/xref_mapping/XrefParser/MGI_Vega_Parser.pm
new file mode 100644
index 0000000000..a7fcf99b5d
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/MGI_Vega_Parser.pm
@@ -0,0 +1,193 @@
+package XrefParser::MGI_Vega_Parser;
+
+use strict;
+use warnings;
+use Carp;
+use File::Basename;
+
+use base qw( XrefParser::BaseParser );
+
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+
+# NOTE(review): this stand-alone stanza calls run(), which this package does
+# not define, and tests for one argument while advertising three - confirm
+# whether running the module directly is still meant to be supported.
+if (!defined(caller())) {
+
+  if (scalar(@ARGV) != 1) {
+    print STDERR "\nUsage: MGI_Vega_Parser.pm file <source_id> <species_id>\n\n";
+    exit(1);
+  }
+
+  run(@ARGV);
+}
+
+# Add MGI xrefs as direct xrefs on genes.  The Vega database supplies the MGI
+# accession of the gene behind each Vega transcript, the core database maps
+# that transcript to an Ensembl transcript and on to its gene.
+#
+#   $file       - "type:args" string; args may override the defaults with
+#                 chost=>, cport=>, cdbname=>, cpass=> (core database) and
+#                 vhost=>, vport=>, vdbname=>, vpass=> (Vega database), each
+#                 value being terminated by a comma
+#   $source_id  - source id for the xrefs that are added
+#   $species_id - species id for the xrefs that are added
+#   $verbose    - when true, print progress messages
+#
+# Returns 0 on success and 1 if a database handle cannot be obtained; croaks
+# if a query fails to execute.
+sub run_script {
+
+  # Unpacked in one go: the former "my $self = shift if (...)" relied on a
+  # conditional "my", whose behaviour perlsyn documents as undefined.
+  my ($self, $file, $source_id, $species_id, $verbose) = @_;
+
+  my (undef, $my_args) = split(/:/, $file);
+  $my_args = "" if (!defined($my_args));
+
+  # Connection defaults.  Only the keys listed in the loop can be overridden
+  # from the argument string; the user names are fixed.
+  my %db = (
+    cuser   => "ensro",
+    chost   => "ens-staging",
+    cport   => "3306",
+    cdbname => "",
+    cpass   => undef,
+    vuser   => "ensro",
+    vhost   => "ens-staging",
+    vport   => "3306",
+    vdbname => "mus_musculus_vega_51_37d",
+    vpass   => undef,
+  );
+  foreach my $key (qw(chost cport cdbname cpass vhost vport vdbname vpass)) {
+    if ($my_args =~ /${key}[=][>](\S+?)[,]/) {
+      $db{$key} = $1;
+    }
+  }
+
+  my $xref_count = 0;
+
+  #
+  # Need the labels and descriptions for the primary accessions.
+  #
+  my %mgi_to_label;
+  my %mgi_to_desc;
+  my %mgi_syn;    # synonym -> primary accession
+
+  my $sth = $self->dbi()->prepare("SELECT x.accession, x.label, x.description from xref x, source s where x.source_id = s.source_id and s.name like 'MGI' and s.priority_description like 'descriptions'");
+  $sth->execute() or croak( $self->dbi()->errstr() );
+  while ( my @row = $sth->fetchrow_array() ) {
+    $mgi_to_label{$row[0]} = $row[1];
+    $mgi_to_desc{$row[0]}  = $row[2];
+  }
+  $sth->finish;
+
+  #
+  # Also the synonyms, so that a Vega label can be resolved through them.
+  #
+  $sth = $self->dbi()->prepare("SELECT sy.synonym, x.accession from xref x, source s, synonym sy where sy.xref_id = x.xref_id and x.source_id = s.source_id and s.name like 'MGI' and s.priority_description like 'descriptions'");
+  $sth->execute() or croak( $self->dbi()->errstr() );
+  while ( my @row = $sth->fetchrow_array() ) {
+    $mgi_syn{$row[0]} = $row[1];
+  }
+  $sth->finish;
+
+  my $dbi2 = $self->dbi2($db{chost}, $db{cport}, $db{cuser}, $db{cdbname}, $db{cpass});
+  if(!defined($dbi2)){
+    return 1;
+  }
+
+  #
+  # Core database: OTT transcript accession -> Ensembl transcript stable id.
+  #
+  my $core_sql = 'select tsi.stable_id, x.dbprimary_acc from transcript_stable_id tsi, transcript t, object_xref ox, xref x, external_db e where tsi.transcript_id = t.transcript_id and ox.ensembl_id = t.transcript_id and ox.ensembl_object_type = "Transcript" and ox.xref_id = x.xref_id and x.external_db_id = e.external_db_id and e.db_name like "%OTTT"';
+
+  my %ott_to_enst;
+  $sth = $dbi2->prepare($core_sql);
+  $sth->execute() or croak( $dbi2->errstr() );
+  while ( my @row = $sth->fetchrow_array() ) {
+    $ott_to_enst{$row[1]} = $row[0];
+  }
+  $sth->finish;
+
+  #
+  # get the enst->ensg mappings.
+  #
+  my %enst_to_ensg;
+  $sth = $dbi2->prepare("select gsi.stable_id, tsi.stable_id from transcript t, gene_stable_id gsi, transcript_stable_id tsi where tsi.transcript_id = t.transcript_id and t.gene_id = gsi.gene_id");
+  $sth->execute() or croak( $dbi2->errstr() );
+  while ( my @row = $sth->fetchrow_array() ) {
+    $enst_to_ensg{$row[1]} = $row[0];
+  }
+  $sth->finish;
+
+  #
+  # Get the ott -> mgi mappings
+  #
+  my $vega_sql = (<<VSQL);
+SELECT DISTINCT(tsi.stable_id) , x.dbprimary_acc, x.display_label
+  FROM transcript_stable_id tsi
+    INNER JOIN transcript t ON tsi.transcript_id = t.transcript_id
+    INNER JOIN gene g ON g.gene_id = t.gene_id
+    INNER JOIN object_xref ox ON ox.ensembl_id = g.gene_id
+    INNER JOIN xref x ON x.xref_id = ox.xref_id
+    INNER JOIN external_db e ON e.external_db_id = x.external_db_id
+  WHERE ox.ensembl_object_type = "Gene"
+    AND e.db_name like "MGI"
+VSQL
+
+  my $dbi3 = $self->dbi2($db{vhost}, $db{vport}, $db{vuser}, $db{vdbname}, $db{vpass});
+  if(!defined($dbi3)){
+    return 1;
+  }
+
+  my %seen;
+  $sth = $dbi3->prepare($vega_sql);
+  $sth->execute() or croak( $dbi3->errstr() );
+  while ( my @row = $sth->fetchrow_array() ) {
+    # [0] OTTMUST..., [1] MGI:123456 [2] Asx15 etc
+    my ($ott, $mgi_acc, $mgi_label) = @row;
+
+    my $tran_stable_id = $ott_to_enst{$ott};
+    next if (!defined($tran_stable_id));
+
+    # A direct gene xref needs a gene to hang on.
+    my $ensg = $enst_to_ensg{$tran_stable_id};
+    next if (!defined($ensg));
+
+    my $prim_acc = $mgi_acc;
+    my $desc     = "";
+    my $label    = "";
+    if (defined($mgi_to_desc{$mgi_acc})) {
+      $desc  = $mgi_to_desc{$mgi_acc};
+      $label = $mgi_to_label{$mgi_acc};
+    }
+    elsif ( defined($mgi_syn{$mgi_label}) and defined($mgi_to_desc{$mgi_syn{$mgi_label}}) ) { # synonym
+      # Take accession, description AND label from the entry the synonym
+      # points to.  The label used to be looked up under the Vega accession,
+      # which paired the new accession with another entry's label (or none).
+      $prim_acc = $mgi_syn{$mgi_label};
+      $desc     = $mgi_to_desc{$prim_acc};
+      $label    = $mgi_to_label{$prim_acc};
+    }
+    else {
+      print "VEGA: $mgi_acc [".$mgi_label."} has no description\n" if($verbose);
+    }
+
+    my $xref_id = $self->add_xref($prim_acc, "" , $label , $desc, $source_id, $species_id);
+    if (!defined($seen{$xref_id.$ensg})) {
+      $xref_count++;
+      $self->add_direct_xref($xref_id, $ensg , "Gene", "");
+      $seen{$xref_id.$ensg} = 1;
+    }
+  }
+
+  print "$xref_count direct xrefs succesfully parsed\n" if($verbose);
+
+  # The synonyms are not copied onto these xrefs here; that is done in the
+  # mapper.
+  return 0;
+}
+
+1;
+
diff --git a/misc-scripts/xref_mapping/XrefParser/MGI_curated_transcriptParser.pm b/misc-scripts/xref_mapping/XrefParser/MGI_curated_transcriptParser.pm
new file mode 100644
index 0000000000..d9f219c410
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/MGI_curated_transcriptParser.pm
@@ -0,0 +1,166 @@
+package XrefParser::MGI_curated_transcriptParser;
+
+use strict;
+use warnings;
+use Carp;
+use File::Basename;
+
+use base qw( XrefParser::BaseParser );
+
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+
+# NOTE(review): this stand-alone stanza calls run(), which this package does
+# not define, and tests for one argument while advertising three - confirm
+# whether running the module directly is still meant to be supported.
+if (!defined(caller())) {
+
+  if (scalar(@ARGV) != 1) {
+    print STDERR "\nUsage: MGI_curated_transcriptParser.pm file <source_id> <species_id>\n\n";
+    exit(1);
+  }
+
+  run(@ARGV);
+}
+
+# Add the Vega transcript names as direct xrefs on Ensembl transcripts.
+# Names containing a "." go to the Clone_based_vega_transcript source, all
+# others to MGI_curated_transcript with, where the name minus its "-NNN"
+# suffix is a known MGI label, the MGI accession and description.
+#
+#   $file       - "type:args" string; args may override the defaults with
+#                 chost=>, cport=>, cdbname=>, cpass=> (core database) and
+#                 vhost=>, vport=>, vdbname=>, vpass=> (Vega database), each
+#                 value being terminated by a comma
+#   $source_id  - accepted but not used; the two sources are looked up by name
+#   $species_id - species id for the xrefs that are added
+#   $verbose    - when true, print progress messages
+#
+# Returns 0 on success and 1 if a database handle cannot be obtained; croaks
+# if a query fails to execute.
+sub run_script {
+
+  # Unpacked in one go: the former "my $self = shift if (...)" relied on a
+  # conditional "my", whose behaviour perlsyn documents as undefined.
+  my ($self, $file, $source_id, $species_id, $verbose) = @_;
+
+  my (undef, $my_args) = split(/:/, $file);
+  $my_args = "" if (!defined($my_args));
+
+  # Connection defaults.  Only the keys listed in the loop can be overridden
+  # from the argument string; the user names are fixed.
+  my %db = (
+    cuser   => "ensro",
+    chost   => "ens-staging",
+    cport   => "3306",
+    cdbname => "",
+    cpass   => undef,
+    vuser   => "ensro",
+    vhost   => "ens-staging",
+    vport   => "3306",
+    vdbname => "mus_musculus_vega_51_37d",
+    vpass   => undef,
+  );
+  foreach my $key (qw(chost cport cdbname cpass vhost vport vdbname vpass)) {
+    if ($my_args =~ /${key}[=][>](\S+?)[,]/) {
+      $db{$key} = $1;
+    }
+  }
+
+  my $xref_count = 0;
+
+  my $clone_source_id =
+    $self->get_source_id_for_source_name('Clone_based_vega_transcript');
+  my $curated_source_id =
+    $self->get_source_id_for_source_name('MGI_curated_transcript');
+
+  my %name_to_mgi_number = %{ $self->get_label_to_acc( "MGI", $species_id ) };
+  my %name_to_mgi_desc = %{ $self->get_label_to_desc( "MGI", $species_id, "descriptions") };
+
+  my $core_sql = 'select tsi.stable_id, x.dbprimary_acc from transcript_stable_id tsi, transcript t, object_xref ox, xref x, external_db e where tsi.transcript_id = t.transcript_id and ox.ensembl_id = t.transcript_id and ox.ensembl_object_type = "Transcript" and ox.xref_id = x.xref_id and x.external_db_id = e.external_db_id and e.db_name like "%OTTT"';
+
+  my $vega_sql = 'select x.dbprimary_acc, x.display_label from xref x, external_db e where x.external_db_id = e.external_db_id and e.db_name like "Vega_transcript" and display_label not like "OTT%"';
+
+  my $dbi2 = $self->dbi2($db{chost}, $db{cport}, $db{cuser}, $db{cdbname}, $db{cpass});
+  if(!defined($dbi2)){
+    return 1;
+  }
+
+  # OTT transcript accession -> Ensembl transcript stable id.
+  my %ott_to_enst;
+  my $sth = $dbi2->prepare($core_sql);
+  $sth->execute() or croak( $dbi2->errstr() );
+  while ( my @row = $sth->fetchrow_array() ) {
+    $ott_to_enst{$row[1]} = $row[0];
+  }
+  $sth->finish;
+
+  my $dbi3 = $self->dbi2($db{vhost}, $db{vport}, $db{vuser}, $db{vdbname}, $db{vpass});
+  if(!defined($dbi3)){
+    return 1;
+  }
+
+  $sth = $dbi3->prepare($vega_sql);
+  $sth->execute() or croak( $dbi3->errstr() );
+  while ( my @row = $sth->fetchrow_array() ) {
+    # [0] OTTMUST..., [1] Mrpl15-001 etc
+    my ($ott, $name) = @row;
+    next if (!defined($ott_to_enst{$ott}));
+
+    $name =~ s/^WU://;
+
+    my $id       = $curated_source_id;
+    my $prim_acc = $name;
+    my $desc     = "";
+    if ($name =~ /[.]/) {
+      $id = $clone_source_id;
+    }
+    else {
+      # find MGI name: whatever precedes the first "-NNN"
+      my ($mgi_bit) = split(/-\d\d\d/, $name);
+      $mgi_bit = "" if (!defined($mgi_bit));
+
+      if (defined($name_to_mgi_number{$mgi_bit})) {
+        $prim_acc = $name_to_mgi_number{$mgi_bit};
+      }
+      if (defined($name_to_mgi_desc{$mgi_bit})) {
+        $desc = $name_to_mgi_desc{$mgi_bit};
+      }
+      else {
+        print "$mgi_bit has no description\n" if($verbose);
+      }
+    }
+
+    my $xref_id = $self->add_xref($prim_acc, "" , $name , $desc, $id, $species_id);
+    $xref_count++;
+
+    $self->add_direct_xref($xref_id, $ott_to_enst{$ott}, "Transcript", "");
+  }
+
+  print "$xref_count direct xrefs succesfully parsed\n" if($verbose);
+
+  #Finally add the synonyms:-
+  my $synonym_sql = (<<SYNO);
+SELECT x2.xref_id, s.synonym
+  FROM synonym s
+    INNER JOIN xref x1 ON x1.xref_id = s.xref_id
+    INNER JOIN xref x2 ON x2.accession = x1.accession
+    INNER JOIN source s1 ON s1.source_id = x1.source_id
+    INNER JOIN source s2 ON s2.source_id = x2.source_id
+  WHERE x2.xref_id != x1.xref_id
+    AND s2.name = "MGI_curated_transcript"
+    AND s1.name = "MGI"
+    AND s1.priority_description = "descriptions"
+SYNO
+
+  $sth = $self->dbi()->prepare($synonym_sql);
+  $sth->execute() or croak( $self->dbi()->errstr() );
+  while ( my @row = $sth->fetchrow_array() ) {
+    $self->add_synonym($row[0], $row[1]);
+  }
+  $sth->finish;
+
+  return 0;
+}
+
+1;
+
-- 
GitLab