Skip to content
Snippets Groups Projects
Commit 95fde014 authored by Ian Longden's avatar Ian Longden
Browse files

new parsers

parent 4bcc8e84
No related branches found
No related tags found
No related merge requests found
package XrefParser::MGI_CCDS_Parser;
use strict;
use DBI;
use base qw( XrefParser::BaseParser );
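# Download the CCDS table named by the "wget=>URL," part of the file argument.
# For every row whose CCDS id is already stored as a CCDS xref, the MGI xref
# matching the row's gene symbol is passed to add_to_xrefs() together with the
# CCDS master xref id.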
# Script entry point: this body only executes when caller() is undefined.
# NOTE(review): the usage text names three arguments although exactly one is
# required, and no sub called "run" is visible in this package (only
# run_script) - confirm the intended stand-alone behaviour.
unless ( defined caller() ) {
    unless ( @ARGV == 1 ) {
        print STDERR "\nUsage: MGI_CCDS_Parser.pm file <source_id> <species_id>\n\n";
        exit(1);
    }
    run(@ARGV);
}
# Add MGI xrefs to existing CCDS master xrefs using a downloaded CCDS table.
#
# Arguments:
#   $self       - parser object; dbi() and add_to_xrefs() are called on it
#   $file       - argument string; the table URL is read from "wget=>URL,"
#   $source_id  - source id handed to add_to_xrefs()
#   $species_id - species id handed to add_to_xrefs()
#   $verbose    - when true, a one line summary is printed to STDOUT
#
# Returns 0 on success (the sibling parsers use 0 for success, 1 for error).
# Dies when no URL can be extracted or when the download fails.
sub run_script {
    my ($self, $file, $source_id, $species_id, $verbose) = @_;

    my $wget = "";
    if (defined $file and $file =~ /wget[=][>](\S+?)[,]/) {
        $wget = $1;
    }
    # Fail early with a readable message instead of an opaque HTTP error.
    if ($wget eq "") {
        die "No 'wget=>URL,' argument found in '"
          . (defined $file ? $file : '') . "'\n";
    }

    my $dbi = $self->dbi();

    #
    # Collect the id of every source named MGI.
    #
    my $sth = $dbi->prepare('select source_id, priority_description from source where name like "MGI"');
    $sth->execute();
    my @mgi_source_ids;
    while (my ($mgi_source_id) = $sth->fetchrow_array()) {
        push @mgi_source_ids, $mgi_source_id;
    }
    $sth->finish;

    #
    # Load the MGI xrefs that have a description, keyed by accession, plus a
    # label -> accession lookup.
    #
    my (%label, %version, %description, %accession);
    # An empty id list would produce the invalid SQL "in ()", so skip the query.
    if (@mgi_source_ids) {
        $sth = $dbi->prepare("select accession, label, version, description from xref where source_id in ("
                             . join(", ", @mgi_source_ids) . ")");
        $sth->execute();
        while (my ($acc, $lab, $ver, $desc) = $sth->fetchrow_array()) {
            next if (!defined($desc));
            $accession{$lab}   = $acc;
            $label{$acc}       = $lab;
            $version{$acc}     = $ver;
            $description{$acc} = $desc;
        }
        $sth->finish;
    }

    #
    # Get master xref ids via the ccds label.
    #
    my %ccds_label_to_xref_id;
    $sth = $dbi->prepare('select x.label, x.xref_id from xref x, source s where x.source_id = s.source_id and s.name ="CCDS"');
    $sth->execute();
    while (my ($ccds_label, $xref_id) = $sth->fetchrow_array()) {
        $ccds_label_to_xref_id{$ccds_label} = $xref_id;
    }
    $sth->finish;

    # Loaded here so the block does not rely on another module having
    # pulled LWP::UserAgent in already.
    require LWP::UserAgent;
    my $ua = LWP::UserAgent->new();
    $ua->timeout(10);
    $ua->env_proxy();

    my $response = $ua->get($wget);
    if ( !$response->is_success() ) {
        die $response->status_line;
    }

    my $count              = 0;
    my $ccds_missing       = 0;
    my $entrezgene_missing = 0;

    #
    # Tab separated table; example content:
    #
    ##chromosome g_accession gene gene_id ccds_id ccds_status cds_strand cds_from cds_to cds_locations match_type
    #1 NC_000067.5 Xkr4 497097 CCDS14803.1 Public - 3206102 3661428 [3206102-3207048, 3411782-3411981, 3660632-3661428] Identical
    #1 NC_000067.5 Rp1h 19888 CCDS14804.1 Public - 4334680 4342905 [4334680-4340171, 4341990-4342161, 4342282-4342905] Identical
    foreach my $line (split(/\n/, $response->content)) {
        # The "#..." header line and blank lines are not records, so they
        # must not be counted as unresolved CCDS entries.
        next if ($line =~ /^\s*(?:#|$)/);

        my (undef, undef, $gene_name, undef, $ccds) = split(/\t/, $line);

        # Short rows leave $ccds undefined; treat them as unresolved.
        if (!defined($ccds) or !defined($ccds_label_to_xref_id{$ccds})) {
            $ccds_missing++;
            next;
        }

        my $acc = $accession{$gene_name};
        if (defined($acc) and defined($label{$acc})) {
            $self->add_to_xrefs($ccds_label_to_xref_id{$ccds}, $acc, $version{$acc}, $label{$acc}, $description{$acc}, "", $source_id, $species_id);
            $count++;
        }
        else {
            $entrezgene_missing++;
        }
    }

    print "$ccds_missing ccds not resolved, $entrezgene_missing mgi not found. Added $count MGI xrefs via CCDS\n" if($verbose);

    # Explicit success code; without it the sub returned print()'s true value
    # whenever $verbose was set.
    return 0;
}
1;
package XrefParser::MGI_Desc_Parser;
use strict;
use File::Basename;
use base qw( XrefParser::BaseParser );
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
#my $dbi2;
# Script entry point: this body only executes when caller() is undefined.
# NOTE(review): the usage text names three arguments although exactly one is
# required - confirm the intended stand-alone behaviour.
unless ( defined caller() ) {
    unless ( @ARGV == 1 ) {
        print STDERR "\nUsage: MGI_Desc_Parser.pm file <source_id> <species_id>\n\n";
        exit(1);
    }
    run(@ARGV);
}
# Load MGI description xrefs and their synonyms from two report files.
#
# Arguments (after the optional invocant):
#   $source_id    - source id; looked up from the file name when undefined
#   $species_id   - species id; looked up from the file name when undefined
#   $files        - array ref: [0] is the description file, [1] the synonym file
#   $release_file - accepted but not used here
#   $verbose      - when true, progress counts are printed to STDOUT
#
# Returns 0 on success and 1 when either file cannot be opened.
sub run {
    # The invocant is only removed from @_ when caller(1) is defined. It is
    # declared on its own line because "my ... if" has undefined behaviour.
    my $self;
    $self = shift if (defined(caller(1)));

    my $source_id    = shift;
    my $species_id   = shift;
    my $files        = shift;
    my $release_file = shift;
    my $verbose      = shift;

    my $file     = $files->[0];
    my $syn_file = $files->[1];

    if(!defined($source_id)){
        $source_id = XrefParser::BaseParser->get_source_id_for_filename($file);
    }
    if(!defined($species_id)){
        $species_id = XrefParser::BaseParser->get_species_id_for_filename($file);
    }

    my $mgi_io = $self->get_filehandle($file);
    if ( !defined $mgi_io ) {
        print STDERR "ERROR: Could not open $file\n";
        return 1; # 1 is an error
    }

    my $xref_count = 0;
    my %acc_to_xref;

    #MGI Marker associations to Sequence (GenBank or RefSeq) information (tab-delimited)
    #MGI Marker Accession ID Marker Symbol Status Marker Type Marker Name cM Position Chromosome GenBank Accession IDs
    #(space-delimited) Unigene ID
    #(if any) RefSeq ID
    #(if any)
    while ( defined( my $line = $mgi_io->getline() ) ) {
        chomp $line;
        next if ($line !~ /^ MGI:/);

        # The first field is the empty string in front of the leading
        # whitespace; fields 3, 4 and 6 are not used.
        my (undef, $acc, undef, undef, $label, undef, @part_desc) = split(/\s+/, $line);
        my $desc = join(" ", @part_desc);

        $acc_to_xref{$acc} = $self->add_xref($acc,"",$label,$desc,$source_id,$species_id);
        if($verbose and $desc eq ""){
            print "$acc has no description\n";
        }
        $xref_count++;
    }
    $mgi_io->close();

    print $xref_count." MGI Description Xrefs added\n" if($verbose);

    #
    # Now process the synonyms
    #
    my $syn_io = $self->get_filehandle($syn_file);
    if ( !defined $syn_io ) {
        # Report the file that actually failed to open.
        print STDERR "ERROR: Could not open $syn_file\n";
        return 1; # 1 is an error
    }

    my $syn_count = 0;
    while ( defined( my $line = $syn_io->getline() ) ) {
        chomp $line;
        next if ($line !~ /^ MGI:/);

        my (undef, $acc, undef, undef, undef, @part_synonym) = split(/\s+/, $line);
        my $syn = join(" ", @part_synonym);

        # Lots of withdrawn entries have no xref; those are skipped silently.
        if(defined($acc_to_xref{$acc})){
            $self->add_synonym($acc_to_xref{$acc}, $syn);
            $syn_count++;
        }
    }
    $syn_io->close();

    print $syn_count." synonyms added\n" if($verbose);

    return 0; #successful
}
1;
package XrefParser::MGI_Vega_Parser;
use strict;
use File::Basename;
use base qw( XrefParser::BaseParser );
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
#my $dbi2;
# Script entry point: this body only executes when caller() is undefined.
# NOTE(review): the usage text names three arguments although exactly one is
# required, and no sub called "run" is visible in this package (only
# run_script) - confirm the intended stand-alone behaviour.
unless ( defined caller() ) {
    unless ( @ARGV == 1 ) {
        print STDERR "\nUsage: MGI_Vega_Parser.pm file <source_id> <species_id>\n\n";
        exit(1);
    }
    run(@ARGV);
}
# Add MGI xrefs as Gene direct xrefs, using the MGI accessions attached to
# genes in a Vega database and the OTT transcript ids stored in a core database.
#
# Arguments (after the optional invocant):
#   $file       - "type:options"; options are "key=>value," pairs for
#                 chost, cport, cdbname, cpass (core database) and
#                 vhost, vport, vdbname, vpass (Vega database)
#   $source_id  - source id handed to add_xref()
#   $species_id - species id handed to add_xref()
#   $verbose    - when true, progress messages are printed to STDOUT
#
# Returns 0 on success and 1 when either database handle cannot be obtained.
# Croaks when a query fails to execute.
sub run_script {
    # The invocant is only removed from @_ when caller(1) is defined. It is
    # declared on its own line because "my ... if" has undefined behaviour.
    my $self;
    $self = shift if (defined(caller(1)));

    my $file       = shift;
    my $source_id  = shift;
    my $species_id = shift;
    my $verbose    = shift;

    # croak() was called without Carp ever being loaded; load it here and
    # use the fully qualified name.
    require Carp;

    my (undef, $my_args) = split(/:/, $file);
    $my_args = "" if (!defined($my_args));   # no ":options" part supplied

    my $cuser = "ensro";
    my $vuser = "ensro";

    # Connection defaults, each replaceable by a "key=>value," option.
    my %conn = (
        chost   => "ens-staging",
        cport   => "3306",
        cdbname => "",
        cpass   => undef,
        vhost   => "ens-staging",
        vport   => "3306",
        vdbname => "mus_musculus_vega_51_37d",
        vpass   => undef,
    );
    foreach my $key (keys %conn) {
        if ($my_args =~ /\Q$key\E=>(\S+?),/) {
            $conn{$key} = $1;
        }
    }

    my $xref_count = 0;

    # NOTE(review): neither id is used below; the lookups are kept because
    # their side effects are not visible from here.
    my $clone_source_id =
      $self->get_source_id_for_source_name('Clone_based_vega_transcript');
    my $curated_source_id =
      $self->get_source_id_for_source_name('MGI_curated_transcript');

    #
    # Need to get labels and descriptions for the primary accessions.
    #
    my %mgi_to_label;
    my %mgi_to_desc;
    my %mgi_syn;

    my $sth = $self->dbi()->prepare("SELECT x.accession, x.label, x.description from xref x, source s where x.source_id = s.source_id and s.name like 'MGI' and s.priority_description like 'descriptions'");
    $sth->execute() or Carp::croak( $self->dbi()->errstr() );
    while ( my ($acc, $label, $desc) = $sth->fetchrow_array() ) {
        $mgi_to_label{$acc} = $label;
        $mgi_to_desc{$acc}  = $desc;
    }
    $sth->finish;

    #
    # Also load the synonyms (synonym -> accession).
    #
    $sth = $self->dbi()->prepare("SELECT sy.synonym, x.accession from xref x, source s, synonym sy where sy.xref_id = x.xref_id and x.source_id = s.source_id and s.name like 'MGI' and s.priority_description like 'descriptions'");
    $sth->execute() or Carp::croak( $self->dbi()->errstr() );
    while ( my ($synonym, $acc) = $sth->fetchrow_array() ) {
        $mgi_syn{$synonym} = $acc;
    }
    $sth->finish;

    #
    # Get the ott -> enst mappings from the core database.
    #
    my $core_sql = 'select tsi.stable_id, x.dbprimary_acc from transcript_stable_id tsi, transcript t, object_xref ox, xref x, external_db e where tsi.transcript_id = t.transcript_id and ox.ensembl_id = t.transcript_id and ox.ensembl_object_type = "Transcript" and ox.xref_id = x.xref_id and x.external_db_id = e.external_db_id and e.db_name like "%OTTT"';

    my %ott_to_enst;

    my $dbi2 = $self->dbi2($conn{chost}, $conn{cport}, $cuser, $conn{cdbname}, $conn{cpass});
    if(!defined($dbi2)){
        return 1;
    }

    $sth = $dbi2->prepare($core_sql);
    $sth->execute() or Carp::croak( $dbi2->errstr() );
    while ( my ($enst, $ott) = $sth->fetchrow_array() ) {
        $ott_to_enst{$ott} = $enst;
    }
    $sth->finish;

    #
    # get the enst->ensg mappings.
    #
    my %enst_to_ensg;
    $sth = $dbi2->prepare("select gsi.stable_id, tsi.stable_id from transcript t, gene_stable_id gsi, transcript_stable_id tsi where tsi.transcript_id = t.transcript_id and t.gene_id = gsi.gene_id");
    $sth->execute() or Carp::croak( $dbi2->errstr() );
    while ( my ($ensg, $enst) = $sth->fetchrow_array() ) {
        $enst_to_ensg{$enst} = $ensg;
    }
    $sth->finish;

    #
    # Get the ott -> mgi mappings
    #
    my $vega_sql = (<<VSQL);
SELECT DISTINCT(tsi.stable_id) , x.dbprimary_acc, x.display_label
FROM transcript_stable_id tsi
INNER JOIN transcript t ON tsi.transcript_id = t.transcript_id
INNER JOIN gene g ON g.gene_id = t.gene_id
INNER JOIN object_xref ox ON ox.ensembl_id = g.gene_id
INNER JOIN xref x ON x.xref_id = ox.xref_id
INNER JOIN external_db e ON e.external_db_id = x.external_db_id
WHERE ox.ensembl_object_type = "Gene"
AND e.db_name like "MGI"
VSQL

    my $dbi3 = $self->dbi2($conn{vhost}, $conn{vport}, $vuser, $conn{vdbname}, $conn{vpass});
    if(!defined($dbi3)){
        return 1;
    }

    my %seen;

    $sth = $dbi3->prepare($vega_sql);
    $sth->execute() or Carp::croak( $dbi3->errstr() );
    while ( my @row = $sth->fetchrow_array() ) {
        # [0] OTTMUST..., [1] MGI:123456 [2] Asx15 etc
        my ($ott, $prim_acc, $vega_label) = @row;

        # Only transcripts that the core database knows about are of interest.
        next if (!defined($ott_to_enst{$ott}));

        my $tran_stable_id = $ott_to_enst{$ott};
        my $name  = $prim_acc;
        my $desc  = "";
        my $label = "";

        if(defined($mgi_to_desc{$name})){
            $desc  = $mgi_to_desc{$name};
            $label = $mgi_to_label{$name};
        }
        elsif( defined( $mgi_syn{$vega_label} ) and defined( $mgi_to_desc{$mgi_syn{$vega_label}})){ # synonym
            # The Vega label is a synonym of another accession: take the
            # accession, description AND label from that entry. The label
            # used to be looked up with the old accession.
            $prim_acc = $mgi_syn{$vega_label};
            $desc     = $mgi_to_desc{$prim_acc};
            $label    = $mgi_to_label{$prim_acc};
        }
        else{
            print "VEGA: $name [".$vega_label."} has no description\n" if($verbose);
        }

        my $xref_id = $self->add_xref($prim_acc, "" , $label , $desc, $source_id, $species_id);
        my $ensg    = $enst_to_ensg{$tran_stable_id};

        # Several transcripts share a gene; link each xref/gene pair once only.
        if(!defined($seen{$xref_id.$ensg})){
            $xref_count++;
            $self->add_direct_xref($xref_id, $ensg , "Gene", "");
            $seen{$xref_id.$ensg} = 1;
        }
    }

    print "$xref_count direct xrefs succesfully parsed\n" if($verbose);

    # Done in the mapper
    # #
    # # Finally add the synonyms
    # #
    # my $synonym_sql = (<<SYNO);
    #SELECT x2.xref_id, s.synonym
    # FROM synonym s
    # INNER JOIN xref x1 ON x1.xref_id = s.xref_id
    # INNER JOIN xref x2 ON x2.accession = x1.accession
    # INNER JOIN source s1 ON s1.source_id = x1.source_id
    # INNER JOIN source s2 ON s2.source_id = x2.source_id
    # WHERE x2.xref_id != x1.xref_id
    # AND s2.name = "MGI"
    # AND s2.priority_description = "vega"
    # AND s1.name = "MGI"
    # AND s1.priority_description = "descriptions"
    #SYNO
    # $sth = $self->dbi()->prepare($synonym_sql);
    # $sth->execute() or croak( $self->dbi()->errstr() );
    # while ( my @row = $sth->fetchrow_array() ) {
    # $self->add_synonym($row[0], $row[1]);
    # }
    # $sth->finish;

    return 0;
}
1;
package XrefParser::MGI_curated_transcriptParser;
use strict;
use File::Basename;
use base qw( XrefParser::BaseParser );
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
#my $dbi2;
# Script entry point: this body only executes when caller() is undefined.
# NOTE(review): the usage text names three arguments although exactly one is
# required, and no sub called "run" is visible in this package (only
# run_script) - confirm the intended stand-alone behaviour.
unless ( defined caller() ) {
    unless ( @ARGV == 1 ) {
        print STDERR "\nUsage: MGI_curated_transcriptParser.pm file <source_id> <species_id>\n\n";
        exit(1);
    }
    run(@ARGV);
}
# Add Transcript direct xrefs named after the Vega transcript labels, then copy
# MGI synonyms onto the MGI_curated_transcript xrefs sharing an accession.
#
# Arguments (after the optional invocant):
#   $file       - "type:options"; options are "key=>value," pairs for
#                 chost, cport, cdbname, cpass (core database) and
#                 vhost, vport, vdbname, vpass (Vega database)
#   $source_id  - accepted but not used; the source ids are looked up by name
#   $species_id - species id handed to add_xref() and the label lookups
#   $verbose    - when true, progress messages are printed to STDOUT
#
# Returns 0 on success and 1 when either database handle cannot be obtained.
# Croaks when a query fails to execute.
sub run_script {
    # The invocant is only removed from @_ when caller(1) is defined. It is
    # declared on its own line because "my ... if" has undefined behaviour.
    my $self;
    $self = shift if (defined(caller(1)));

    my $file       = shift;
    my $source_id  = shift;
    my $species_id = shift;
    my $verbose    = shift;

    # croak() was called without Carp ever being loaded; load it here and
    # use the fully qualified name.
    require Carp;

    my (undef, $my_args) = split(/:/, $file);
    $my_args = "" if (!defined($my_args));   # no ":options" part supplied

    my $cuser = "ensro";
    my $vuser = "ensro";

    # Connection defaults, each replaceable by a "key=>value," option.
    my %conn = (
        chost   => "ens-staging",
        cport   => "3306",
        cdbname => "",
        cpass   => undef,
        vhost   => "ens-staging",
        vport   => "3306",
        vdbname => "mus_musculus_vega_51_37d",
        vpass   => undef,
    );
    foreach my $key (keys %conn) {
        if ($my_args =~ /\Q$key\E=>(\S+?),/) {
            $conn{$key} = $1;
        }
    }

    my $xref_count = 0;

    my $clone_source_id =
      $self->get_source_id_for_source_name('Clone_based_vega_transcript');
    my $curated_source_id =
      $self->get_source_id_for_source_name('MGI_curated_transcript');

    my %name_to_mgi_number = %{ $self->get_label_to_acc( "MGI", $species_id ) };
    my %name_to_mgi_desc   = %{ $self->get_label_to_desc( "MGI", $species_id, "descriptions") };

    my $core_sql = 'select tsi.stable_id, x.dbprimary_acc from transcript_stable_id tsi, transcript t, object_xref ox, xref x, external_db e where tsi.transcript_id = t.transcript_id and ox.ensembl_id = t.transcript_id and ox.ensembl_object_type = "Transcript" and ox.xref_id = x.xref_id and x.external_db_id = e.external_db_id and e.db_name like "%OTTT"';

    my $vega_sql = 'select x.dbprimary_acc, x.display_label from xref x, external_db e where x.external_db_id = e.external_db_id and e.db_name like "Vega_transcript" and display_label not like "OTT%"';

    #
    # Get the ott -> enst mappings from the core database.
    #
    my %ott_to_enst;

    my $dbi2 = $self->dbi2($conn{chost}, $conn{cport}, $cuser, $conn{cdbname}, $conn{cpass});
    if(!defined($dbi2)){
        return 1;
    }

    my $sth = $dbi2->prepare($core_sql);
    $sth->execute() or Carp::croak( $dbi2->errstr() );
    while ( my ($enst, $ott) = $sth->fetchrow_array() ) {
        $ott_to_enst{$ott} = $enst;
    }
    $sth->finish;

    my $dbi3 = $self->dbi2($conn{vhost}, $conn{vport}, $vuser, $conn{vdbname}, $conn{vpass});
    if(!defined($dbi3)){
        return 1;
    }

    $sth = $dbi3->prepare($vega_sql);
    $sth->execute() or Carp::croak( $dbi3->errstr() );
    while ( my @row = $sth->fetchrow_array() ) {
        # [0] OTTMUST..., [1] Mrpl15-001 etc
        my ($ott, $name) = @row;

        # Only transcripts that the core database knows about are of interest.
        next if (!defined($ott_to_enst{$ott}));

        my $stable_id = $ott_to_enst{$ott};
        my $id        = $curated_source_id;
        my $desc      = "";

        $name =~ s/^WU://;
        my $prim_acc = $name;

        if($name =~ /[.]/){
            # Names containing a dot go to the clone based source.
            $id = $clone_source_id;
        }
        else{
            # find MGI name: the part in front of the "-NNN" suffix
            my ($mgi_bit) = split(/-\d\d\d/,$name);
            if(defined($name_to_mgi_number{$mgi_bit})){
                $prim_acc = $name_to_mgi_number{$mgi_bit};
            }
            if(defined($name_to_mgi_desc{$mgi_bit})){
                $desc = $name_to_mgi_desc{$mgi_bit};
            }
            else{
                print "$mgi_bit has no description\n" if($verbose);
            }
        }

        my $xref_id = $self->add_xref($prim_acc, "" , $name , $desc, $id, $species_id);
        $xref_count++;
        $self->add_direct_xref($xref_id, $stable_id, "Transcript", "");
    }

    print "$xref_count direct xrefs succesfully parsed\n" if($verbose);

    #Finally add the synonyms:-
    my $synonym_sql = (<<SYNO);
SELECT x2.xref_id, s.synonym
FROM synonym s
INNER JOIN xref x1 ON x1.xref_id = s.xref_id
INNER JOIN xref x2 ON x2.accession = x1.accession
INNER JOIN source s1 ON s1.source_id = x1.source_id
INNER JOIN source s2 ON s2.source_id = x2.source_id
WHERE x2.xref_id != x1.xref_id
AND s2.name = "MGI_curated_transcript"
AND s1.name = "MGI"
AND s1.priority_description = "descriptions"
SYNO

    $sth = $self->dbi()->prepare($synonym_sql);
    $sth->execute() or Carp::croak( $self->dbi()->errstr() );
    while ( my ($target_xref_id, $synonym) = $sth->fetchrow_array() ) {
        $self->add_synonym($target_xref_id, $synonym);
    }
    $sth->finish;

    return 0;
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment