new parsers

95fde014 · Ian Longden · 4bcc8e84 · 95fde014 · 95fde014 · 95fde014
Commit 95fde014 authored 16 years ago by Ian Longden
--- a/misc-scripts/xref_mapping/XrefParser/MGI_CCDS_Parser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/MGI_CCDS_Parser.pm
+package XrefParser::MGI_CCDS_Parser;
+
+use strict;
+
+use DBI;
+
+use base qw( XrefParser::BaseParser );
+
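+# Download a CCDS table and attach MGI xrefs to the CCDS xrefs already stored
+# in the xref database (matched via the CCDS label and the MGI gene symbol).
+# All assumed to be linked to genes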
+
+
+# Stand-alone entry point: this branch is taken only when the file is run
+# directly, i.e. when caller() has nothing to report.
+# NOTE(review): no sub called run() is visible in this package (only
+# run_script), and the usage text lists three arguments while exactly one is
+# accepted -- confirm whether this guard is still expected to work.
+unless (defined(caller())) {
+
+  my $arg_count = scalar(@ARGV);
+  if ($arg_count != 1) {
+    print STDERR "\nUsage: MGI_CCDS_Parser.pm file <source_id> <species_id>\n\n";
+    exit(1);
+  }
+
+  run(@ARGV);
+}
+
+
+# Link MGI xrefs to CCDS xrefs that are already in the xref database, using a
+# CCDS table downloaded over HTTP.
+#
+# Arguments:
+#   $file       - argument string; the download URL is read from a
+#                 "wget=>URL," fragment inside it
+#   $source_id  - source id passed to add_to_xrefs for every xref added
+#   $species_id - species id passed to add_to_xrefs for every xref added
+#   $verbose    - when true, print a one-line summary of the counts
+#
+# Returns 0 on success and 1 if no download URL or no MGI source is found.
+# Dies with the HTTP status line if the download fails.
+sub run_script {
+
+  my ($self, $file, $source_id, $species_id, $verbose) = @_;
+
+  my $wget = "";
+  if (defined($file) and $file =~ /wget[=][>](\S+?)[,]/) {
+    $wget = $1;
+  }
+  if ($wget eq "") {
+    print STDERR "ERROR: No wget=> URL found in the parser arguments\n";
+    return 1;    # 1 is an error
+  }
+
+  my $dbi = $self->dbi();
+
+  #
+  # Collect the ids of every source named MGI.
+  #
+  my $sth = $dbi->prepare('select source_id, priority_description from source where name like "MGI"');
+  $sth->execute();
+  my @mgi_source_ids;
+  while (my ($mgi_source_id) = $sth->fetchrow_array()) {
+    push @mgi_source_ids, $mgi_source_id;
+  }
+  $sth->finish;
+
+  # An empty list would otherwise produce the invalid SQL "in ()".
+  if (!@mgi_source_ids) {
+    print STDERR "ERROR: No MGI source found in the source table\n";
+    return 1;    # 1 is an error
+  }
+
+  #
+  # Index the MGI xrefs that have a description: label -> accession, and
+  # accession -> label / version / description.
+  #
+  my (%label, %version, %description, %accession);
+
+  my $placeholders = join(", ", ("?") x scalar(@mgi_source_ids));
+  $sth = $dbi->prepare("select accession, label, version,  description from xref where source_id in ($placeholders)");
+  $sth->execute(@mgi_source_ids);
+  while (my ($acc, $lab, $ver, $desc) = $sth->fetchrow_array()) {
+    next if (!defined($desc));
+    $accession{$lab}   = $acc;
+    $label{$acc}       = $lab;
+    $version{$acc}     = $ver;
+    $description{$acc} = $desc;
+  }
+  $sth->finish;
+
+  #
+  # Get master xref ids via the ccds label.
+  #
+  my %ccds_label_to_xref_id;
+  $sth = $dbi->prepare('select x.label, x.xref_id from xref x, source s where x.source_id = s.source_id and s.name ="CCDS"');
+  $sth->execute();
+  while (my ($ccds_label, $xref_id) = $sth->fetchrow_array()) {
+    $ccds_label_to_xref_id{$ccds_label} = $xref_id;
+  }
+  $sth->finish;
+
+  #
+  # Download the CCDS table.
+  #
+  # This file does not load LWP::UserAgent itself; require is a no-op when
+  # the module has already been loaded elsewhere.
+  require LWP::UserAgent;
+  my $ua = LWP::UserAgent->new();
+  $ua->timeout(10);
+  $ua->env_proxy();
+
+  my $response = $ua->get($wget);
+  if (!$response->is_success()) {
+    die $response->status_line;
+  }
+
+  my $count        = 0;
+  my $ccds_missing = 0;
+  my $mgi_missing  = 0;
+
+  #
+  # Tab separated columns, e.g.
+  ##chromosome	g_accession	gene	gene_id	ccds_id	ccds_status	cds_strand	cds_from	cds_to	cds_locations	match_type
+  #1	NC_000067.5	Xkr4	497097	CCDS14803.1	Public	-	3206102	3661428	[3206102-3207048, 3411782-3411981, 3660632-3661428]	Identical
+  #
+  foreach my $line (split(/\n/, $response->content)) {
+
+    # The header line and blank lines are not records, so they must not be
+    # counted as unresolved CCDS entries.
+    next if ($line =~ /^#/ or $line !~ /\S/);
+
+    # Only the gene symbol (3rd column) and the CCDS id (5th column) are used.
+    my (undef, undef, $gene_name, undef, $ccds) = split(/\t/, $line);
+
+    if (!defined($ccds) or !defined($ccds_label_to_xref_id{$ccds})) {
+      $ccds_missing++;
+      next;
+    }
+
+    my $acc = $accession{$gene_name};
+    if (defined($acc) and defined($label{$acc})) {
+      $self->add_to_xrefs($ccds_label_to_xref_id{$ccds}, $acc, $version{$acc}, $label{$acc}, $description{$acc}, "", $source_id, $species_id);
+      $count++;
+    }
+    else {
+      $mgi_missing++;
+    }
+  }
+
+  print "$ccds_missing ccds not resolved, $mgi_missing mgi not found. Added $count MGI xrefs via CCDS\n" if($verbose);
+
+  # Return an explicit success code rather than the value of the print above.
+  return 0;
+}
+
+1;
+
--- a/misc-scripts/xref_mapping/XrefParser/MGI_Desc_Parser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/MGI_Desc_Parser.pm
+package XrefParser::MGI_Desc_Parser;
+
+use strict;
+use File::Basename;
+
+use base qw( XrefParser::BaseParser );
+
+use strict;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+
+
+#my $dbi2;
+
+# Stand-alone entry point: this branch is taken only when the file is run
+# directly, i.e. when caller() has nothing to report.
+# NOTE(review): the usage text lists three arguments while exactly one is
+# accepted, and run() calls methods on an invocant that does not appear to be
+# supplied on this path -- confirm whether this guard is still expected to
+# work.
+unless (defined(caller())) {
+
+  my $arg_count = scalar(@ARGV);
+  if ($arg_count != 1) {
+    print STDERR "\nUsage: MGI_Desc_Parser.pm file <source_id> <species_id>\n\n";
+    exit(1);
+  }
+
+  run(@ARGV);
+}
+
+# Load MGI description xrefs, then their synonyms, from two flat files.
+#
+# Arguments (after the optional invocant):
+#   $source_id    - source id for the new xrefs; looked up from the file name
+#                   when undefined
+#   $species_id   - species id for the new xrefs; looked up from the file
+#                   name when undefined
+#   $files        - array reference: [0] is the description file, [1] is the
+#                   synonym file
+#   $release_file - accepted but not used here
+#   $verbose      - when true, print progress counts
+#
+# Returns 0 on success and 1 if either file cannot be opened.
+sub run {
+
+  # Take the invocant only when called from inside another sub. Declared
+  # separately because "my $x = ... if ..." has undefined behaviour in Perl.
+  my $self;
+  $self = shift if (defined(caller(1)));
+
+  my $source_id    = shift;
+  my $species_id   = shift;
+  my $files        = shift;
+  my $release_file = shift;
+  my $verbose      = shift;
+
+  my ($file, $syn_file) = @{$files};
+
+  if(!defined($source_id)){
+    $source_id = XrefParser::BaseParser->get_source_id_for_filename($file);
+  }
+  if(!defined($species_id)){
+    $species_id = XrefParser::BaseParser->get_species_id_for_filename($file);
+  }
+
+  #
+  # First pass: one xref per description line.
+  #
+  my $mgi_io = $self->get_filehandle($file);
+
+  if ( !defined $mgi_io ) {
+    print STDERR "ERROR: Could not open $file\n";
+    return 1;    # 1 is an error
+  }
+
+  my $xref_count = 0;
+  my %acc_to_xref;
+
+  # Only lines starting with " MGI:" are read. After splitting on whitespace
+  # the fields used are the accession (2nd) and the label (5th); everything
+  # from the 7th field on is re-joined as the description. The 1st field is
+  # the empty string before the leading space.
+  # NOTE(review): the original header comment here described a tab-delimited
+  # marker/sequence report whose columns do not match this split -- confirm
+  # the real file layout.
+  while ( defined( my $line = $mgi_io->getline() ) ) {
+    chomp($line);
+    next if ($line !~ /^ MGI:/);
+
+    my (undef, $acc, undef, undef, $label, undef, @part_desc) = split(/\s+/, $line);
+    my $desc = join(" ", @part_desc);
+
+    $acc_to_xref{$acc} = $self->add_xref($acc, "", $label, $desc, $source_id, $species_id);
+    if($verbose and $desc eq ""){
+      print "$acc has no description\n";
+    }
+    $xref_count++;
+  }
+
+  $mgi_io->close();
+
+  print $xref_count." MGI Description Xrefs added\n" if($verbose);
+
+  #
+  # Now process the synonyms
+  #
+  my $syn_io = $self->get_filehandle($syn_file);
+
+  if ( !defined $syn_io ) {
+    # Report the synonym file, not the description file opened earlier.
+    print STDERR "ERROR: Could not open $syn_file\n";
+    return 1;    # 1 is an error
+  }
+
+  my $syn_count = 0;
+  while ( defined( my $line = $syn_io->getline() ) ) {
+    chomp($line);
+    next if ($line !~ /^ MGI:/);
+
+    # Accession is the 2nd field; everything from the 6th field on is
+    # re-joined as the synonym.
+    my (undef, $acc, undef, undef, undef, @part_synonym) = split(/\s+/, $line);
+    my $syn = join(" ", @part_synonym);
+
+    # Accessions without an xref from the first pass are skipped silently
+    # (lots of withdrawn entries).
+    if(defined($acc_to_xref{$acc})){
+      $self->add_synonym($acc_to_xref{$acc}, $syn);
+      $syn_count++;
+    }
+  }
+  $syn_io->close();
+  print $syn_count." synonyms added\n" if($verbose);
+
+  return 0; #successful
+}
+	
+
+
+
+1;
+
--- a/misc-scripts/xref_mapping/XrefParser/MGI_Vega_Parser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/MGI_Vega_Parser.pm
+package XrefParser::MGI_Vega_Parser;
+
+use strict;
+use File::Basename;
+
+use base qw( XrefParser::BaseParser );
+
+use strict;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+
+
+#my $dbi2;
+
+# Stand-alone entry point: this branch is taken only when the file is run
+# directly, i.e. when caller() has nothing to report.
+# NOTE(review): no sub called run() is visible in this package (only
+# run_script), and the usage text lists three arguments while exactly one is
+# accepted -- confirm whether this guard is still expected to work.
+unless (defined(caller())) {
+
+  my $arg_count = scalar(@ARGV);
+  if ($arg_count != 1) {
+    print STDERR "\nUsage: MGI_Vega_Parser.pm file <source_id> <species_id>\n\n";
+    exit(1);
+  }
+
+  run(@ARGV);
+}
+
+# Add MGI xrefs as direct gene xrefs, using the OTT transcript ids shared by
+# a core database and a Vega database.
+#
+# Arguments (after the optional invocant):
+#   $file       - "type:args" string; args may contain "key=>value," pairs
+#                 for chost, cport, cdbname, cpass (core database) and
+#                 vhost, vport, vdbname, vpass (Vega database)
+#   $source_id  - source id for the xrefs that are added
+#   $species_id - species id for the xrefs that are added
+#   $verbose    - when true, print progress messages
+#
+# Returns 0 on success and 1 if either database handle cannot be obtained.
+# Croaks with the database error string if a query fails to execute.
+sub run_script {
+  # Take the invocant only when called from inside another sub. Declared
+  # separately because "my $x = ... if ..." has undefined behaviour in Perl.
+  my $self;
+  $self = shift if (defined(caller(1)));
+
+  my $file = shift;
+  my $source_id = shift;
+  my $species_id = shift;
+  my $verbose = shift;
+
+  # This file never imports Carp, so load it here and call croak by its
+  # full name.
+  require Carp;
+
+  # Only the part after the first colon carries options.
+  my (undef, $my_args) = split(/:/, $file);
+  $my_args = "" if (!defined($my_args));
+
+  my $cuser = "ensro";
+  my $vuser = "ensro";
+
+  # Connection defaults; each may be overridden by "key=>value," in $my_args.
+  my %db_arg = (
+    chost   => "ens-staging",
+    cport   => "3306",
+    cdbname => "",
+    cpass   => undef,
+    vhost   => "ens-staging",
+    vport   => "3306",
+    vdbname => "mus_musculus_vega_51_37d",
+    vpass   => undef,
+  );
+  foreach my $key (keys %db_arg) {
+    if ($my_args =~ /$key[=][>](\S+?)[,]/) {
+      $db_arg{$key} = $1;
+    }
+  }
+
+  my $xref_count = 0;
+
+  # NOTE(review): these two ids are looked up but never used below; the
+  # calls are kept in case the lookups matter to the caller -- confirm.
+  my $clone_source_id =
+    $self->get_source_id_for_source_name('Clone_based_vega_transcript');
+  my $curated_source_id =
+    $self->get_source_id_for_source_name('MGI_curated_transcript');
+
+  #
+  # Need to get labels and descriptions for the primary accessions.
+  #
+  my %mgi_to_label;
+  my %mgi_to_desc;
+  my %mgi_syn;
+
+  my $sth = $self->dbi()->prepare("SELECT x.accession, x.label, x.description from xref x, source s where x.source_id = s.source_id and s.name like 'MGI' and s.priority_description like 'descriptions'");
+
+  $sth->execute() or Carp::croak( $self->dbi()->errstr() );
+  while ( my ($acc, $lab, $description) = $sth->fetchrow_array() ) {
+    $mgi_to_label{$acc} = $lab;
+    $mgi_to_desc{$acc}  = $description;
+  }
+  $sth->finish;
+
+  #
+  # Also load the synonyms (synonym -> accession).
+  #
+  $sth = $self->dbi()->prepare("SELECT sy.synonym, x.accession from xref x, source s, synonym sy where sy.xref_id = x.xref_id and x.source_id = s.source_id and s.name like 'MGI' and s.priority_description like 'descriptions'");
+
+  $sth->execute() or Carp::croak( $self->dbi()->errstr() );
+  while ( my ($synonym, $acc) = $sth->fetchrow_array() ) {
+    $mgi_syn{$synonym} = $acc;
+  }
+  $sth->finish;
+
+  #
+  # Core database: OTT transcript accession -> transcript stable id.
+  #
+  my $core_sql = 'select tsi.stable_id, x.dbprimary_acc from transcript_stable_id tsi, transcript t, object_xref ox, xref x, external_db e where tsi.transcript_id = t.transcript_id and ox.ensembl_id = t.transcript_id and ox.ensembl_object_type = "Transcript" and ox.xref_id = x.xref_id and x.external_db_id = e.external_db_id and e.db_name like "%OTTT"';
+
+  my %ott_to_enst;
+
+  my $dbi2 = $self->dbi2($db_arg{chost}, $db_arg{cport}, $cuser, $db_arg{cdbname}, $db_arg{cpass});
+  if(!defined($dbi2)){
+    return 1;
+  }
+
+  $sth = $dbi2->prepare($core_sql);
+  $sth->execute() or Carp::croak( $dbi2->errstr() );
+  while ( my ($enst, $ott) = $sth->fetchrow_array() ) {
+    $ott_to_enst{$ott} = $enst;
+  }
+  $sth->finish;
+
+  #
+  # get the enst->ensg mappings.
+  #
+  my %enst_to_ensg;
+  $sth = $dbi2->prepare("select gsi.stable_id, tsi.stable_id from transcript t, gene_stable_id gsi, transcript_stable_id tsi where tsi.transcript_id = t.transcript_id and t.gene_id = gsi.gene_id");
+  $sth->execute() or Carp::croak( $dbi2->errstr() );
+  while ( my ($ensg, $enst) = $sth->fetchrow_array() ) {
+    $enst_to_ensg{$enst} = $ensg;
+  }
+  $sth->finish;
+
+  #
+  # Get the ott -> mgi mappings
+  #
+  my $vega_sql = (<<VSQL);
+  SELECT DISTINCT(tsi.stable_id) , x.dbprimary_acc, x.display_label
+    FROM        transcript_stable_id tsi
+     INNER JOIN transcript t              ON tsi.transcript_id = t.transcript_id
+     INNER JOIN gene g                    ON g.gene_id = t.gene_id
+     INNER JOIN object_xref ox            ON ox.ensembl_id = g.gene_id
+     INNER JOIN xref x                    ON x.xref_id = ox.xref_id
+     INNER JOIN external_db e             ON e.external_db_id = x.external_db_id
+     WHERE ox.ensembl_object_type = "Gene"
+       AND e.db_name like "MGI"
+VSQL
+
+  my $dbi3 = $self->dbi2($db_arg{vhost}, $db_arg{vport}, $vuser, $db_arg{vdbname}, $db_arg{vpass});
+  if(!defined($dbi3)){
+    return 1;
+  }
+
+  my %seen;
+  $sth = $dbi3->prepare($vega_sql);
+  $sth->execute() or Carp::croak( $dbi3->errstr() );
+  while ( my ($ott, $vega_acc, $vega_label) = $sth->fetchrow_array() ) {
+    # $ott OTTMUST...,   $vega_acc MGI:123456,   $vega_label Asx15 etc
+
+    # Only transcripts that are also present in the core database are used.
+    my $tran_stable_id = $ott_to_enst{$ott};
+    next if (!defined($tran_stable_id));
+
+    my $prim_acc = $vega_acc;
+    my $desc     = "";
+    my $label    = "";
+
+    if(defined($mgi_to_desc{$vega_acc})){
+      $desc  = $mgi_to_desc{$vega_acc};
+      $label = $mgi_to_label{$vega_acc};
+    }
+    elsif( defined( $mgi_syn{$vega_label} ) and defined( $mgi_to_desc{$mgi_syn{$vega_label}})){ # synonym
+      # The Vega label is a synonym of another MGI accession: switch to that
+      # accession and take description and label from it, so that the label
+      # matches the accession being stored.
+      $prim_acc = $mgi_syn{$vega_label};
+      $desc     = $mgi_to_desc{$prim_acc};
+      $label    = $mgi_to_label{$prim_acc};
+    }
+    else{
+      print "VEGA: $vega_acc [".$vega_label."} has no description\n" if($verbose);
+    }
+
+    my $xref_id = $self->add_xref($prim_acc, "" , $label , $desc, $source_id, $species_id);
+    my $ensg = $enst_to_ensg{$tran_stable_id};
+
+    # Several transcripts of one gene give the same xref/gene pair; add the
+    # direct xref only once.
+    if(!defined($seen{$xref_id.$ensg})){
+      $xref_count++;
+      $self->add_direct_xref($xref_id, $ensg , "Gene", "");
+      $seen{$xref_id.$ensg} = 1;
+    }
+  }
+
+  print "$xref_count direct xrefs succesfully parsed\n" if($verbose);
+
+  # Synonyms are not added here: that is done in the mapper.
+
+  return 0;
+}
+
+
+
+
+
+1;
+
--- a/misc-scripts/xref_mapping/XrefParser/MGI_curated_transcriptParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/MGI_curated_transcriptParser.pm
+package XrefParser::MGI_curated_transcriptParser;
+
+use strict;
+use File::Basename;
+
+use base qw( XrefParser::BaseParser );
+
+use strict;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+
+
+#my $dbi2;
+
+# Stand-alone entry point: this branch is taken only when the file is run
+# directly, i.e. when caller() has nothing to report.
+# NOTE(review): no sub called run() is visible in this package (only
+# run_script), and the usage text lists three arguments while exactly one is
+# accepted -- confirm whether this guard is still expected to work.
+unless (defined(caller())) {
+
+  my $arg_count = scalar(@ARGV);
+  if ($arg_count != 1) {
+    print STDERR "\nUsage: MGI_curated_transcriptParser.pm file <source_id> <species_id>\n\n";
+    exit(1);
+  }
+
+  run(@ARGV);
+}
+
+# Add direct transcript xrefs named after Vega transcript labels, then copy
+# MGI synonyms onto the MGI_curated_transcript xrefs.
+#
+# Arguments (after the optional invocant):
+#   $file       - "type:args" string; args may contain "key=>value," pairs
+#                 for chost, cport, cdbname, cpass (core database) and
+#                 vhost, vport, vdbname, vpass (Vega database)
+#   $source_id  - accepted but not used; the source id is chosen per xref
+#   $species_id - species id for the xrefs that are added
+#   $verbose    - when true, print progress messages
+#
+# Returns 0 on success and 1 if either database handle cannot be obtained.
+# Croaks with the database error string if a query fails to execute.
+sub run_script {
+  # Take the invocant only when called from inside another sub. Declared
+  # separately because "my $x = ... if ..." has undefined behaviour in Perl.
+  my $self;
+  $self = shift if (defined(caller(1)));
+
+  my $file = shift;
+  my $source_id = shift;
+  my $species_id = shift;
+  my $verbose = shift;
+
+  # This file never imports Carp, so load it here and call croak by its
+  # full name.
+  require Carp;
+
+  # Only the part after the first colon carries options.
+  my (undef, $my_args) = split(/:/, $file);
+  $my_args = "" if (!defined($my_args));
+
+  my $cuser = "ensro";
+  my $vuser = "ensro";
+
+  # Connection defaults; each may be overridden by "key=>value," in $my_args.
+  my %db_arg = (
+    chost   => "ens-staging",
+    cport   => "3306",
+    cdbname => "",
+    cpass   => undef,
+    vhost   => "ens-staging",
+    vport   => "3306",
+    vdbname => "mus_musculus_vega_51_37d",
+    vpass   => undef,
+  );
+  foreach my $key (keys %db_arg) {
+    if ($my_args =~ /$key[=][>](\S+?)[,]/) {
+      $db_arg{$key} = $1;
+    }
+  }
+
+  my $xref_count = 0;
+
+  my $clone_source_id =
+    $self->get_source_id_for_source_name('Clone_based_vega_transcript');
+  my $curated_source_id =
+    $self->get_source_id_for_source_name('MGI_curated_transcript');
+
+  my %name_to_mgi_number = %{ $self->get_label_to_acc( "MGI", $species_id ) };
+  my %name_to_mgi_desc = %{ $self->get_label_to_desc( "MGI", $species_id, "descriptions") };
+
+  my $core_sql = 'select tsi.stable_id, x.dbprimary_acc from transcript_stable_id tsi, transcript t, object_xref ox, xref x, external_db e where tsi.transcript_id = t.transcript_id and ox.ensembl_id = t.transcript_id and ox.ensembl_object_type = "Transcript" and ox.xref_id = x.xref_id and x.external_db_id = e.external_db_id and e.db_name like "%OTTT"';
+
+  my $vega_sql = 'select x.dbprimary_acc, x.display_label from xref x, external_db e where  x.external_db_id = e.external_db_id and e.db_name like "Vega_transcript" and display_label not like "OTT%"';
+
+  #
+  # Core database: OTT transcript accession -> transcript stable id.
+  #
+  my %ott_to_enst;
+
+  my $dbi2 = $self->dbi2($db_arg{chost}, $db_arg{cport}, $cuser, $db_arg{cdbname}, $db_arg{cpass});
+  if(!defined($dbi2)){
+    return 1;
+  }
+
+  my $sth = $dbi2->prepare($core_sql);
+  $sth->execute() or Carp::croak( $dbi2->errstr() );
+  while ( my ($enst, $ott) = $sth->fetchrow_array() ) {
+    $ott_to_enst{$ott} = $enst;
+  }
+  $sth->finish;
+
+  #
+  # Vega database: one xref per named transcript that is also in core.
+  #
+  my $dbi3 = $self->dbi2($db_arg{vhost}, $db_arg{vport}, $vuser, $db_arg{vdbname}, $db_arg{vpass});
+  if(!defined($dbi3)){
+    return 1;
+  }
+
+  $sth = $dbi3->prepare($vega_sql);
+  $sth->execute() or Carp::croak( $dbi3->errstr() );
+  while ( my ($ott, $vega_name) = $sth->fetchrow_array() ) {
+    # $ott OTTMUST...,   $vega_name Mrpl15-001 etc
+
+    my $stable_id = $ott_to_enst{$ott};
+    next if (!defined($stable_id));
+
+    my $desc = "";
+    my $id   = $curated_source_id;
+    my $name = $vega_name;
+    $name =~ s/^WU://;
+    my $prim_acc = $name;
+
+    if($name =~ /[.]/){
+      # Names containing a dot go to the Clone_based_vega_transcript source
+      # and keep the name itself as accession.
+      $id = $clone_source_id;
+    }
+    else{
+      # find MGI name: the part of the label before the "-NNN" suffix
+      my ($mgi_bit) = split(/-\d\d\d/, $name);
+      if(defined($name_to_mgi_number{$mgi_bit})){
+        $prim_acc = $name_to_mgi_number{$mgi_bit};
+      }
+      if(defined($name_to_mgi_desc{$mgi_bit})){
+        $desc = $name_to_mgi_desc{$mgi_bit};
+      }
+      else{
+        print "$mgi_bit has no description\n" if($verbose);
+      }
+    }
+
+    my $xref_id = $self->add_xref($prim_acc, "" , $name , $desc, $id, $species_id);
+    $xref_count++;
+
+    $self->add_direct_xref($xref_id, $stable_id, "Transcript", "");
+  }
+
+  print "$xref_count direct xrefs succesfully parsed\n" if($verbose);
+
+  #Finally add the synonyms:-
+  # Each synonym of an MGI "descriptions" xref is copied to the
+  # MGI_curated_transcript xref that has the same accession.
+  my $synonym_sql = (<<SYNO);
+SELECT x2.xref_id, s.synonym
+  FROM synonym s
+    INNER JOIN xref x1 ON  x1.xref_id = s.xref_id
+    INNER JOIN xref x2 ON  x2.accession = x1.accession
+    INNER JOIN source s1 ON s1.source_id = x1.source_id
+    INNER JOIN source s2 ON s2.source_id = x2.source_id
+      WHERE x2.xref_id != x1.xref_id
+        AND s2.name = "MGI_curated_transcript"
+        AND s1.name = "MGI"
+        AND s1.priority_description = "descriptions"
+SYNO
+
+  $sth = $self->dbi()->prepare($synonym_sql);
+
+  $sth->execute() or Carp::croak( $self->dbi()->errstr() );
+  while ( my ($target_xref_id, $synonym) = $sth->fetchrow_array() ) {
+    $self->add_synonym($target_xref_id, $synonym);
+  }
+  $sth->finish;
+
+  return 0;
+}
+
+
+
+
+
+1;
+