From 55dce2ed2b8f4c3d4da552c782baee0ef1349b5a Mon Sep 17 00:00:00 2001 From: Will Spooner <whs@sanger.ac.uk> Date: Thu, 14 Jun 2007 14:49:27 +0000 Subject: [PATCH] Reimplemented the dump_interpro method so that it dumps any xrefs, object_xrefs and go_xrefs that may be dependent on the interpro xrefs --- .../xref_mapping/XrefMapper/BasicMapper.pm | 126 +++++++++++++++--- 1 file changed, 108 insertions(+), 18 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm index b49bca5fd5..2f1c7ee19e 100644 --- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm +++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm @@ -228,6 +228,7 @@ sub build_list_and_map { my @list=(); my $i = 0; + foreach my $method (@{$self->method()}){ my @dna=(); my $q_dna_file = $self->xref->dir."/xref_".$i."_dna.fasta"; @@ -248,7 +249,6 @@ sub build_list_and_map { } $i++; } - $self->run_mapping(\@list); } @@ -376,7 +376,6 @@ sub dump_xref{ } my @method=(); - my @lists =@{$self->get_set_lists()}; my $i=0; @@ -1243,18 +1242,20 @@ sub parse_mappings { } # write relevant xrefs to file - $max_object_xref_id = $self->dump_core_xrefs(\%primary_xref_ids, $max_object_xref_id, $xref_id_offset); - - # dump interpro table as well - $self->dump_interpro(); + $max_object_xref_id + = $self->dump_core_xrefs(\%primary_xref_ids, + $max_object_xref_id, $xref_id_offset); # dump direct xrefs - $self->dump_direct_xrefs($xref_id_offset, $max_object_xref_id); + $max_object_xref_id + = $self->dump_direct_xrefs($xref_id_offset, $max_object_xref_id); - # dump xrefs that don't appear in either the primary_xref or dependent_xref tables + # dump xrefs that don't appear in primary_xref, direct_xref or + # dependent_xref tables (e.g. interpro) $self->dump_orphan_xrefs($xref_id_offset); - + # dump interpro table as well + $self->dump_interpro($xref_id_offset,$max_object_xref_id); } @@ -2050,31 +2051,120 @@ XSQL $xref_sth->finish(); print " Wrote $count direct xrefs\n"; - + return $object_xref_id; } # Dump the interpro table from the xref database sub dump_interpro { - my $self = shift; + my $xref_id_offset = shift; + my $oxref_id_offset = shift; - open (INTERPRO, ">" . $self->core->dir() . "/interpro.txt"); + print "Writing InterPro\n"; + my( $ipro_count, $xref_count, $oxref_count, $goxref_count ) = (0,0,0,0); - my $sth = $self->xref->dbc->prepare("SELECT * FROM interpro"); - $sth->execute(); + open (INTERPRO, ">" . $self->core->dir() . "/interpro.txt"); + open (XREF, ">>" . $self->core->dir() . "/xref.txt"); + open (OBJECT_XREF, ">>" . $self->core->dir() . "/object_xref.txt"); + open (GO_XREF, ">>" . $self->core->dir() . "/go_xref.txt"); + + # Get a mapping of protein domains to ensembl translations for + # interpro dependent xrefs + my $core_sql = "SELECT hit_id, translation_id FROM protein_feature" ; + my $core_sth = $self->core->dbc->prepare($core_sql); + $core_sth->execute(); + my %domain_to_translation = (); + my ($domain, $translation); + $core_sth->bind_columns(\$domain, \$translation); + while ($core_sth->fetch()) { + $domain_to_translation{$domain} ||= []; + push @{$domain_to_translation{$domain}}, $translation; + } - my ($interpro, $pfam); - $sth->bind_columns(\$interpro, \$pfam); - while ($sth->fetch()) { - print INTERPRO $interpro . "\t" . $pfam . "\n"; + # Get a list of interpro data, including dependent xrefs if avail + my $sth = $self->xref->dbc->prepare(" + SELECT ip.interpro, ip.pfam, x2.xref_id, x2.source_id, + x2.accession, x2.version, x2.label, x2.description, + dx.linkage_annotation + FROM interpro ip, xref x + LEFT JOIN dependent_xref dx ON x.xref_id=dx.master_xref_id + LEFT JOIN xref x2 ON dx.dependent_xref_id=x2.xref_id + WHERE ip.interpro = x.accession"); + my $rv = $sth->execute(); + my %interpro_cache; + my %xref_cache; + my %oxref_cache; + my %goxref_cache; + while( my $row = $sth->fetchrow_arrayref() ){ + my ( $interpro, $pfam, $dx_xref_id, $dx_source_id, $dx_accession, + $dx_version, $dx_label, $dx_description, $go_linkage ) = @$row; + unless( $interpro_cache{$interpro.$pfam} ){ + # We have a fresh interpro. + # Note; interpro xrefs themselves are handled by dump_orphan_xrefs + print INTERPRO $interpro . "\t" . $pfam . "\n"; + $interpro_cache{$interpro.$pfam} ++; + $ipro_count++; + } + if( $dx_accession ){ + # We have a dependent xref for this interpro... + my $xref_id; + unless( $xref_id = $xref_cache{$dx_accession} ){ + $xref_id = $dx_xref_id + $xref_id_offset; + $xref_cache{$dx_accession} = $xref_id; + printf XREF ("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + $xref_id, + $source_to_external_db{$dx_source_id}, + $dx_accession, + $dx_label, + $dx_version, + $dx_description, + 'DEPENDENT', + "Generated via $interpro"); + $xref_count++; + } + foreach my $ensembl_id( @{$domain_to_translation{$pfam}||[]} ){ + #...And the interpro domain maps to a translation + my $oxref_id; + unless( $oxref_id = $oxref_cache{$dx_accession.$ensembl_id} ){ + $oxref_id = $oxref_count + 1 + $oxref_id_offset; + $oxref_cache{$dx_accession.$ensembl_id} = $oxref_id; + printf OBJECT_XREF ( "%s\t%s\t%s\t%s\n", + $oxref_id, + $ensembl_id, + 'Translation', + $xref_id ); + $oxref_count ++; + } + if( $go_linkage ){ + #...And we have linkage data, indicating a GO sref + unless( $goxref_cache{$oxref_id.$go_linkage} ){ + $goxref_cache{$oxref_id.$go_linkage} ++; + printf GO_XREF ( "%s\t%s\n", + $oxref_id, + $go_linkage ); + $goxref_count ++; + } + } + } + } } $sth->finish(); close (INTERPRO); + close (XREF); + close (OBJECT_XREF); + close (GO_XREF); + print(" Wrote $ipro_count interpro table entries\n"); + print(" Wrote $xref_count interpro-dependent xrefs \n"); + print(" including $oxref_count object xrefs, \n"); + print(" and $goxref_count go xrefs\n"); + + return $oxref_id_offset + $oxref_count; } + sub build_stable_id_to_internal_id_hash { my ($self) = @_; -- GitLab