From 8396a2619aad2bca33a9f0d2093862105ada217b Mon Sep 17 00:00:00 2001 From: Glenn Proctor <gp1@sanger.ac.uk> Date: Fri, 29 Jun 2007 14:05:46 +0000 Subject: [PATCH] Don't project if a GO term with the same accession, but IEA evidence code, exists, as this will lead to duplicates when the projected term has its evidence code changed to IEA after projection --- .../xref_projection/project_display_xrefs.pl | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/misc-scripts/xref_projection/project_display_xrefs.pl b/misc-scripts/xref_projection/project_display_xrefs.pl index 98f2371358..4c28c5907c 100644 --- a/misc-scripts/xref_projection/project_display_xrefs.pl +++ b/misc-scripts/xref_projection/project_display_xrefs.pl @@ -355,6 +355,10 @@ sub go_xref_exists { foreach my $xref (@{$to_go_xrefs}) { +# if ($dbEntry->primary_id() eq "GO:0005515" && $xref->primary_id() eq "GO:0005515") { +# print $xref->dbname() . " " . $xref->primary_id() . " " . join("", @{$xref->get_all_linkage_types()}) . " -> " . $dbEntry->dbname() . " " . $dbEntry->primary_id() . " " . join("", @{$dbEntry->get_all_linkage_types()}) . "\n"; +# } + next if (ref($dbEntry) ne "Bio::EnsEMBL::GoXref" || ref($xref) ne "Bio::EnsEMBL::GoXref"); if ($xref->dbname() eq $dbEntry->dbname() && @@ -363,6 +367,14 @@ sub go_xref_exists { return 1; } + # if a GO term with the same accession, but IEA evidence code, exists, also don't project, as this + # will lead to duplicates when the projected term has its evidence code changed to IEA after projection + if ($xref->primary_id() eq $dbEntry->primary_id()) { + foreach my $evidence_code (@{$xref->get_all_linkage_types()}) { + return 1 if ($evidence_code eq "IEA"); + } + } + } return 0; @@ -384,8 +396,8 @@ sub print_stats { $count = count_rows($to_ga, "SELECT COUNT(*) FROM gene g, xref x WHERE g.display_xref_id=x.xref_id AND g.display_xref_id IS NOT NULL AND (x.info_type != 'PROJECTION' || x.info_type IS NULL)"); printf("Gene names: unprojected %d (%3.1f\%)" , $count, (100 * $count / $total_genes)); - $count = count_rows($to_ga, "SELECT COUNT(*) FROM gene g, xref x WHERE g.display_xref_id=x.xref_id AND x.info_type='PROJECTION'"); - printf(" projected %d (%3.1f\%)" , $count, (100 * $count / $total_genes)); + my $projected = count_rows($to_ga, "SELECT COUNT(*) FROM gene g, xref x WHERE g.display_xref_id=x.xref_id AND x.info_type='PROJECTION'"); + printf(" projected %d (%3.1f\%)" , $projected, (100 * $projected / $total_genes)); $count = count_rows($to_ga, "SELECT COUNT(*) FROM gene g, xref x, external_db e WHERE g.display_xref_id=x.xref_id AND x.external_db_id=e.external_db_id AND e.db_name IN ('RefSeq_dna_predicted', 'RefSeq_peptide_predicted')"); printf(" predicted %d (%3.1f\%)" , $count, (100 * $count / $total_genes)); @@ -393,6 +405,11 @@ sub print_stats { $count = count_rows($to_ga, "SELECT COUNT(*) FROM gene g WHERE display_xref_id IS NOT NULL"); printf(" total genes with names %d (%3.1f\%)\n" , $count, (100 * $count / $total_genes)); + if ($projected > 0) { + my $one2many = count_rows($to_ga, "SELECT COUNT(*) FROM gene g, xref x WHERE g.display_xref_id=x.xref_id AND x.info_type='PROJECTION' AND x.display_label LIKE '%(% of %)%'"); + my $one2one = $projected - $one2many; + printf("Of the %d projected genes, %d (%3.1f\%) are from one-one mappings, %d (%3.1f\%) from one-many mappings\n", $projected, $one2one, (100 * $one2one/$projected), $one2many, (100 * $one2many / $projected)); + } } if ($go_terms) { -- GitLab