From f32dd2657f5bff969bcc33fc30f9f1113a3fb155 Mon Sep 17 00:00:00 2001 From: Andrew Yates <ayates@ebi.ac.uk> Date: Thu, 1 Nov 2012 16:12:48 +0000 Subject: [PATCH] [ENSCORESW-301]. Adding INSERT IGNORE to a lot of statements as EFO can repeat terms and subsets from other ontologies. --- .../ontology/scripts/load_OBO_file.pl | 75 ++++++++++--------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/misc-scripts/ontology/scripts/load_OBO_file.pl b/misc-scripts/ontology/scripts/load_OBO_file.pl index 840fb91c10..4bef63d44b 100755 --- a/misc-scripts/ontology/scripts/load_OBO_file.pl +++ b/misc-scripts/ontology/scripts/load_OBO_file.pl @@ -110,11 +110,12 @@ sub write_subset { $dbh->do("LOCK TABLE subset WRITE"); - my $statement = "INSERT INTO subset " . "(name, definition) " . "VALUES (?,?)"; + my $statement = q{INSERT IGNORE INTO subset (name, definition) VALUES (?,?)}; + my $lookup_sql = 'select subset_id from subset where name =?'; my $sth = $dbh->prepare($statement); + my $lookup_sth = $dbh->prepare($lookup_sql); - my $id; my $count = 0; local $SIG{ALRM} = sub { @@ -135,18 +136,23 @@ sub write_subset { $sth->bind_param(2, $subset->{'definition'}, SQL_VARCHAR); $sth->execute(); - - if (!defined($id)) { - $id = $dbh->last_insert_id(undef, undef, 'subset', 'subset_id'); - } - else { - ++$id; - } + + my $id = $dbh->last_insert_id(undef, undef, 'subset', 'subset_id'); + if(! $id) { + $lookup_sth->bind_param(1, $subset->{name}, SQL_VARCHAR); + $lookup_sth->execute(); + ($id) = $lookup_sth->fetchrow_array(); + printf(STDERR "CLASH: SUBSET '%s' already exists in this database. Reusing ID %d\n", $subset->{name}, $id); + } + $subset->{'id'} = $id; ++$count; } alarm(0); + + $lookup_sth->finish(); + $sth->finish(); $dbh->do("OPTIMIZE TABLE term"); $dbh->do("UNLOCK TABLES"); @@ -163,7 +169,7 @@ sub write_term { $dbh->do("LOCK TABLES term WRITE, synonym WRITE"); - my $statement = "INSERT INTO term " . "(ontology_id, subsets, accession, name, definition) " . "VALUES (?,?,?,?,?)"; + my $statement = "INSERT IGNORE INTO term (ontology_id, subsets, accession, name, definition) VALUES (?,?,?,?,?)"; my $syn_stmt = "INSERT INTO synonym (term_id, name) VALUES (?,?)"; @@ -176,7 +182,6 @@ sub write_term { my $syn_sth = $dbh->prepare($syn_stmt); my $existing_term_sth = $dbh->prepare($existing_term_st); - my $id; my $count = 0; my $updated_count = 0; my $syn_count = 0; @@ -191,6 +196,7 @@ sub write_term { my $term = $terms->{$accession}; my $term_subsets; + my $reuse = 0; if (exists($term->{'subsets'})) { $term_subsets = join(',', map { $subsets->{$_}{'name'} } @{$term->{'subsets'}}); @@ -205,23 +211,14 @@ sub write_term { $term->{'id'} = $existing_term_id; } else { - #if not link it to Unknown ontology - $sth->bind_param(1, $unknown_onto_id, SQL_INTEGER); $sth->bind_param(2, $term_subsets, SQL_VARCHAR); $sth->bind_param(3, $term->{'accession'}, SQL_VARCHAR); $sth->bind_param(4, 'UNKNOWN NAME', SQL_VARCHAR); $sth->bind_param(5, 'UNKNOWN DEFINITION', SQL_VARCHAR); - $sth->execute(); - - if (!defined($id)) { - $id = $dbh->last_insert_id(undef, undef, 'term', 'term_id'); - } - else { - ++$id; - } + my $id = $dbh->last_insert_id(undef, undef, 'term', 'term_id'); $term->{'id'} = $id; ++$count; } @@ -255,25 +252,33 @@ sub write_term { $sth->bind_param(5, $term->{'definition'}, SQL_VARCHAR); $sth->execute(); - - if (!defined($id)) { - $id = $dbh->last_insert_id(undef, undef, 'term', 'term_id'); - } - else { - ++$id; + my $id = $dbh->last_insert_id(undef, undef, 'term', 'term_id'); + if(! $id) { + $existing_term_sth->execute($term->{'accession'}); + my ($existing_term_id, $ontology_id) = $existing_term_sth->fetchrow_array; + $id = $existing_term_id; + printf(STDERR "CLASH: TERM '%s' already exists in this database. Reusing ID %d\n", $term->{accession}, $existing_term_id); + $reuse = 1; } $term->{'id'} = $id; ++$count; } - - foreach my $syn (@{$term->{'synonyms'}}) { - $syn_sth->bind_param(1, $id, SQL_INTEGER); - $syn_sth->bind_param(2, $syn, SQL_VARCHAR); - - $syn_sth->execute(); - - ++$syn_count; + + if(@{$term->{synonyms}}) { + if($reuse) { + print STDERR "REUSE: Skipping synonym writing as term already exists in this database\n"; + } + else { + foreach my $syn (@{$term->{'synonyms'}}) { + $syn_sth->bind_param(1, $term->{id}, SQL_INTEGER); + $syn_sth->bind_param(2, $syn, SQL_VARCHAR); + + $syn_sth->execute(); + + ++$syn_count; + } + } } } } ## end foreach my $accession ( sort...) -- GitLab