diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm index 86f69ebaed1d0116861201996ca482a57602d468..54bedd9eb6935bb85bded3792b12dda6c089e022 100644 --- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm +++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm @@ -8,5367 +8,374 @@ use DBI; use File::Basename; use IPC::Open3; -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::Translation; - -use XrefMapper::db; -use XrefMapper::CoordinateMapper; - -use vars qw(@ISA @EXPORT_OK); - -@EXPORT_OK = ( '%stable_id_to_internal_id', '%xref_to_source', - '%xref_accessions', '%source_to_external_db' ); - -use vars ( '%stable_id_to_internal_id', '%xref_to_source', - '%xref_accessions', '%source_to_external_db' ); - - -=head1 NAME - -XrefMapper::BasicMapper - -=head1 DESCIPTION - -This is the basic mapper routine. It will create the necessary fasta files for -both the xref and ensembl sequences. These will then be matched using exonerate -and the results written to another file. By creating a <species>.pm file and -inheriting from this base class different matching routines, parameters, data -sets etc can be set. - -=head1 CONTACT - -Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk - - -=cut - -# hold alist of source that have more than 1 priority -# and hence can come from several sources. -my %priority_source_id; # {1090} = 1 -my %priority_source_name; # {HGNC} = 2 -my %priority_source_id_to_name; # {1091} = HGNC -my %priority_source_id_to_desc; # {1090} = havana - -#be able to get from xref to the accession or source for those needed -my %priority_xref_acc; # {12} = "567"; -my %priority_xref_source_id;# {12} = 1090; - -# hold a hash of the values with the highest priority -my %priority_xref; # {HGNC:567} = 12 -my %priority_xref_extra_bit; # {12} = "\tDIRECT\tExternally assigned relationship between ENST123 and Q12345\n"; -my %priority_xref_state; # {HGNC:567} = "primary" or "dependent" or "direct"; -my %priority_xref_priority; # {HGNC:567} = 1; -my %priority_object_xref; # {HGNC:567} = {Gene:123456}; -my %priority_identity_xref; # {HGNC:567} = as normal print except no object_xref_id at the start -# = "100\t100\t1\t100\t110-210\t100M\t100\t987\n"; -my %priority_identity_temp_xref; - -my %priority_failed; # {1090:12} = "gene|123456|78|35|90|90\n"; - - - - - -my %unmapped_primary_xref; # $unmapped_primary_xref{1234} = " master QZ1234 had no match" - # store list of primary unmapped xrefs - # to use to write secondary unmapped_object data - -my %object_succesfully_mapped; # $object_succesfully_mapped{1235} = 1; - -my %primary_identity; # hold the identity info for primary xrefs . - -# Hashes to hold method-specific thresholds -my %method_query_threshold; -my %method_target_threshold; - -# Various useful variables. -my %translation_to_transcript; -my %transcript_to_translation; -my %genes_to_transcripts; -my %transcript_length; -my %internal_id_to_stable_id; -my %xref_descriptions; -my %xrefs_written; -my %priority_seenit; -my %object_xrefs_written; # Needed in case an xref if matched to one enembl object - # by more than one method. On the display we only want to see it once. -my %failed_xref_mappings; -my %updated_source; -my %XXXxref_id_to_accession; -my %XXXxref_id_to_source_id; -my %xref_source_id_to_name; -my %external_db_release; - -my %go_done; #state wethere this go has already been done. - - -sub create_source_id_to_source_name{ - my($self) = @_; - - my $sql = (<<SQL); - SELECT DISTINCT(source.source_id), source.name - FROM source, xref - WHERE xref.source_id = source.source_id -SQL - - my $sth = $self->xref->dbc->prepare($sql); - $sth->execute(); - my ($source_id, $name, $priority); - $sth->bind_columns(\$source_id,\$name); - while($sth->fetch()){ - $xref_source_id_to_name{$source_id} = $name; - } -} - -=head2 find_priority_sources - - Description: Finds those sources that hae more than one source - Stores the results in the global hashes - %priority_source_id and %priority_source_name - Returntype : none - Exceptions : none - Caller : General - -=cut - -sub find_priority_sources{ - my($self) = @_; - - - -# need to check if this needed for this species. -# NOTE: only store in priority_source_name if same name seen more than once :-) -# i.e. more than one priority source is "USED" - - my $sql = (<<SQL); - SELECT DISTINCT(source.source_id), source.name, source.priority - FROM source, xref - WHERE xref.source_id = source.source_id -SQL - - my $sth = $self->xref->dbc->prepare($sql); - $sth->execute(); - my ($source_id, $name, $priority, $desc); - $sth->bind_columns(\$source_id,\$name, \$priority); - my %more_than_one; - while($sth->fetch()){ -# if($name eq "RefSeq_dna"){ -# next; -# } - if(!defined($more_than_one{$name})){ - $more_than_one{$name} = $priority; - } - else{ - $priority_source_name{$name} = $priority; - } - } - $sth->finish; - - - - - foreach my $name (keys %priority_source_name){ - $sql = 'SELECT source_id, name, priority, priority_description from source where name ="'.$name.'"'; - print "\t".$name." to be processed using prioritys of the sources\n"; - - #for each name get all the sources and then store the xref data needed. - my $sth = $self->xref->dbc->prepare($sql); - $sth->execute(); - $sth->bind_columns(\$source_id,\$name, \$priority, \$desc); - my @source_ids=(); - while($sth->fetch()){ - $priority_source_name{$name} = $priority; - $priority_source_id{$source_id} = $priority; - $priority_source_id_to_name{$source_id} = $name; - $priority_source_id_to_desc{$source_id} = $desc; - push @source_ids, $source_id; - } - $sth->finish; - foreach my $source (@source_ids){ - - my $sql2 = "SELECT xref_id, accession from xref where xref.source_id = $source"; - my $sth2 = $self->xref->dbc->prepare($sql2); - $sth2->execute(); - my ($xref_id, $acc); - $sth2->bind_columns(\$xref_id,\$acc); -# print "\tgetting data for source $source (".$priority_source_id_to_desc{$source}.")\n"; - my $count =0; - while($sth2->fetch){ - $priority_xref_acc{$xref_id} = $acc; - $priority_xref_source_id{$xref_id} = $source; - $count++; - } - if($count){ - print "\t$count xrefs accessed at priority level ".$priority_source_id{$source}." (".$priority_source_id_to_desc{$source} .")\n"; - } - $sth2->finish; - } - } - -} - -=head2 new - - Description: Constructor for BasicMapper. - Returntype : BasicMapper - Exceptions : none - Caller : general - -=cut - -sub new{ - my($class, @args) = @_; - - my $self ={}; - bless $self,$class; - $self->jobcount(0); - return $self; -} - - -=head2 dump_seqs - - Arg[1]: xref object which holds info needed for the dump of xref - - Description: Dumps out the files for the mapping. Xref object should hold - the value of the databases and source to be used. - Returntype : none - Exceptions : will die if species not known or an error occurs while - : trying to write to files. - Caller : general - -=cut - - - -sub dump_seqs{ - - my ($self, $location) = @_; - - $self->dump_xref(); - $self->dump_ensembl($location); - -} - - - -=head2 build_list_and_map - - Arg[1]: xref object which holds info on method and files. - - Description: runs the mapping of the list of files with species methods - Returntype : none - Exceptions : none - Caller : general - -=cut - -sub build_list_and_map { - - my ($self) = @_; - - my @list=(); - - my $i = 0; - - foreach my $method (@{$self->method()}){ - my @dna=(); - my $q_dna_file = $self->xref->dir."/xref_".$i."_dna.fasta"; - if (-e $q_dna_file and -s $q_dna_file) { - push @dna, $method; - push @dna, $q_dna_file; - push @dna, $self->core->dna_file(); - push @list, \@dna; - } - - my @pep=(); - my $q_pep_file = $self->xref->dir."/xref_".$i."_peptide.fasta"; - if (-e $q_pep_file and -s $q_pep_file) { - push @pep, $method; - push @pep, $self->xref->dir."/xref_".$i."_peptide.fasta"; - push @pep, $self->core->protein_file(); - push @list, \@pep; - } - $i++; - } - $self->run_mapping(\@list); - -} - - -=head2 get_species_id_from_species_name - - Arg[1]: species name - - Description: get the species_id from the database for the named database. - Example : my $id = get_species_id_from_species_name('homo_sapiens'); - Returntype : int (species_id) - Exceptions : will die if species does not exist in given xref database. - Caller : general - -=cut - -sub get_species_id_from_species_name{ - my ($self,$species) = @_; - - - my $sql = "select species_id from species where name = '".$species."'"; - my $sth = $self->dbc->prepare($sql); - $sth->execute(); - my @row = $sth->fetchrow_array(); - my $species_id; - if (@row) { - $species_id = $row[0]; - } else { - print STDERR "Couldn't get ID for species ".$species."\n"; - print STDERR "It must be one of :-\n"; - $sql = "select name from species"; - $sth = $self->dbc->prepare($sql); - $sth->execute(); - while(my @row = $sth->fetchrow_array()){ - print STDERR $row[0]."\n"; - } - die("Please try again :-)\n"); - } - $sth->finish(); - - return $species_id; -} - - -=head2 get_set_lists - - Description: specifies the list of databases and source to be used in the - : generation of one or more data sets. - Returntype : list of lists - Example : my @lists =@{$self->get_set_lists()}; - Exceptions : none - Caller : dump_xref - -=cut - -sub get_set_lists{ - my ($self) = @_; - - return [["ExonerateGappedBest1", ["*","*"]]]; - -} - -=head2 get_source_id_from_source_name - - Arg[1]: source name - - Description: get the source_id from the database for the named source. - Example : my $id = get_source_id_from_source_name('RefSeq'); - Returntype : int (source_id) - Exceptions : will die if source does not exist in given xref database. - Caller : general - -=cut - -sub get_source_id_from_source_name{ - my ($self, $source) = @_; - my $source_id; - - my $sql = "select source_id from source where name = '".$source."'"; - my $sth = $self->dbc->prepare($sql); - $sth->execute(); - my @row = $sth->fetchrow_array(); - if (defined $row[0] and $row[0] ne '') { - $source_id = $row[0]; - } else { - print STDERR "Couldn't get ID for source ".$source."\n"; - print STDERR "It must be one of :-\n"; - $sql = "select name from source"; - $sth = $self->dbc->prepare($sql); - $sth->execute(); - while(my @row = $sth->fetchrow_array()){ - print STDERR $row[0]."\n"; - } - die("Please try again :-)\n"); - } - $sth->finish(); - - return $source_id; -} -=head2 get_source_hash_from_source_name - - Arg[1]: source name - - Description: get the source_id from the database for the named source. - Example : my $id = get_source_id_from_source_name('RefSeq'); - Returntype : hash off source_ids to name for given name - Exceptions : will die if source does not exist in given xref database. - Caller : general - -=cut - -sub get_source_hash_from_source_name{ - my ($self, $source) = @_; - my %source_hash; - my $source_id; - - my $sql = "select source_id from source where name = '".$source."'"; - my $sth = $self->dbc->prepare($sql); - $sth->execute(); - $sth->bind_columns(\$source_id); - while($sth->fetch()){ - $source_hash{$source_id} = $source; - } - $sth->finish(); - return \%source_hash; -} - - -=head2 dump_xref - - Arg[1]: xref object which holds info on method and files. - - Description: Dumps the Xref data as fasta file(s) - Returntype : none - Exceptions : none - Caller : dump_seqs - -=cut - -sub dump_xref{ - my ($self) = @_; - - my $xref =$self->xref(); - if(!defined($xref->dir())){ - if(defined($self->dir)){ - $xref->species($self->dir); - } - else{ - $xref->dir("."); - } - } - - my @method=(); - my @lists =@{$self->get_set_lists()}; - - my $i=0; - if(defined($self->dumpcheck())){ - my $skip = 1; - foreach my $list (@lists){ - if(!-e $xref->dir()."/xref_".$i."_dna.fasta"){ - $skip = 0; - } - if(!-e $xref->dir()."/xref_".$i."_peptide.fasta"){ - $skip = 0; - } - $i++; - } - if($skip){ - my $k = 0; - foreach my $list (@lists){ - $method[$k++] = shift @$list; - } - $self->method(\@method); - print "Xref fasta files found and will be used (No new dumping)\n"; - return; - } - } - - print "Dumping Xref fasta files\n"; - $i=0; - foreach my $list (@lists){ - $method[$i] = shift @$list; - my $j = 0; - my @source_id=(); - my @species_id=(); - foreach my $element (@$list){ - while(my $species = shift(@$element)){ - if($species ne "*"){ - $species_id[$j] = get_species_id_from_species_name($xref,$species); - } - else{ - $species_id[$j] = -1; - } - my $source = shift(@$element); - if($source ne "*"){ - $source_id[$j] = get_source_id_from_source_name($xref,$source); - } - else{ - $source_id[$j] = -1; - } - $j++; - } - } - #method data fully defined now - $self->dump_subset($xref,\@species_id,\@source_id,$i); - $i++; - } - - $self->method(\@method); - - return; - -} - -=head2 dump_subset - - Arg[1]: xref object which holds info on files. - Arg[2]: list of species to use. - Arg[3]: list of sources to use. - Arg[4]: index to be used in file creation. - - Description: Dumps the Xref data for one set of species/databases - Returntype : none - Exceptions : none - Caller : dump_xref - -=cut - - -sub dump_subset{ - - my ($self,$xref,$rspecies_id,$rsource_id,$index) = @_; - - # generate or condition list for species and sources - my $final_clause = ""; - my $use_all = 0; - my @or_list; - for (my $j = 0; $j < scalar(@$rspecies_id); $j++){ - my @condition; - if($$rspecies_id[$j] > 0){ - push @condition, "x.species_id=" . $$rspecies_id[$j]; - } - if($$rsource_id[$j] > 0){ - push @condition, "x.source_id=" . $$rsource_id[$j]; - } - - # note if both source and species are * (-1) there's no need for a final clause - - if ( !@condition ) { - $use_all = 1; - last; - } - - push @or_list, join (" AND ", @condition); - - } - - $final_clause = " AND ((" . join(") OR (", @or_list) . "))" unless ($use_all) ; - - for my $sequence_type ('dna', 'peptide') { - - my $filename = $xref->dir() . "/xref_" . $index . "_" . $sequence_type . ".fasta"; - open(XREF_DUMP,">$filename") || die "Could not open $filename"; - - my $sql = "SELECT p.xref_id, p.sequence, x.species_id , x.source_id "; - $sql .= " FROM primary_xref p, xref x "; - $sql .= " WHERE p.xref_id = x.xref_id AND "; - $sql .= " p.sequence_type ='$sequence_type' "; - $sql .= $final_clause; - - if(defined($self->maxdump())){ - $sql .= " LIMIT ".$self->maxdump()." "; - } - - my $sth = $xref->dbc->prepare($sql); - $sth->execute(); - while(my @row = $sth->fetchrow_array()){ - - $row[1] =~ s/(.{60})/$1\n/g; - print XREF_DUMP ">".$row[0]."\n".$row[1]."\n"; - - } - - close(XREF_DUMP); - $sth->finish(); - - } - -} - -=head2 dump_ensembl - - Description: Dumps the ensembl data to a file in fasta format. - Returntype : none - Exceptions : none - Caller : dump_seqs - -=cut - -sub dump_ensembl{ - my ($self, $location) = @_; - - $self->fetch_and_dump_seq($location); - -} - - -=head2 fetch_and_dump_seq - - Description: Dumps the ensembl data to a file in fasta format. - Returntype : none - Exceptions : wil die if the are errors in db connection or file creation. - Caller : dump_ensembl - -=cut - -sub fetch_and_dump_seq{ - my ($self, $location) = @_; - - my $ensembl = $self->core; - my $logic_name = $self->logic_name; - my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $ensembl->dbc); - - # - # store ensembl dna file name and open it - # - if(!defined($ensembl->dir())){ - $ensembl->dir("."); - } - $ensembl->dna_file($ensembl->dir."/".$ensembl->species."_dna.fasta"); - - - # - # store ensembl protein file name and open it - # - $ensembl->protein_file($ensembl->dir."/".$ensembl->species."_protein.fasta"); - - - if(defined($self->dumpcheck()) and -e $ensembl->protein_file() and -e $ensembl->dna_file()){ - print "Ensembl Fasta files found (no new dumping)\n"; - return; - } - - print "Dumping Ensembl Fasta files\n"; - - open(DNA,">".$ensembl->dna_file()) - || die("Could not open dna file for writing: ".$ensembl->dna_file."\n"); - - open(PEP,">".$ensembl->protein_file()) - || die("Could not open protein file for writing: ".$ensembl->protein_file."\n"); - - my $gene_adaptor = $db->get_GeneAdaptor(); - - - # fetch by location, or everything if not defined - - my @genes; - print "pre get genes\n"; - if ($location) { - - my $slice_adaptor = $db->get_SliceAdaptor(); - my $slice = $slice_adaptor->fetch_by_name($location); - @genes = @{$gene_adaptor->fetch_all_by_Slice( $slice )}; - - } else { - - my $constraint; - if( $logic_name ){ - $constraint = $gene_adaptor->_logic_name_to_constraint( '',$logic_name ); - } - print "pre fetch all\n"; - @genes = @{$gene_adaptor->fetch_all( $constraint )}; - print "post fetch_all\n"; - } - - my $max = undef; - if(defined($self->maxdump())){ - $max = $self->maxdump(); - } - my $i =0; - my $rna = 0; - print "for each gene\n"; - foreach my $gene (@genes){ - next if $gene->biotype eq 'J_segment'; - next if $gene->biotype eq 'D_segment'; - - foreach my $transcript (@{$gene->get_all_Transcripts()}) { - $i++; - my $seq = $transcript->spliced_seq(); - $seq =~ s/(.{60})/$1\n/g; - print DNA ">" . $transcript->dbID() . "\n" .$seq."\n"; - my $trans = $transcript->translation(); - my $translation = $transcript->translate(); - - if(defined($translation)){ - my $pep_seq = $translation->seq(); - $pep_seq =~ s/(.{60})/$1\n/g; - print PEP ">".$trans->dbID()."\n".$pep_seq."\n"; - } - } - - last if(defined($max) and $i > $max); - - } - close DNA; - close PEP; - -} - - - -### -# Getter/Setter methods -### - - - - - -=head2 method - - Arg [1] : (optional) list reference $arg - reference to a list of method names - Example : my @methods = @{$self->method()}; - Description: Getter / Setter for the methods - Returntype : list - Exceptions : none - -=cut - - -sub method{ - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_method} = $arg ); - return $self->{_method}; -} - -=head2 core - - Arg [1] : (optional) - Example : $mapper->core($new_core); - Description: Getter / Setter for the core. - info for the ensembl core database. - Returntype : XrefMapper::db - Exceptions : none - -=cut - -sub core{ - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_core} = $arg ); - return $self->{_core}; -} - - -=head2 dumpcheck - - Arg [1] : (optional) - Example : $mapper->dumpcheck("yes"); - Description: Getter / Setter for dumpcheck. - If set the mapper will not dump fasta files - if they exist already. - Returntype : scalar - Exceptions : none - -=cut - -sub dumpcheck { - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_dumpcheck} = $arg ); - return $self->{_dumpcheck}; -} - -=head2 external_db_file - - Arg [1] : (optional) - Example : $mapper->external_db_file(cwd()."../../external_db.txt"); - Description: Getter / Setter for external_db_file. - Stores the file witj path of where to get the external database file. - Returntype : scalar - Exceptions : none - -=cut - -sub external_db_file { - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_external_db_file} = $arg ); - return $self->{_external_db_file}; -} - -=head2 upload - - Arg [1] : (optional) - Example : $mapper->upload("yes"); - Description: Getter / Setter for dumpcheck. - If set the mapper will not upload files. - Returntype : scalar - Exceptions : none - -=cut - -sub upload { - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_upload} = $arg ); - return $self->{_upload}; -} - -sub nofarm{ - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_nofarm} = $arg ); - return $self->{_nofarm}; -} - - - -=head2 maxdump - - Arg [1] : (optional) - Example : $mapper->maxdump(10); - Description: Getter / Setter for maxdump. - If set the mapper will only dump that number of - sequences into the fasta files. (Mainly used for testing). - Returntype : scalar - Exceptions : none - -=cut - -sub maxdump { - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_maxdump} = $arg ); - return $self->{_maxdump}; -} - - - -=head2 use_existing_mappings - - Arg [1] : (optional) - Example : $mapper->use_existing_mappings("yes"); - Description: Getter / Setter for use_existing_mappings. - If set the mapper will not redo the mapping - but parse the existing .map files. - Returntype : scalar - Exceptions : none - -=cut - -sub use_existing_mappings { - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_use_existing_mappings} = $arg ); - return $self->{_use_existing_mappings}; -} - - -=head2 logic_name - - Arg [1] : (optional) - Example : $mapper->logic_name('ensembl_genes'); - Description: Getter / Setter for logic_name. - Only genes corresponding to this analysis.logic_name - will be dumped. - Returntype : scalar - Exceptions : none - -=cut - -sub logic_name { - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_logic_name} = $arg ); - return $self->{_logic_name}; -} - - -=head2 xref - - Arg [1] : (optional) - Example : $mapper->core($new_core); - Description: Getter / Setter for the core. - info for the xref database. - Returntype : XrefMapper::db - Exceptions : none - -=cut - -sub xref{ - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_xref} = $arg ); - return $self->{_xref}; -} - -=head2 jobcount - - Arg [1] : (optional) - Example : $mapper->jobcount(1004); - Description: Getter / Setter for number of jobs submitted. - Returntype : scalar - Exceptions : none - -=cut - -sub jobcount { - my ($self, $arg) = @_; - - (defined $arg) && - ($self->{_jobcount} = $arg ); - return $self->{_jobcount}; -} - - -=head2 run_mapping - - Arg[1] : List of lists of (method, query, target) - Arg[2] : - Example : none - Description: Create and submit mapping jobs to LSF, and wait for them to finish. - Returntype : none - Exceptions : none - Caller : general - -=cut - -sub run_mapping { - - my ($self, $lists) = @_; - - # delete old output files in target directory if we're going to produce new ones - if (!defined($self->use_existing_mappings())) { - my $dir = $self->core->dir(); - print "Deleting out, err and map files from output dir: $dir\n"; - unlink (<$dir/*.map $dir/*.out $dir/*.err>); - } - $self->remove_all_old_output_files(); - #disconnect so that we can then reconnect after the long mapping bit. - $self->core->dbc->disconnect_if_idle(1); - $self->xref->dbc->disconnect_if_idle(1); - $self->core->dbc->disconnect_when_inactive(1); - $self->xref->dbc->disconnect_when_inactive(1); - - # foreach method, submit the appropriate job & keep track of the job name - # note we check if use_existing_mappings is set here, not earlier, as we - # still need to instantiate the method object in order to fill - # method_query_threshold and method_target_threshold - - my @job_names; - my @running_methods; - foreach my $list (@$lists){ - - my ($method, $queryfile ,$targetfile) = @$list; - - my $obj_name = "XrefMapper::Methods::$method"; - # check that the appropriate object exists - eval "require $obj_name"; - if($@) { - - warn("Could not find object $obj_name corresponding to mapping method $method, skipping\n$@"); - - } else { - - my $obj = $obj_name->new(); - $method_query_threshold{$method} = $obj->query_identity_threshold(); - $method_target_threshold{$method} = $obj->target_identity_threshold(); - - - if (!defined($self->use_existing_mappings)) { - my $job_name = $obj->run($queryfile, $targetfile, $self->core->dir(),$self->nofarm); - push @job_names, $job_name; - push @running_methods, $obj; - sleep 1; # make sure unique names really are unique - } - $self->jobcount($self->jobcount+$obj->jobcount); - } - - } # foreach method - - if (!defined($self->use_existing_mappings)) { - # submit depend job to wait for all mapping jobs - foreach my $method( @running_methods ){ - # Submit all method-specific depend jobs - if( $method->can('submit_depend_job') ){ - $method->submit_depend_job; - } - } - # Submit generic depend job. Defaults to LSF - $self->submit_depend_job($self->core->dir, @job_names); - } - $self->check_err($self->core->dir); - -} # run_mapping - - -=head2 submit_depend_job - - Arg[1] : List of job names. - Arg[2] : - Example : none - Description: Submit an LSF job that waits for other jobs to finish. - Returntype : none - Exceptions : none - Caller : general - -=cut - -sub submit_depend_job { - - my ($self, $root_dir, @job_names) = @_; - - - if(defined($self->nofarm)){ - return; - } - - # Submit a job that does nothing but wait on the main jobs to - # finish. This job is submitted interactively so the exec does not - # return until everything is finished. - - # build up the bsub command; first part - my @depend_bsub = ('bsub', '-K'); - - # build -w 'ended(job1) && ended(job2)' clause - my $ended_str = "-w "; - my $i = 0; - foreach my $job (@job_names) { - $ended_str .= "ended($job)"; - $ended_str .= " && " if ($i < $#job_names); - $i++; - } - - push @depend_bsub, $ended_str; - - # rest of command - push @depend_bsub, ('-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err"); - - - my $jobid = 0; - - eval { - my $pid; - my $reader; - - local *BSUB; - local *BSUB_READER; - - if ( ( $reader = open( BSUB_READER, '-|' ) ) ) { - while (<BSUB_READER>) { - #print "YES:$_"; - if (/^Job <(\d+)> is submitted/) { - $jobid = $1; - print "LSF job ID for depend job: $jobid\n"; - } - } - close(BSUB_READER); - } else { - die("Could not fork : $!\n") unless ( defined($reader) ); - open( STDERR, ">&STDOUT" ); - if ( ( $pid = open( BSUB, '|-' ) ) ) { - print BSUB "/bin/true\n"; - close BSUB; - if ( $? != 0 ) { - die( "bsub exited with non-zero status ($?) " - . "- job not submitted\n" ); - } - } else { - if ( defined($pid) ) { - exec(@depend_bsub); - die("Could not exec bsub : $!\n"); - } else { - die("Could not fork : $!\n"); - } - } - exit(0); - } - }; - - if ($@) { - # Something went wrong - warn("Job submission failed:\n$@\n"); - } - -} - -=head2 parse_mappings - - Example : none - Description: Parse exonerate output files and build files for loading into target db tables. - Returntype : List of strings - Exceptions : none - Caller : general - -=cut - -sub parse_mappings { - - my ($self, $notriage) = @_; - - my $ensembl = $self->core; - my $xref = $self->xref; - my $dir = $ensembl->dir(); - - # incase timed out, force reconnection - $ensembl->dbc->disconnect_if_idle(0); - $ensembl->dbc->disconnect_when_inactive(0); - $ensembl->dbc->connect(); - - $xref->dbc->disconnect_if_idle(0); - $xref->dbc->disconnect_when_inactive(0); - $xref->dbc->connect(); - - - # Will need lookup tables for gene/transcript/translation stable ID to internal ID - $self->build_stable_id_to_internal_id_hash(); - - # get current max object_xref_id - my $row = @{$ensembl->dbc->db_handle->selectall_arrayref("SELECT MAX(object_xref_id) FROM object_xref")}[0]; - my $max_object_xref_id = @{$row}[0] + 1; - if (!defined $max_object_xref_id) { - print "No existing object_xref_ids, will start from 1\n"; - $max_object_xref_id = 1; - } -# else { -# print "Maximum existing object_xref_id = $max_object_xref_id\n"; -# } - - $row = @{$ensembl->dbc->db_handle->selectall_arrayref("SELECT MAX(xref_id) FROM xref")}[0]; - my $max_xref_id = @$row[0]; - if (!defined $max_xref_id) { - print "No existing xref_ids, will start from 1\n"; - $max_xref_id = 1; - } -# else { -# print "Maximum existing xref_id = $max_xref_id\n"; -# } - - my $xref_id_offset = $max_xref_id + 1; - - $self->xref_id_offset($xref_id_offset); #store - - # files to write table data to - open (OBJECT_XREF, ">$dir/object_xref.txt"); - open (IDENTITY_XREF, ">$dir/identity_xref.txt"); - - my $total_lines = 0; - my $last_lines = 0; - my $total_files = 0; - - # keep a (unique) list of xref IDs that need to be written out to file as well - # this is a hash of hashes, keyed on xref id that relates xrefs to e! objects (may be 1-many) - my %primary_xref_ids = (); - - - # and a list of mappings of ensembl objects to xrefs - # (primary now, dependent added in dump_core_xrefs) - # this is required for display_xref generation later - # format: - # key: ensembl object type:ensembl object id - # value: list of xref_id (with offset) - - my @dna_check=(); - my @pep_check=(); - my $count_new=0; - - foreach my $file (glob("$dir/*.map")) { - - #print "Parsing results from " . basename($file) . "\n"; - open(FILE, $file); - $total_files++; - - # files are named Method_(dna|peptide)_N.map - - my ($method, $type, $part) = get_parts($file); - - if($type =~ 'Translation'){ - $pep_check[$part] = $part; - } - elsif($type =~ 'Transcript'){ - $dna_check[$part] = $part; - } - else{ - die "unknown type $type\n"; - } - # get or create the appropriate analysis ID - # XXX restore when using writeable database - my $analysis_id = $self->get_analysis_id($type); - - while (<FILE>) { - - $total_lines++; - chomp(); - my ($label, $query_id, $target_id, $identity, $query_length, $target_length, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score) = split(/:/, $_); - $cigar_line =~ s/ //g; - $cigar_line =~ s/([MDI])(\d+)/$2$1/ig; - - - # calculate percentage identities - my $query_identity = int (100 * $identity / $query_length); - my $target_identity = int (100 * $identity / $target_length); - - # only take mappings where there is a good match on one or both sequences - if ($query_identity < $method_query_threshold{$method} && - $target_identity < $method_target_threshold{$method}){ - my $reason = $type."|".$target_id."|".$query_identity."|".$target_identity."|"; - $reason .= $method_query_threshold{$method}."|". $method_target_threshold{$method}; - - if(!defined($priority_xref_source_id{$query_id})){ - $failed_xref_mappings{$query_id} = $reason; - } - else{ - $priority_failed{$priority_source_id_to_name{$priority_xref_source_id{$query_id}} - .":".$priority_xref_acc{$query_id}} = $reason; - } - next; - } - - - if(defined($priority_xref_source_id{$query_id})){ - my $source_id = $priority_xref_source_id{$query_id}; - if(!defined($priority_source_id_to_name{$source_id}) or length($priority_source_id_to_name{$source_id}) < 2){ - print STDERR "priority_source_id_to_name has ".scalar(%priority_source_id_to_name)." keys for hash\n"; - die "no source name for source id $source_id\n"; - } - my $key = $priority_source_id_to_name{$source_id}.":".$priority_xref_acc{$query_id}; - if(!defined($priority_xref_priority{$key})){ - - $priority_xref{$key} = $query_id; - $priority_xref_priority{$key} = $priority_source_id{$source_id}; - $priority_object_xref{$key} = "$type:$target_id\n"; - $priority_identity_xref{$key} = join("\t", ($query_identity, $target_identity, - $query_start+1, $query_end, - $target_start+1, $target_end, - $cigar_line, $score, "\\N", $analysis_id)) . "\n"; - $priority_xref_state{$key} = "primary"; - $priority_xref_extra_bit{$query_id} = "\tSEQUENCE_MATCH\t" . "Relationship generated from exonerate mapping" . "\n"; - next; - } - if($priority_xref_priority{$key} - > $priority_source_id{$source_id}){ - - $priority_xref{$key} = $query_id; - $priority_xref_priority{$key} = $priority_source_id{$source_id}; - $priority_object_xref{$key} = "$type:$target_id\n"; - $priority_identity_xref{$key} = join("\t", ($query_identity, $target_identity, - $query_start+1, $query_end, - $target_start+1, $target_end, - $cigar_line, $score, "\\N", $analysis_id)) . "\n"; - $priority_xref_state{$key} = "primary"; - $priority_xref_extra_bit{$query_id} = "\tSEQUENCE_MATCH\t" . "Relationship generated from exonerate mapping" . "\n"; - } - next; # do not store OBJECT, IDENTITY or set primary_xref. do much later - - } - else{ - $count_new++; - } - # note we add on $xref_id_offset to avoid clashes - $object_succesfully_mapped{$query_id} = 1; - if(!defined($primary_xref_ids{$query_id}{$target_id."|".$type})){ - print OBJECT_XREF "$max_object_xref_id\t$target_id\t$type\t" . ($query_id+$xref_id_offset) . "\t\\N\n"; - - - my $identity_string = join("\t", ($query_identity, $target_identity, $query_start+1, $query_end, $target_start+1, $target_end, $cigar_line, $score, "\\N", $analysis_id)) . "\n"; - - print IDENTITY_XREF $max_object_xref_id."\t".$identity_string; - - $max_object_xref_id++; - - # store mapping for later - note NON-OFFSET xref_id is used - my $key = $type . "|" . $target_id; - my $xref_id = $query_id; - - - # note the NON-OFFSET xref_id is stored here as the values are used in - # a query against the original xref database - $primary_xref_ids{$query_id}{$target_id."|".$type} = $target_id."|".$type; - $primary_identity{$query_id}{$target_id."|".$type} = $identity_string; - } - } - close(FILE); - $last_lines = $total_lines; - } - - close(IDENTITY_XREF); - close(OBJECT_XREF); - - print "Read $total_lines lines from $total_files exonerate output files\n"; - print "\nnumber of non priority primary xrefs is $count_new\n"; - - - if($self->jobcount() != $total_files){ - print( ( $dna_check[-1] || 0 ) . " dna map files\n" ); - print( ( $pep_check[-1] || 0 ) . " peptide map files\n" ); - my $test_failed = 0; - for ( my $i = 1 ; $i < ( $dna_check[-1] || 0 ) ; $i++ ) { - if ( $dna_check[$i] != $i ) { - print "DNA $i file not found\n"; - $test_failed = 1; - } - } - for ( my $i = 1 ; $i < ( $pep_check[-1] || 0 ) ; $i++ ) { - if ( $pep_check[$i] != $i ) { - print "PEPTIDE $i file not found\n"; - $test_failed = 1; - } - } - if($test_failed){ - die "Missing Files aborting run\n"; - } - if(!defined($self->use_existing_mappings())){ - print "There should be ".$self->jobcount()." files. Please check\n"; - print "As this is the number of jobs submitted\n"; - die "There should be ".$self->jobcount()." files. Please check\n"; - } - } - - # write relevant xrefs to file - $max_object_xref_id - = $self->dump_core_xrefs(\%primary_xref_ids, - $max_object_xref_id, $xref_id_offset); - - # dump direct xrefs - $max_object_xref_id - = $self->dump_direct_xrefs($xref_id_offset, $max_object_xref_id); - - # dump xrefs that don't appear in primary_xref, direct_xref or - # dependent_xref tables (e.g. interpro) - $self->dump_orphan_xrefs($xref_id_offset); - - # dump interpro table as well - $self->dump_interpro($xref_id_offset,$max_object_xref_id); - -} - - -sub process_priority_xrefs{ - my ($self) =@_; - - - my $ensembl = $self->core; - my $xref = $self->xref; - my $dir = $ensembl->dir(); - - my $primary_sql= (<<PSQL); - SELECT DISTINCT(s.name), px.sequence_type - FROM source s, primary_xref px, xref x - WHERE x.xref_id = px.xref_id - AND s.source_id = x.source_id -PSQL - - my $psth = $self->xref->dbc->prepare($primary_sql) || die "prepare failed"; - $psth->execute() || die "execute failed"; - - my %source_2_seqtype=(); - - my ($prim,$seq_type); - $psth->bind_columns(\$prim,\$seq_type); - while($psth->fetch()){ - $source_2_seqtype{$prim} = $seq_type; - } - - - - - my $xref_id_offset = $self->xref_id_offset(); - # get current max object_xref_id - my $row = @{$ensembl->dbc->db_handle->selectall_arrayref("SELECT MAX(object_xref_id) FROM object_xref")}[0]; - my $max_object_xref_id = @{$row}[0]; - if (!defined $max_object_xref_id) { - print "No existing object_xref_ids, will start from 1\n"; - $max_object_xref_id = 1; - } -# else { -# print "Maximum existing object_xref_id = $max_object_xref_id\n"; -# } - my $object_xref_id = $max_object_xref_id + 1; - my @go_depend = (); - open(XREF_P,">$dir/xref_priority.txt") || die "Could not open xref_priority.txt"; - open(OBJECT_XREF_P,">$dir/object_xref_priority.txt") || die "Could not open object_xref_priority.txt"; - open(IDENTITY_XREF_P,">$dir/identity_xref_priority.txt") || die "Could not open identity_xref_priority.txt"; - open(IDENTITY_XREF_TEMP,">>$dir/identity_xref_temp.txt") || die "Could not open identity_xref_temp.txt"; - - - my @xref_list=(); - - # first process the 'primary' ones. As these may generate new ones - # and hence added to the list as we go along. These would be missed if - # all done at once. - foreach my $key (keys %priority_xref){ - my($source_name,$acc) = split(/:/,$key); - my $xref_id = $priority_xref{$key}; - my $dep_list; - my $direct = 0; - if($priority_xref_state{$key} ne "primary"){ - if(($priority_xref_state{$key} eq "direct" and defined($source_2_seqtype{$source_name}))){ - $direct = 1; - $priority_xref_state{$key} = "primary"; - } - else{ - next; - } - } - my ($type,$id) = split(/:/,$priority_object_xref{$key}); - chomp $id; - if(!defined($acc) or $acc eq ""){ - print "No accession for ".($xref_id+$xref_id_offset)."\t1\n"; - } - $XXXxref_id_to_accession{$xref_id} = $acc; -#PROCESS PRIORITY XREFS - $dep_list = $self->dump_all_dependencies($xref_id, $xref_id_offset, $type, $id); - push @xref_list, $xref_id; - if(defined($priority_object_xref{$key})){ - $object_succesfully_mapped{$xref_id} = 1; - print OBJECT_XREF_P $object_xref_id."\t".$id."\t".$type."\t".($xref_id+$xref_id_offset)."\t\\N\n"; - if(defined($priority_identity_xref{$key})){ - print IDENTITY_XREF_P $object_xref_id."\t".$priority_identity_xref{$key}; - } - elsif(defined($priority_identity_temp_xref{$key})){ - print IDENTITY_XREF_TEMP $object_xref_id."\t".$priority_identity_temp_xref{$key}; - } - elsif(!$direct){ - print "No identity xref stuff for object_xref_id $object_xref_id\t$key\n"; - } - $object_xref_id++; - foreach my $dependent (@$dep_list){ - $object_succesfully_mapped{$xref_id} = 1; - print OBJECT_XREF_P $object_xref_id."\t".$id."\t".$type."\t".$dependent. - "\tFROM:".$source_name.":".$acc."\n"; - print IDENTITY_XREF_TEMP $object_xref_id."\t".$primary_identity{$xref_id}{$id."|".$type}; - $object_xref_id++; - } - } - elsif($source_name eq "HGNC"){ - print "HGNC $xref_id has no object_xref?\n"; - } - } - - foreach my $key (keys %priority_xref){ - my($source_name,$acc) = split(/:/,$key); - my $xref_id = $priority_xref{$key}; - my $dep_list; - if($priority_xref_state{$key} eq "primary"){ #ignore as already done. - next; - } - push @xref_list, $xref_id; - if(defined($priority_object_xref{$key})){ - my ($type,$id) = split(/:/,$priority_object_xref{$key}); - $object_succesfully_mapped{$xref_id} = 1; - print OBJECT_XREF_P $object_xref_id."\t".$id."\t".$type."\t".($xref_id+$xref_id_offset)."\t\\N\n"; - if(defined($priority_identity_xref{$key})){ - print IDENTITY_XREF_P $object_xref_id."\t".$priority_identity_xref{$key}; - } - elsif(defined($primary_identity{$xref_id}{$id."|".$type})){ - print IDENTITY_XREF_TEMP $object_xref_id."\t".$primary_identity{$xref_id}{$id."|".$type}; - } - elsif(defined($priority_identity_temp_xref{$key})){ - print IDENTITY_XREF_TEMP $object_xref_id."\t".$priority_identity_temp_xref{$key}; - } - elsif($priority_xref_state{$key} ne "direct"){ - print "No identity for object_xref_id $object_xref_id?? $key\n"; - } - - $object_xref_id++; - } - } - - if(scalar(@xref_list) < 1){ -# print "At end of process prioritys Maximum existing object_xref_id = $max_object_xref_id\n"; - return; - } - my $list = join(", ", @xref_list); - my $sql = "SELECT xref_id, accession, version, label, description, source_id from xref where xref_id in ($list)"; - my $sth = $self->xref->dbc->prepare($sql); - $sth->execute(); - my ($xref_id, $acc,$ver, $label, $desc, $source_id); - $sth->bind_columns(\$xref_id, \$acc, \$ver, \$label, \$desc, \$source_id); - while($sth->fetch()){ - my $key = $priority_source_id_to_name{$priority_xref_source_id{$xref_id}}.":".$acc; - - print XREF_P ( $xref_id + $xref_id_offset ) . "\t" - . ( $source_to_external_db{$source_id} || '' ) . "\t" - . ( $acc || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $ver || 0 ) . "\t" - . ( $desc || '' . "\t"); - - if(defined( $priority_xref_extra_bit{$xref_id})){ - print XREF_P $priority_xref_extra_bit{$xref_id}; # no need for "\n" already added; - } - else{ - print STDERR "no extra bit for $acc ($xref_id)\n"; - print XREF_P "\n"; - } - $xrefs_written{$xref_id} = 1; - $priority_seenit{$key} = "SEEN_IT"; - } - $sth->finish; - - close XREF_P; - close OBJECT_XREF_P; - close IDENTITY_XREF_P; - close IDENTITY_XREF_TEMP; - - # Do one big query to get a list of all the synonyms; note each xref may have - # more than one synonym so they are stored in a hash of lists - my $syn_count = 0; - my %synonyms; - my $syn_sth = $self->xref->dbc->prepare("SELECT xref_id, synonym FROM synonym where xref_id in ($list)"); - $syn_sth->execute(); - - my ($sxref_id, $synonym); - $syn_sth->bind_columns(\$sxref_id, \$synonym); - while ($syn_sth->fetch()) { - - if(defined($priority_xref_source_id{$sxref_id})){ - push @{$synonyms{$sxref_id}}, $synonym; - } - } - - open (EXTERNAL_SYNONYM, ">$dir/external_synonym_priority.txt"); - # Dump any synonyms for xrefs we've written - # Now write the synonyms we want to the file - foreach my $xref_id (keys %synonyms) { - foreach my $syn (@{$synonyms{$xref_id}}) { - print EXTERNAL_SYNONYM ($xref_id+$xref_id_offset) . "\t" . $syn . "\n"; - $syn_count++; - } - } - close EXTERNAL_SYNONYM; - - - - foreach my $table ("xref","object_xref","identity_xref","external_synonym","go_xref"){ - my $file = $dir."/".$table."_priority.txt"; - - if(-s $file){ - my $sth = $ensembl->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE $table"); - print "Uploading data in $file to $table\n"; - $sth->execute() || die "error loading file $file $!"; - } - else{ - print "NO file or zero size file, so not able to load file $file to $table\n"; - } - } - - - print "At end of processing prioritys\n"; - -} - - -sub get_stable_ids { - my ( $self, $type, $string, $hashref ) = @_; - - my $sql = - sprintf( 'SELECT %1$s_id, stable_id ' - . 'FROM %1$s_stable_id ' - . 'WHERE %1$s_id IN ( %2$s )', - $type, $string ); - - my $sth = $self->core->dbc->prepare($sql); - $sth->execute(); - - my ( $trans, $stable ); - $sth->bind_columns( \$trans, \$stable ); - - while ( $sth->fetch() ) { - $hashref->{$trans} = $stable; - } - - $sth->finish; -} - -sub get_failed_id{ - my ($self, $q_cut, $t_cut, $summary_failed)= @_; - my $description_failed = "Unable to match at the thresholds of $q_cut\% for the query or $t_cut\% for the target"; - - my $sth = $self->core->dbc->prepare("select unmapped_reason_id from unmapped_reason where full_description like '".$description_failed."'"); - $sth->execute(); - my $xref_failed_id=undef; - $sth->bind_columns(\$xref_failed_id); - $sth->fetch; - if(!defined($xref_failed_id)){ - print STDERR "Could not find the description:\n"; - print STDERR $description_failed."\n"; - print STDERR "In the directory ensembl/misc-scripts/unmapped_reason you "; - print STDERR "can add the new reason to the unmapped_reason.txt file "; - print STDERR "and run the update_unmapped_reasons.pl script to update "; - print STDERR "your core database\n"; - print STDERR "Alterntively do not add the triage data and add -notriage to the command line.\n"; - die(); - } - return $xref_failed_id; -} - -sub dump_triage_data { - my ($self) = @_; - - - print "Dumping triage data\n"; - my $xref_id_offset = $self->xref_id_offset(); - my $translation=""; - my $translation_count=0; - my $transcript=""; - my $transcript_count=0; - my $batch_size=200; - my %translation_2_stable=(); - my %transcript_2_stable=(); - foreach my $temp (values %failed_xref_mappings){ - my ($type,$id,@junk) = split(/\|/,$temp); - if($type =~ /Translation/){ - $translation_count++; - if($translation_count > $batch_size){ - my $ex = $self->get_stable_ids("translation",$translation.$id,\%translation_2_stable); - $translation_count = 0; - $translation = ""; - } - else{ - $translation .= "$id,"; - } - } - elsif($type =~ /Transcript/){ - $transcript_count++; - if($transcript_count > $batch_size){ - $self->get_stable_ids("transcript",$transcript.$id,\%transcript_2_stable); - $transcript_count=0; - $transcript=""; - } - else{ - $transcript .= "$id,"; - } - } - else{ - die "Unknown type *".$type."*\n".$temp."\n"; - } - } - if($transcript_count){ - chop $transcript; # remove last , from list - $self->get_stable_ids("transcript",$transcript,\%transcript_2_stable); - } - if($translation_count){ - chop $translation; - $self->get_stable_ids("translation",$translation,\%translation_2_stable); - } - - - my $primary_sql= (<<PSQL); - SELECT DISTINCT(s.source_id), px.sequence_type - FROM source s, primary_xref px, xref x - WHERE x.xref_id = px.xref_id - AND s.source_id = x.source_id -PSQL - - my $psth = $self->xref->dbc->prepare($primary_sql) || die "prepare failed"; - $psth->execute() || die "execute failed"; - - my @primary_sources =(); - my %source_2_seqtype=(); - - my ($prim,$seq_type); - $psth->bind_columns(\$prim,\$seq_type); - while($psth->fetch()){ - push @primary_sources, $prim; - $source_2_seqtype{$prim} = $seq_type; - } - - open (XREF, ">" . $self->core->dir() . "/xref_triage.txt"); - - open (UNMAPPED_OBJECT, ">" . $self->core->dir() . "/unmapped_object_triage.txt"); - - # get the Unmapped Object Reasons for the core database and - # add new ones if the standard xref descriptions are not there - my %cutoff_2_failed_id=(); - - my $summary_failed = "Failed to match at thresholds"; - my $summary_missed = "Failed to match"; - my $description_missed = "Unable to match to any ensembl entity at all"; - - my $sth = $self->core->dbc->prepare("select MAX(unmapped_object_id) ". - "from unmapped_object"); - $sth->execute(); - my $max_unmapped_object_id; - $sth->bind_columns(\$max_unmapped_object_id); - $sth->fetch; - if(!defined($max_unmapped_object_id)){ - $max_unmapped_object_id = 1; - } - $sth->finish; - - my ($xref_DNA_analysis, $xref_PROT_analysis); - - $sth = $self->core->dbc->prepare("select unmapped_reason_id from unmapped_reason where full_description like '".$description_missed."'"); - $sth->execute(); - my $xref_missed_id; - $sth->bind_columns(\$xref_missed_id); - $sth->fetch; - if(!defined($xref_missed_id)){ - print STDERR "Could not find the description:\n"; - print STDERR $description_missed,"\n"; - print STDERR "In the directory ensembl/misc-scripts/unmapped_reason you "; - print STDERR "can add the new reason to the unmapped_reason.txt file "; - print STDERR "and run the update_unmapped_reasons.pl script to update "; - print STDERR "your core database\n"; - print STDERR "Alterntively do not add the triage data and add -notriage to the command line.\n"; - die(); - } - $sth->finish; - - - foreach my $source (@primary_sources){ - - my %triage_dumped=(); # dump only once for each accession - - if(defined($priority_source_id{$source})){ # These are done seperately at the end. - next; - } - - my $sql = "select x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, ". - "x.species_id from xref x where x.source_id = $source"; - my $sth = $self->xref->dbc->prepare($sql); - $sth->execute(); - - my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id); - $sth->bind_columns(\$xref_id, \$accession, \$version, \$label, - \$description, \$source_id, \$species_id); - while($sth->fetch()){ - if (!$xrefs_written{$xref_id}) { - my $external_db_id = $source_to_external_db{$source_id}; - if(!defined($updated_source{$external_db_id})){ - $self->cleanup_sources_file($external_db_id); - } - - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" . "MISC" . "\t" - . "No match\n"; - - #dump out dependencies aswell - - $XXXxref_id_to_accession{$xref_id} = $accession; - $XXXxref_id_to_source_id{$xref_id} = $source_id; - if(!defined($accession) or $accession eq ""){ - print STDERR "No accession for ".($xref_id+$xref_id_offset)."\t2\n"; - } - - $self->dump_all_dependencies($xref_id, $xref_id_offset); - - if(defined($failed_xref_mappings{$xref_id})){ - - my ($ensembl_type,$ensembl_id,$q_perc,$t_perc,$q_cut,$t_cut) = - split(/\|/,$failed_xref_mappings{$xref_id}); - - if(!defined($cutoff_2_failed_id{$q_cut."_".$t_cut})){ - $cutoff_2_failed_id{$q_cut."_".$t_cut} = $self->get_failed_id($q_cut, $t_cut, $summary_failed); - } - $max_unmapped_object_id++; - - print UNMAPPED_OBJECT $max_unmapped_object_id."\txref\t"; - - if($ensembl_type =~ /Translation/){ - if (not defined $xref_PROT_analysis) { - $xref_PROT_analysis = $self->get_analysis_id($ensembl_type); - } - print UNMAPPED_OBJECT $xref_PROT_analysis."\t"; - } - elsif($ensembl_type =~ /Transcript/){ - if (not defined $xref_DNA_analysis) { - $xref_DNA_analysis = $self->get_analysis_id($ensembl_type); - } - print UNMAPPED_OBJECT $xref_DNA_analysis."\t"; - } - else{ - die "type=*".$ensembl_type."*\n".$failed_xref_mappings{$xref_id}."\n"; - } - print UNMAPPED_OBJECT $external_db_id."\t".$accession."\t".$cutoff_2_failed_id{$q_cut."_".$t_cut}."\t"; - print UNMAPPED_OBJECT $q_perc."\t".$t_perc."\t"; - print UNMAPPED_OBJECT $ensembl_id."\t".$ensembl_type."\n"; - - } - else{ - $max_unmapped_object_id++; - print UNMAPPED_OBJECT $max_unmapped_object_id."\txref\t"; - if($source_2_seqtype{$source} =~ /peptide/){ - if (not defined $xref_PROT_analysis) { - $xref_PROT_analysis = $self->get_analysis_id("translation"); - } - print UNMAPPED_OBJECT $xref_PROT_analysis."\t"; - } - elsif($source_2_seqtype{$source} =~ /dna/){ - if (not defined $xref_DNA_analysis) { - $xref_DNA_analysis = $self->get_analysis_id("transcript"); - } - print UNMAPPED_OBJECT $xref_DNA_analysis."\t"; - } - print UNMAPPED_OBJECT $external_db_id."\t".$accession."\t"; - print UNMAPPED_OBJECT $xref_missed_id."\t0\t0\t0\t\\N\n"; - } - } - } - $sth->finish; - } - close(XREF); - close(UNMAPPED_OBJECT); - - foreach my $table ("xref","unmapped_object"){ - my $file = $self->core->dir() . "/" . $table . "_triage.txt"; - - if(-s $file){ - my $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE $table"); - print "Uploading data in $file to $table\n"; - $sth->execute(); - } - else{ - print "NO file or zero size file, so not able to load file $file to $table\n"; - } - } - -} - -# dump xrefs that don't appear in either the primary_xref or dependent_xref tables -# also ignore direct_xref -# e.g. Interpro xrefs - -sub dump_orphan_xrefs { - - my ($self, $xref_id_offset) = @_; - - my $count = 0; - - open (XREF, ">>" . $self->core->dir() . "/xref.txt"); - - - my $sql = q[ - SELECT x.xref_id, x.accession, x.version, x.label, x.description, - x.source_id, x.species_id - FROM xref x - LEFT JOIN primary_xref px ON px.xref_id=x.xref_id - LEFT JOIN dependent_xref dx ON dx.dependent_xref_id=x.xref_id - LEFT JOIN direct_xref dirx ON dirx.general_xref_id=x.xref_id - WHERE px.xref_id IS NULL - AND dx.dependent_xref_id IS NULL - AND dirx.general_xref_id is NULL ]; - - my $sth = $self->xref->dbc->prepare($sql); - $sth->execute(); - - my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id); - $sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id); - - while ($sth->fetch()) { - - if(defined($priority_source_id{$source_id})){ - next; - } - - my $external_db_id = $source_to_external_db{$source_id}; - if ($external_db_id) { # skip "unknown" sources - if (!$xrefs_written{$xref_id}) { - if(!defined($updated_source{$external_db_id})){ - $self->cleanup_sources_file($external_db_id); - } - if(!defined($priority_xref_source_id{$xref_id})){ - - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" . "MISC" . "\t" - . "No match\n"; - - $xrefs_written{$xref_id} = 1; - $count++; - } - } - } - - } - $sth->finish(); - - close(XREF); - - print "Wrote $count xrefs that are neither primary nor dependent\n"; - -} - - -# Dump direct xrefs. Need to do stable ID -> internal ID mapping. - -sub dump_direct_xrefs { - - my ($self, $xref_id_offset, $max_object_xref_id) = @_; - my $object_xref_id = $max_object_xref_id + 1; - - print "Writing direct xrefs\n"; - - my $count = 0; - - open (XREF, ">>" . $self->core->dir() . "/xref.txt"); - open (OBJECT_XREF, ">>" . $self->core->dir() . "/object_xref.txt"); - open (GO_XREF, ">>" .$self->core->dir(). "/go_xref.txt"); - - - my %go_source; - my $worm_pep_source_id = undef; - my $worm_locus_source_id = undef; - my $worm_gene_source_id = undef; - my $worm_transcript_source_id = undef; - - # Will need to look up translation stable ID from transcript stable ID, build hash table - - my %transcript_stable_id_to_translation_stable_id; - my $sql = (<<SQL); - SELECT tss.stable_id as transcript, tls.stable_id AS translation - FROM translation tl, translation_stable_id tls, transcript_stable_id tss - WHERE tss.transcript_id=tl.transcript_id AND tl.translation_id=tls.translation_id -SQL - - my $trans_sth = $self->core->dbc->prepare($sql); - $trans_sth->execute(); - my ($transcript_stable_id, $translation_stable_id); - $trans_sth->bind_columns(\$transcript_stable_id, \$translation_stable_id); - while ($trans_sth->fetch()) { - $transcript_stable_id_to_translation_stable_id{$transcript_stable_id} = $translation_stable_id; - } - $trans_sth->finish(); - - - # SQL / statement handle for getting all direct xrefs - my $xref_sql = (<<XSQL); - SELECT dx.general_xref_id, dx.ensembl_stable_id, dx.type, dx.linkage_xref, - x.accession, x.version, x.label, x.description, x.source_id, x.species_id - FROM direct_xref dx, xref x - WHERE dx.general_xref_id=x.xref_id -XSQL - - my $xref_sth = $self->xref->dbc->prepare($xref_sql); - - my $rv = $xref_sth->execute(); - - my ($xref_id, $ensembl_stable_id, $type, $linkage_xref, $accession, $version, - $label, $description, $source_id, $species_id); - $xref_sth->bind_columns(\$xref_id, \$ensembl_stable_id, \$type, \$linkage_xref,\ $accession, - \$version, \$label, \$description, \$source_id, \$species_id); - - - my %error_count; - my %error_example; - - my $ccds_source; - eval{ $ccds_source = - get_source_id_from_source_name($self->xref(), "CCDS") }; - if( $@ and $@ !~ /^Please try again/){ die( "==> $@" ) } - - while ($xref_sth->fetch()) { - my $external_db_id = $source_to_external_db{$source_id}; - - unless( $external_db_id ){ - if( ! defined( $external_db_id ) ){ - warn(" No external_db_id for source $source_id. Skip source!\n" ); - $source_to_external_db{$source_id} = 0; - } - next; - } - - # In the case of CCDS xrefs, direct_xref is to transcript but we want - # the mapping in the core db to be to the *translation* - if ($source_id == $ccds_source) { - $type = 'translation'; - my $tmp_esid = $ensembl_stable_id; - $ensembl_stable_id = $transcript_stable_id_to_translation_stable_id{$tmp_esid}; - if(defined($error_count{$source_id}) and !$ensembl_stable_id){ - $error_count{$source_id}++; - if($error_count{$source_id} < 6){ - $error_example{$source_id} .= ", $tmp_esid - $accession"; - } - } - elsif(!$ensembl_stable_id){ - $error_count{$source_id} = 1; - $error_example{$source_id} = "$tmp_esid - $accession"; - } - } - - my $ensembl_internal_id; - if(!defined($type)){ - $type = ""; - } - if(!defined($ensembl_stable_id)){ - $ensembl_stable_id = ""; - } - - if(defined($stable_id_to_internal_id{$type}->{$ensembl_stable_id})){ - $ensembl_internal_id = $stable_id_to_internal_id{$type}->{$ensembl_stable_id}; - } - else{ # ncRNA store internal id not stable check and switch if needed - if(defined($internal_id_to_stable_id{$type}{$ensembl_stable_id})){ - my $tmp = $internal_id_to_stable_id{$type}{$ensembl_stable_id}; - $ensembl_internal_id = $ensembl_stable_id; - $ensembl_stable_id = $tmp; - } - } - if ($ensembl_internal_id) { - - if (!$xrefs_written{$xref_id}) { - if(!defined($updated_source{$external_db_id})){ - $self->cleanup_sources_file($external_db_id); - } - - if(defined($priority_xref_source_id{$xref_id})){ - if(!defined($priority_source_id_to_name{$source_id}) or length($priority_source_id_to_name{$source_id}) < 2){ - print STDERR "priority_source_id_to_name has ".scalar(%priority_source_id_to_name)." keys for has\n"; - die "no source name for source id $source_id\n"; - } - my $key = $priority_source_id_to_name{$source_id}.":".$priority_xref_acc{$xref_id}; - if(!defined($priority_xref_priority{$key})){ - - $priority_xref_extra_bit{$xref_id} = "\tDIRECT" . "\t" . - "Externally assigned relationship between $ensembl_stable_id and $accession (via ". - $priority_source_id_to_desc{$source_id}.")" . "\n"; - $priority_xref{$key} = $xref_id; - $priority_xref_priority{$key} = $priority_source_id{$source_id}; - $priority_object_xref{$key} = ucfirst($type).":".$ensembl_internal_id; - $priority_identity_xref{$key} = undef; - - $priority_xref_state{$key} = "direct"; - next; # do not store XREF or OBJECT. do much later - } - if($priority_xref_priority{$key} # old one - > $priority_source_id{$source_id}){ # new one - - - $priority_xref_extra_bit{$xref_id} = "\tDIRECT" . "\t" . - "Externally assigned relationship between $ensembl_stable_id and $accession (via ". - $priority_source_id_to_desc{$source_id}.")" . "\n"; - $priority_xref{$key} = $xref_id; - $priority_xref_priority{$key} = $priority_source_id{$source_id}; - $priority_object_xref{$key} = ucfirst($type).":".$ensembl_internal_id; - $priority_identity_xref{$key} = undef; - - $priority_xref_state{$key} = "direct"; - } - next; # do not store XREF or OBJECT. do much later - } - - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" - . "DIRECT" . "\t" - . "Externally assigned relationship between " - . $ensembl_stable_id . " and " - . $accession ."\n"; - - $xrefs_written{$xref_id} = 1; - } - $object_succesfully_mapped{$xref_id} = 1; - print OBJECT_XREF "$object_xref_id\t$ensembl_internal_id\t" . ucfirst($type) . "\t" . ($xref_id+$xref_id_offset) . "\t\\N\n"; - $object_xref_id++; - $count++; - - } else { - if(!defined($worm_pep_source_id)){ - # Non-standard handling of WormBase stuff - eval{ $worm_pep_source_id = get_source_id_from_source_name - ($self->xref(), "wormpep_id") }; - if( $@ and $@ !~ /^Please try again/){ die( "==> $@" ) } - eval{ $worm_locus_source_id = get_source_id_from_source_name - ($self->xref(), "wormbase_locus") }; - if( $@ and $@ !~ /^Please try again/){ die( "==> $@" ) } - eval{ $worm_gene_source_id = get_source_id_from_source_name - ($self->xref(), "wormbase_gene") }; - if( $@ and $@ !~ /^Please try again/){ die( "==> $@" ) } - eval{ $worm_transcript_source_id = get_source_id_from_source_name - ($self->xref(), "wormbase_transcript") }; - if( $@ and $@ !~ /^Please try again/){ die( "==> $@" ) } - eval{ %go_source = %{get_source_hash_from_source_name - ($self->xref(), "GO" ) }}; - if( $@ and $@ !~ /^Please try again/){ die( "==> $@" ) } - $worm_pep_source_id ||= 0; - } - # deal with UTR transcripts in Elegans and potentially others - # Need to link xrefs that are listed as linking to e.g. ZK829.4 - # to each of ZK829.4.1, ZK829.4.2, ZK829.4.3 - my $old_object_xref_id = $object_xref_id; - if ($source_id == $worm_pep_source_id || defined($go_source{$source_id}) - || $source_id == $worm_locus_source_id || $source_id == $worm_gene_source_id - || $source_id == $worm_transcript_source_id) { - - # search for matching stable IDs - my $pat = $ensembl_stable_id . '\..+'; - foreach my $stable_id (keys %{$stable_id_to_internal_id{$type}}) { - - if ($stable_id =~ /$pat/) { - - if (!$xrefs_written{$xref_id}) { - if(!defined($updated_source{$external_db_id})){ - $self->cleanup_sources_file($external_db_id); - } - - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" - . "DIRECT" . "\t" - . "Externally assigned relationship between " - . $ensembl_stable_id . " and " - . $accession . "\n"; - - $xrefs_written{$xref_id} = 1; - } - $ensembl_internal_id = $stable_id_to_internal_id{$type}->{$stable_id}; - $object_succesfully_mapped{$xref_id} = 1; - print OBJECT_XREF "$object_xref_id\t$ensembl_internal_id\t" . ucfirst($type) . "\t" . ($xref_id+$xref_id_offset) . "\t\\N\n"; - if(defined($go_source{$source_id})){ - print GO_XREF $object_xref_id . "\t" . $linkage_xref . "\\N\n"; - $go_done{$ensembl_internal_id."|".ucfirst($type)."|" . ($xref_id+$xref_id_offset)} = 1; - } - $object_xref_id++; - - } - - } # foreach stable_id - - } # if source_id - - if(defined($error_count{$source_id})){ - $error_count{$source_id}++; - if($error_count{$source_id} < 6){ - $error_example{$source_id} .= ", $ensembl_stable_id - $accession"; - } - } - else{ - $error_count{$source_id} = 1; - $error_example{$source_id} = "$ensembl_stable_id - $accession"; - } - - - } - } - - foreach my $key (keys %error_count){ - print STDERR "Problems with ".$error_count{$key}." Direct Xrefs for source $key\n"; - print STDERR "\te.g. ".$error_example{$key}."\n"; - } - - close(OBJECT_XREF); - close(XREF); - close(GO_XREF); - $xref_sth->finish(); - - print " Wrote $count direct xrefs\n"; - return $object_xref_id; -} - - -# Dump the interpro table from the xref database -sub dump_interpro { - my $self = shift; - my $xref_id_offset = shift; - my $oxref_id_offset = shift; - - print "Writing InterPro\n"; - my( $ipro_count, $xref_count, $oxref_count, $goxref_count ) = (0,0,0,0); - - open (INTERPRO, ">" . $self->core->dir() . "/interpro.txt"); - open (XREF, ">>" . $self->core->dir() . "/xref.txt"); - open (OBJECT_XREF, ">>" . $self->core->dir() . "/object_xref.txt"); - open (GO_XREF, ">>" . $self->core->dir() . "/go_xref.txt"); - - # Get a mapping of protein domains to ensembl translations for - # interpro dependent xrefs - my $core_sql = "SELECT hit_name, translation_id FROM protein_feature" ; - my $core_sth = $self->core->dbc->prepare($core_sql); - $core_sth->execute(); - my %domain_to_translation = (); - my ($domain, $translation); - $core_sth->bind_columns(\$domain, \$translation); - while ($core_sth->fetch()) { - $domain_to_translation{$domain} ||= []; - push @{$domain_to_translation{$domain}}, $translation; - } - - # Get a list of interpro data, including dependent xrefs if avail - my $sth = $self->xref->dbc->prepare(" - SELECT ip.interpro, ip.pfam, x2.xref_id, x2.source_id, - x2.accession, x2.version, x2.label, x2.description, - dx.linkage_annotation - FROM interpro ip, xref x - LEFT JOIN dependent_xref dx ON x.xref_id=dx.master_xref_id - LEFT JOIN xref x2 ON dx.dependent_xref_id=x2.xref_id - WHERE ip.interpro = x.accession"); - my $rv = $sth->execute(); - my %interpro_cache; - my %xref_cache; - my %oxref_cache; - my %goxref_cache; - while( my $row = $sth->fetchrow_arrayref() ){ - my ( $interpro, $pfam, $dx_xref_id, $dx_source_id, $dx_accession, - $dx_version, $dx_label, $dx_description, $go_linkage ) = @$row; - unless( $interpro_cache{$interpro.$pfam} ){ - # We have a fresh interpro. - # Note; interpro xrefs themselves are handled by dump_orphan_xrefs - print INTERPRO $interpro . "\t" . $pfam . "\n"; - $interpro_cache{$interpro.$pfam} ++; - $ipro_count++; - } - if( $dx_accession ){ - # We have a dependent xref for this interpro... - my $xref_id; - unless( $xref_id = $xref_cache{$dx_accession} ){ - $xref_id = $dx_xref_id + $xref_id_offset; - $xref_cache{$dx_accession} = $xref_id; - printf XREF ("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", - $xref_id, - $source_to_external_db{$dx_source_id}, - $dx_accession, - $dx_label, - $dx_version, - $dx_description || "", - 'DEPENDENT', - "Generated via $interpro"); - $xref_count++; - } - foreach my $ensembl_id( @{$domain_to_translation{$pfam}||[]} ){ - #...And the interpro domain maps to a translation - my $oxref_id; - unless( $oxref_id = $oxref_cache{$dx_accession.$ensembl_id} ){ - $oxref_id = $oxref_count + 1 + $oxref_id_offset; - $oxref_cache{$dx_accession.$ensembl_id} = $oxref_id; - - # If this was loaded already via the normal go system ignore as already processed. - if(defined($go_done{$ensembl_id."|".'Translation'."|".$xref_id})) { - next; - } - - printf OBJECT_XREF ( "%s\t%s\t%s\t%s\t\\N\n", - $oxref_id, - $ensembl_id, - 'Translation', - $xref_id ); - $oxref_count ++; - if( $go_linkage ){ - #...And we have linkage data, indicating a GO sref - unless( $goxref_cache{$oxref_id.$go_linkage} ){ - $goxref_cache{$oxref_id.$go_linkage} ++; - printf GO_XREF ( "%s\t%s\t\\N\n", - $oxref_id, - $go_linkage ); - $goxref_count ++; - } - } - } - } - } - } - $sth->finish(); - - close (INTERPRO); - close (XREF); - close (OBJECT_XREF); - close (GO_XREF); - - print(" Wrote $ipro_count interpro table entries\n"); - print(" Wrote $xref_count interpro-dependent xrefs \n"); - print(" including $oxref_count object xrefs, \n"); - print(" and $goxref_count go xrefs\n"); - - return $oxref_id_offset + $oxref_count; -} - - -sub build_stable_id_to_internal_id_hash { - - my ($self) = @_; - - foreach my $type ('gene', 'transcript', 'translation') { # Add exon here if required - - my $core_sql = "SELECT ${type}_id, stable_id FROM ${type}_stable_id" ; - my $sth = $self->core->dbc->prepare($core_sql); - $sth->execute(); - my ($internal_id, $stable_id); - $sth->bind_columns(\$internal_id, \$stable_id); - - while ($sth->fetch) { - - $stable_id_to_internal_id{$type}{$stable_id} = $internal_id; - $internal_id_to_stable_id{$type}{$internal_id} = $stable_id; - } - - } - - return 1; - -} - -sub get_ensembl_object_type { - - my $filename = shift; - my $type; - - $filename = basename($filename); - - if ($filename =~ /_dna_/i) { - - $type = "Transcript"; - - } elsif ($filename =~ /_peptide_/i) { - - $type = "Translation"; - - } else { - - print STDERR "Cannot deduce Ensembl object type from filename $filename\n"; - } - - return $type; - -} - -sub get_parts { - - my $filename = shift; - - $filename = basename($filename); - - my ($method,$type,$part) = $filename =~ /^(.*)_(dna|peptide)_(\d+)\.map/; - if ($type eq "dna" ) { - - $type = "Transcript"; - - } elsif ($type eq "peptide") { - - $type = "Translation"; - - } else { - - print STDERR "Cannot deduce Ensembl object type from filename $filename\n"; - } - - return ($method, $type, $part); - -} - -sub get_method { - - my $filename = shift; - - $filename = basename($filename); - - my ($method) = $filename =~ /^(.*)_(dna|peptide)_\d+\.map/; - - return $method; - -} - -sub get_analysis_id { - - my ($self, $ensembl_type) = @_; - - my %typeToLogicName = ( 'transcript' => 'XrefExonerateDNA', - 'translation' => 'XrefExonerateProtein' ); - - my $logic_name = $typeToLogicName{lc($ensembl_type)}; - - my $sth = $self->core->dbc->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'"); - $sth->execute(); - - my $analysis_id; - - if (my @row = $sth->fetchrow_array()) { - - $analysis_id = $row[0]; -# print "Found exising analysis ID ($analysis_id) for $logic_name\n"; - - } else { - - print "No analysis with logic_name $logic_name found, creating ...\n"; - $sth = $self->core->dbc->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "', NOW())"); - # TODO - other fields in analysis table - $sth->execute(); - $analysis_id = $sth->{'mysql_insertid'}; - print "Done (analysis ID=" . $analysis_id. ")\n"; - - } - $sth->finish(); - - return $analysis_id; - -} - -sub remove_all_old_output_files{ - my ($self) =@_; - - my $dir = $self->core->dir(); - - print "Deleting txt and sql files from output dir: $dir\n"; - unlink(<$dir/*.txt $dir/*.sql>); - $self->cleanup_projections_file(); -} - -sub dump_core_xrefs { - - my ($self, $xref_ids_hashref, $start_object_xref_id, $xref_id_offset, $object_xref_id_offset) = @_; - - my @xref_ids = keys %$xref_ids_hashref; - my %xref_to_objects = %$xref_ids_hashref; - - - $self->create_source_id_to_source_name(); - my $dir = $self->core->dir(); - - open (XREF, ">$dir/xref.txt"); - open (OBJECT_XREF, ">>$dir/object_xref.txt"); - open (EXTERNAL_SYNONYM, ">$dir/external_synonym.txt"); - open (GO_XREF, ">>$dir/go_xref.txt"); - open (IDENTITY_XREF, ">>$dir/identity_xref_temp.txt"); - - # Cache synonyms for later use - # Do one big query to get a list of all the synonyms; note each xref may have - # more than one synonym so they are stored in a hash of lists - - my $syn_count = 0; - my %synonyms; - my $syn_sth = $self->xref->dbc->prepare("SELECT xref_id, synonym FROM synonym"); - $syn_sth->execute(); - - my ($sxref_id, $synonym); - $syn_sth->bind_columns(\$sxref_id, \$synonym); - while ($syn_sth->fetch()) { - - if(!defined($priority_xref_source_id{$sxref_id})){ - push @{$synonyms{$sxref_id}}, $synonym; - } - } - - # keep a unique list of source IDs to build the external_db table later - my %source_ids; - - my $object_xref_id = $start_object_xref_id; - - # build cache of source id -> external_db id; note %source_to_external_db is global - %source_to_external_db = $self->map_source_to_external_db(); - - # execute several queries with a max of 200 entries in each IN clause - more efficient - my $batch_size = 200; - - # keep track of what xref_id & object_xref_ids have been written to prevent - # duplicates; e.g. several dependent xrefs may be dependent on the same master xref. - # Note %xrefs_written and %object_xrefs_written are global - - while(@xref_ids) { - - my @ids; - if($#xref_ids > $batch_size) { - @ids = splice(@xref_ids, 0, $batch_size); - } else { - @ids = splice(@xref_ids, 0); - } - - my $id_str; - if(@ids > 1) { - $id_str = "IN (" . join(',', @ids). ")"; - } else { - $id_str = "= " . $ids[0]; - } - - - my $sql = "SELECT * FROM xref WHERE xref_id $id_str"; - my $xref_sth = $self->xref->dbc->prepare($sql); - $xref_sth->execute(); - - my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id, $master_xref_id, $linkage_annotation); - $xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id); - - # note the xref_id we write to the file is NOT the one we've just read - # from the internal xref database as the ID may already exist in the - # core database so we add on $xref_id_offset - while ($xref_sth->fetch()) { - - if(!defined($accession) or $accession eq ""){ - print " No accession for $xref_id, $accession, $label, $source_id\t3\n"; - } - $XXXxref_id_to_accession{$xref_id} = $accession; - $XXXxref_id_to_source_id{$xref_id} = $source_id; - # make sure label is set to /something/ so that the website displays something - $label = $accession if (!$label); - - if (!$xrefs_written{$xref_id}) { - my $external_db_id = $source_to_external_db{$source_id}; - if ($external_db_id) { # skip "unknown" sources - if(!defined($updated_source{$external_db_id})){ - $self->cleanup_sources_file($external_db_id); - } - - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" - . "SEQUENCE_MATCH" . "\t" - . "Relationship generated from exonerate mapping\n"; - - $xrefs_written{$xref_id} = 1; - $source_ids{$source_id} = $source_id; - } - } - } - - # Now get the dependent xrefs for each of these xrefs and write them as well - # Store the go_linkage_annotations as we go along (need for dumping go_xref) - my %go_source = %{get_source_hash_from_source_name($self->xref, "GO")}; - - $sql = "SELECT DISTINCT(x.xref_id), dx.master_xref_id, x.accession, x.label, x.description, x.source_id, x.version, dx.linkage_annotation FROM dependent_xref dx, xref x WHERE x.xref_id=dx.dependent_xref_id AND master_xref_id $id_str"; - - my $dep_sth = $self->xref->dbc->prepare($sql); - $dep_sth->execute(); - - $dep_sth->bind_columns(\$xref_id, \$master_xref_id, \$accession, \$label, \$description, \$source_id, \$version, \$linkage_annotation); - - - while ($dep_sth->fetch()) { # dependent xrefs - - my $external_db_id = $source_to_external_db{$source_id}; - next if (!$external_db_id); - - - $label = $accession if (!$label); - my $master_accession = $XXXxref_id_to_accession{$master_xref_id}; -# my $master_source_name = $xref_source_id_to_name{$source_id}; - my $master_source_name = $xref_source_id_to_name{$XXXxref_id_to_source_id{$master_xref_id}}; - -#IANL need to add source here to be able to add this to object_xref; - - if(!defined($master_accession) or $master_accession eq ""){ - print "(dump_core_xrefs) No master_acc for master xref $master_xref_id,".($master_xref_id+$xref_id_offset)." for $accession ($xref_id)\n"; - } - if(!defined($master_source_name) or $master_source_name eq ""){ - print "(dump_core_xrefs) No master_source_name for master xref $master_xref_id,".($master_xref_id+$xref_id_offset)." for $accession ($xref_id)\n"; - } - - if (!$xrefs_written{$xref_id}) { - if(!defined($updated_source{$external_db_id})){ - $self->cleanup_sources_file($external_db_id); - } - - if(!defined($priority_xref_source_id{$xref_id})){ - - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" - . "DEPENDENT" . "\t" - . "Generated via " - . $master_accession . "\n"; - - } - $xrefs_written{$xref_id} = 1; - $source_ids{$source_id} = $source_id; - } - - # create an object_xref linking this (dependent) xref with any objects it maps to - # write to file and add to object_xref_mappings - if (defined $xref_to_objects{$master_xref_id}) { - my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} ); - foreach my $object_id_key (@ensembl_object_ids) { - my ($object_id, $type) = split /\|/, $object_id_key; - my $full_key = $type."|".$object_id."|".$xref_id; - if(defined($priority_xref_source_id{$xref_id})){ - my $key = $priority_source_id_to_name{$source_id}.":".$priority_xref_acc{$xref_id}; - if(!defined($priority_xref_priority{$key})){ - - $priority_xref_extra_bit{$xref_id} = "\t" . "DEPENDENT" . "\t" . "Generated via $master_accession\n"; - $priority_xref{$key} = $xref_id; - $priority_xref_priority{$key} = $priority_source_id{$source_id}; - $priority_object_xref{$key} = "$type:$object_id"; - $priority_identity_xref{$key} = undef; - $priority_identity_temp_xref{$key} = $primary_identity{$master_xref_id}{$object_id."|".$type}; ## $primary_xref_idsmasters identity stuff; - $priority_xref_state{$key} = "dependent"; - next; - } - if($priority_xref_priority{$key} - > $priority_source_id{$source_id}){ - - $priority_xref_extra_bit{$xref_id} = "\t" . "DEPENDENT" . "\t" . "Generated via $master_accession\n"; - $priority_xref{$key} = $xref_id; - $priority_xref_priority{$key} = $priority_source_id{$source_id}; - $priority_object_xref{$key} = "$type:$object_id"; - $priority_identity_xref{$key} = undef; - $priority_identity_temp_xref{$key} = $primary_identity{$master_xref_id}{$object_id."|".$type}; ## $primary_xref_idsmasters identity stuff; - $priority_xref_state{$key} = "dependent"; - next; - } - next; - } - if (!$object_xrefs_written{$full_key}) { - - $object_succesfully_mapped{$xref_id} = 1; - - print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "\tFROM:".$master_source_name.":".$master_accession."\n"; - print IDENTITY_XREF $object_xref_id."\t".$primary_identity{$master_xref_id}{$object_id."|".$type}; - - # Add this mapping to the list - note NON-OFFSET xref_id is used - my $key = $type . "|" . $object_id; - $object_xrefs_written{$full_key} = 1; - - # write a go_xref with the appropriate linkage type - print GO_XREF $object_xref_id . "\t" . $linkage_annotation . "\t\\N\n" if (defined($go_source{$source_id})); - $go_done{$object_id."|".$type."|" . ($xref_id+$xref_id_offset)} = 1; - my $master_accession = $XXXxref_id_to_accession{$master_xref_id}; - - # Also store *parent's* query/target identity for dependent xrefs - - $object_xref_id++; - - } - } - } - }# end of dependents - - #print "source_ids: " . join(" ", keys(%source_ids)) . "\n"; - - } # while @xref_ids - - # Dump any synonyms for xrefs we've written - # Now write the synonyms we want to the file - foreach my $xref_id (keys %synonyms) { - foreach my $syn (@{$synonyms{$xref_id}}) { - print EXTERNAL_SYNONYM ($xref_id+$xref_id_offset) . "\t" . $syn . "\n"; - $syn_count++; - } - } - - print "Wrote $syn_count synonyms\n"; - - close(XREF); - close(OBJECT_XREF); - close(EXTERNAL_SYNONYM); - close(GO_XREF); - close(IDENTITY_XREF); - - return $object_xref_id; - -} - - - -sub build_transcript_and_gene_display_xrefs { - # Called by $self->genes_and_transcripts_attributes_set - my ($self) = @_; - my $dir = $self->core->dir(); - - print "Building Transcript and Gene display_xrefs\n"; - - # Open file handles to recieve SQL and text data used to set - # display_xrefs - my $gene_dx_file = "$dir/gene_display_xref.sql"; - my $tran_dx_file = "$dir/transcript_display_xref.sql"; - my $unset_gene_dx_file = "$dir/gene_unset_display_xref.sql"; - my $unset_tran_dx_file = "$dir/transcript_unset_display_xref.sql"; - - open (GENE_DX, ">$gene_dx_file") - or die( "Could not open $gene_dx_file: $!" ); - open (TRANSCRIPT_DX, ">$tran_dx_file") - or die( "Could not open $tran_dx_file: $!" ); - open (GENE_DX_UNSET, ">$unset_gene_dx_file") - or die( "Could not open $unset_gene_dx_file: $!" ); - open (TRAN_DX_UNSET, ">$unset_tran_dx_file") - or die( "Could not open $unset_tran_dx_file: $!" ); - open (GENE_DX_TXT, ">$dir/gene_display_xref.txt"); - open (TRANSCRIPT_DX_TXT, ">$dir/transcript_display_xref.txt"); - - # These are the files that this method will return - my @files = ($unset_gene_dx_file,$gene_dx_file, - $unset_tran_dx_file,$tran_dx_file); - - # Write the 'unset' sql to the files, and cose them - print GENE_DX_UNSET qq(UPDATE gene SET display_xref_id=NULL;\n); - print TRAN_DX_UNSET qq(UPDATE transcript SET display_xref_id=NULL;\n); - close( GENE_DX_UNSET ); - close( TRAN_DX_UNSET ); - - my %external_name_to_id; - my %ex_db_id_to_status; - my $sql1 = "SELECT external_db_id, db_name, status from external_db"; - - my $sth1 = $self->core->dbc->prepare($sql1) || die "prepare failed for $sql1\n"; - $sth1->execute() || die "execute failed"; - my ($db_id, $name, $status); - $sth1->bind_columns(\$db_id, \$name, \$status); - while($sth1->fetch()){ - $external_name_to_id{$name} = $db_id; - $ex_db_id_to_status{$db_id} = $status; - } - $sth1->finish; - - - ############################# - #create the tempory table - ############################# - - my $sql_t = "create table identity_xref_temp like identity_xref"; - my $sth = $self->core->dbc->prepare( $sql_t ); - - print "creating table identity_xref_temp\n"; - - $sth->execute() || die( "Could not \n$sql_t\n", $sth->errstr ); - - - ############################# - #populate the tempory table - ############################# - my $file = $dir."/identity_xref_temp.txt"; - - if(-s $file){ - my $sth = $self->core->dbc->prepare(" -LOAD DATA LOCAL INFILE \'$file\' -IGNORE INTO TABLE identity_xref_temp"); - print "Uploading data in $file to identity_xref_temp\n"; - $sth->execute(); - } - else{ - print( "NO file or zero size file, so not able to load ", - "file $file to identity_xref_temp\n" ); - } - - # - # create a prioritised list of sources to use - # and also a list of those xrefs to ignore - # where the source name is the key and the value is the string to test for - # - my ($presedence, $ignore) = @{$self->transcript_display_xref_sources()}; - my $i=0; - my %level; - print "precedense in reverse order:-\n"; - foreach my $ord (reverse (@$presedence)){ - $i++; - if(!defined($external_name_to_id{$ord})){ - print STDERR "unknown external database name *$ord* being used\n"; - } - $level{$external_name_to_id{$ord}} = $i; - print "\t".$ord."\t$i\n"; - } - - # Generate genes_to_transcripts and translation_to_transcript mappings - # if needed - if(!scalar(keys %genes_to_transcripts)){ - $self->build_genes_to_transcripts(); - } - if(!scalar(keys %translation_to_transcript)){ - $self->load_translation_to_transcript(); - } - - # Gets object and identity xref data for 'SEQUENCE_MATCH' xrefs - # for a given db_id of a given object type (gene/transcript/translation) - my $sql = (<<ESQL); - SELECT ox.xref_id, ix.xref_identity, ix.ensembl_identity, - x.external_db_id, x.display_label, - e.db_name, ox.linkage_annotation - FROM (object_xref ox, xref x, external_db e) - LEFT JOIN identity_xref ix - ON (ox.object_xref_id = ix.object_xref_id) - WHERE x.xref_id = ox.xref_id - AND ox.ensembl_object_type = ? - AND ox.ensembl_id = ? - AND x.info_type = 'SEQUENCE_MATCH' - AND e.external_db_id = x.external_db_id -ESQL - my $primary_sth = $self->core->dbc->prepare($sql) - || die "prepare failed for $sql\n"; - - - # Gets object and identity xref data for 'DEPENDENT' xrefs - # for a given db_id of a given object type (gene/transcript/translation) - $sql = (<<ZSQL); - SELECT ox.xref_id, ix.xref_identity, ix.ensembl_identity, - x.external_db_id, x.display_label, - e.db_name, ox.linkage_annotation - FROM (object_xref ox, xref x, external_db e) - LEFT JOIN identity_xref_temp ix - ON (ox.object_xref_id = ix.object_xref_id) - WHERE x.xref_id = ox.xref_id and ox.ensembl_object_type = ? - AND ox.ensembl_id = ? and x.info_type = 'DEPENDENT' - AND e.external_db_id = x.external_db_id -ZSQL - my $dependent_sth = $self->core->dbc->prepare($sql) - || die "prepare failed for $sql\n"; - - - # Gets object xref data for 'DIRECT' xrefs - # for a given db_id of a given object type (gene/transcript/translation) - $sql = (<<QSQL); - SELECT x.xref_id, x.external_db_id, x.display_label, - e.db_name, o.linkage_annotation - FROM object_xref o, xref x, external_db e - WHERE x.xref_id = o.xref_id - AND o.ensembl_object_type = ? - AND o.ensembl_id = ? - AND (x.info_type = 'DIRECT' or x.info_type is NULL) - AND e.external_db_id = x.external_db_id -QSQL - - my $direct_sth = $self->core->dbc->prepare($sql) - || die "prepare failed for $sql\n"; - - # Gets object xrefs data for any type of xref - # for a given gene id. - $sql = (<<GSQL); - SELECT x.xref_id, x.external_db_id, e.db_name, o.linkage_annotation - FROM object_xref o, xref x, external_db e - WHERE x.xref_id = o.xref_id - AND o.ensembl_object_type = 'Gene' - AND o.ensembl_id = ? - AND e.external_db_id = x.external_db_id -GSQL - - my $gene_sth = $self->core->dbc->prepare($sql) - || die "prepare failed for $sql\n"; - - my $count =0; - - my ($xref_id, $qid, $tid, $ex_db_id, $display_label, $external_db_name, $linkage_annotation); - - # Loop for each gene_id - foreach my $gene_id (keys %genes_to_transcripts) { - my %percent_id; - my %level_db; - my %parent; - my %percent_id_via_acc; - my @gene_xrefs = (); - - # Query for object_xrefs attached directly o gene - $gene_sth->execute($gene_id) || die "execute failed"; - $gene_sth->bind_columns - (\$xref_id, \$ex_db_id, \$external_db_name, \$linkage_annotation); - - my $best_gene_xref = 0; # store xref - my $best_gene_level = 0; # store level - my $best_gene_percent = 0; # additoon of precentage ids - - while($gene_sth->fetch()){ - - # Skip certain hard-coded external_db.names. - if(defined($$ignore{$external_db_name})){ - if($linkage_annotation =~ /$$ignore{$external_db_name}/){ - #print( "Ignoring $xref_id as linkage_annotation has ", - # $$ignore{$external_db_name}." in it." ); # DEBUG - - next; - } - } - - # - if(defined($level{$ex_db_id})){ - if($level{$ex_db_id} > $best_gene_level){ - $best_gene_xref = $xref_id; - $best_gene_level = $level{$ex_db_id}; - } - } - } - - - my @transcripts = @{$genes_to_transcripts{$gene_id}}; - foreach my $transcript_id (@transcripts) { - - my @transcript_xrefs = (); - - foreach my $type ("Transcript", "Translation"){ - my $ens_id; - if($type eq "Transcript"){ - $ens_id = $transcript_id; - } - else{ - if(defined($transcript_to_translation{$transcript_id})){ - $ens_id=$transcript_to_translation{$transcript_id}; - } - else{ - next; - } - } - $primary_sth->execute($type, $ens_id ) || die "execute failed"; - $primary_sth->bind_columns(\$xref_id, \$qid, \$tid, \$ex_db_id, - \$display_label, \$external_db_name, - \$linkage_annotation); - while($primary_sth->fetch()){ - if($level{$ex_db_id} and $display_label =~ /\D+/ ){ #correct level and label is not just a number - if(defined($$ignore{$external_db_name})){ - if($linkage_annotation =~ /$$ignore{$external_db_name}/){ -# print "Ignoring $xref_id as linkage_annotation has ".$$ignore{$external_db_name}." in it. DELETE THIS MESSAGE AFTER TESTING\n"; - next; - } - } - - push @transcript_xrefs, $xref_id; - if(!defined($qid) || !defined($tid)){ - print "PRIMARY $xref_id\n"; - $percent_id{$xref_id} = 0; - } - else{ - $percent_id{$xref_id} = $qid + $tid; - } - - $level_db{$xref_id} = $level{$ex_db_id}; - } - } - - $dependent_sth->execute($type, $ens_id ) || die "execute failed"; - $dependent_sth->bind_columns(\$xref_id, \$qid, \$tid, \$ex_db_id, - \$display_label, \$external_db_name, - \$linkage_annotation); - while($dependent_sth->fetch()){ - if($level{$ex_db_id} and $display_label =~ /\D+/){ - if( defined($$ignore{$external_db_name}) and defined($linkage_annotation) ){ - if($linkage_annotation =~ /$$ignore{$external_db_name}/){ -# print "Ignoring $xref_id as linkage_annotation has ".$$ignore{$external_db_name}." in it. DELETE THIS MESSAGE AFTER TESTING\n"; - next; - } - } - push @transcript_xrefs, $xref_id; - if(!defined($qid) || !defined($tid)){ -# print "DEPENDENT $xref_id\n" if($ex_db_id != 1100); #HGNC has added one with no %ids. - $percent_id{$xref_id} = 0; - } - else{ - $percent_id{$xref_id} = $qid + $tid; - } - $level_db{$xref_id} = $level{$ex_db_id}; - } - } - - $direct_sth->execute($type, $ens_id ) || die "execute failed"; - $direct_sth->bind_columns(\$xref_id, \$ex_db_id, \$display_label, - \$external_db_name, \$linkage_annotation); - while($direct_sth->fetch()){ - if($level{$ex_db_id} and $display_label =~ /\D+/){ - if(defined($$ignore{$external_db_name})){ - if($linkage_annotation =~ /$$ignore{$external_db_name}/){ -# print "Ignoring $xref_id as linkage_annotation has ".$$ignore{$external_db_name}." in it. DELETE THIS MESSAGE AFTER TESTING\n"; - next; - } - } - push @transcript_xrefs, $xref_id; - $percent_id{$xref_id} = 0; - $level_db{$xref_id} = $level{$ex_db_id}; - } - } - - } - - my $best_tran_xref = 0; # store xref - my $best_tran_level = 0; # store level - my $best_tran_percent = 0; # store best %id total - - foreach my $xref_id (@transcript_xrefs) { - if(defined($level_db{$xref_id}) and $level_db{$xref_id}){ - if($level_db{$xref_id} < $best_tran_level){ - next; - } - - if($level_db{$xref_id} == $best_tran_level){ - if($percent_id{$xref_id} < $best_tran_percent){ - next; - } - } - $best_tran_percent = $percent_id{$xref_id}; - $best_tran_level = $level_db{$xref_id}; - $best_tran_xref = $xref_id; - } - } - - if($best_tran_xref){ - print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" .$best_tran_xref. - " WHERE transcript_id=" . $transcript_id . ";\n"; - print TRANSCRIPT_DX_TXT $best_tran_xref. "\t" . $transcript_id . "\n"; - } - - if($best_tran_level < $best_gene_level){ - next; - } - if($best_tran_level == $best_gene_level){ - if($best_tran_percent < $best_gene_percent){ - next; - } - } - - $best_gene_percent = $best_tran_percent; - $best_gene_level = $best_tran_level; - $best_gene_xref = $best_tran_xref; - } - - if($best_gene_xref){ - print GENE_DX "UPDATE gene g SET g.display_xref_id=" . $best_gene_xref . - " WHERE g.gene_id=" . $gene_id . ";\n"; - print GENE_DX_TXT $best_gene_xref . "\t" . $gene_id ."\n"; - } - } - close TRANSCRIPT_DX; - close TRANSCRIPT_DX_TXT; - close GENE_DX; - close GENE_DX_TXT; - - # Done - return @files; - -} - - - - -# Display xref sources to be used for transcripts *in order of priority* -# Source names used must be identical to those in the source table. - -sub transcript_display_xref_sources { - my @list = qw(miRBase - RFAM - HGNC_curated_gene - HGNC_automatic_gene - MGI_curated_gene - MGI_automatic_gene - Clone_based_vega_gene - Clone_based_ensembl_gene - HGNC_curated_transcript - HGNC_automatic_transcript - MGI_curated_transcript - MGI_automatic_transcript - Clone_based_vega_transcript - Clone_based_ensembl_transcript - IMGT/GENE_DB - HGNC - SGD - MGI - flybase_symbol - Anopheles_symbol - Genoscope_annotated_gene - Uniprot/SWISSPROT - Uniprot/Varsplic - RefSeq_peptide - RefSeq_dna - Uniprot/SPTREMBL - EntrezGene - IPI); - - my %ignore; - $ignore{"EntrezGene"}= 'FROM:RefSeq_[pd][en][pa].*_predicted'; - - return [\@list,\%ignore]; - -} - -# Get transcripts associated with each gene - -sub build_genes_to_transcripts { - - my ($self) = @_; - - my $sql = "SELECT gene_id, transcript_id, seq_region_start, seq_region_end FROM transcript"; - my $sth = $self->core->dbc->prepare($sql); - $sth->execute(); - - my ($gene_id, $transcript_id, $start, $end); - $sth->bind_columns(\$gene_id, \$transcript_id, \$start, \$end); - - # Note %genes_to_transcripts is global - while ($sth->fetch()) { - push @{$genes_to_transcripts{$gene_id}}, $transcript_id; - $transcript_length{$transcript_id} = $end- $start; - } - -} - -# Find the index of an item in a list(ref), or -1 if it's not in the list. -# Only look for exact matches (case insensitive) - -sub find_in_list { - - my ($item, @list) = @_; - - for (my $i = 0; $i < scalar(@list); $i++) { - if (lc($list[$i]) eq lc($item)) { - return $i; - } - } - - return -1; - -} - -# Take a string and a list of regular expressions -# Find the index of the highest matching regular expression -# Return the index, or -1 if not found. - -sub find_match { - - my ($str, @list) = @_; - - my $str2 = $str; - my $highest_index = -1; - - for (my $i = 0; $i < scalar(@list); $i++) { - my $re = $list[$i]; - if ($str2 =~ /$re/i) { - $highest_index = $i; - } - } - - return $highest_index; - -} - -# Build a map of source id (in xref database) to external_db (in core database) - -sub map_source_to_external_db { - - my $self = shift; - - my %source_to_external_db; - - # get all sources - my $sth = $self->xref->dbc->prepare("select s.source_id, s.name, s.source_release, count(*) from xref x, source s where x.source_id = s.source_id group by source_id"); - $sth->execute(); - my ($source_id, $source_name, $source_release, $count); - $sth->bind_columns(\$source_id, \$source_name, \$source_release, \$count); - - while($sth->fetchrow_array()) { - - # find appropriate external_db_id for each one - my $sql = "SELECT external_db_id FROM external_db WHERE db_name=?"; - my $core_sth = $self->core->dbc->prepare($sql); - $core_sth->execute($source_name); - - my @row = $core_sth->fetchrow_array(); - - if (@row) { - - $source_to_external_db{$source_id} = $row[0]; - if($source_release ne "1"){ - if($source_release =~ /RefSeq/){ - $external_db_release{$row[0]} = substr($source_release,-255); # At the moment max is 255 - } - else{ - $external_db_release{$row[0]} = substr($source_release,0,255); # At the moment max is 255 - } - -# print "Source name $source_name id $source_id corresponds to core external_db_id " . $row[0] . "and release is *".$external_db_release{$row[0]}."*\n"; - } - } else { - print STDERR "Can't find external_db entry for source name $source_name; xrefs for this source will not be written. Consider adding $source_name to external_db and \n" - . " Make sure that you used TABS not spaces as delimiters in external_db.txt\n" ; - } - - } # while source - - return %source_to_external_db; -} - - -sub cleanup_projections_file{ - my $self = shift; - - my $dir = $self->core->dir(); - open (DEL, ">>$dir/cleanup.sql") || die "Could not open $dir/cleanup.sql\n"; - - print DEL "DELETE es "; - print DEL "FROM xref x, external_synonym es "; - print DEL "WHERE x.xref_id = es.xref_id and x.info_type = 'PROJECTION'\n"; - - print DEL "DELETE object_xref "; - print DEL "FROM object_xref, xref "; - print DEL "WHERE object_xref.xref_id = xref.xref_id "; - print DEL "AND xref.info_type = 'PROJECTION'\n"; - - print DEL "DELETE xref "; - print DEL "FROM xref "; - print DEL "WHERE xref.info_type = 'PROJECTION'\n"; - - close DEL; -} - - -sub species_specific_cleanup{ -} - -sub cleanup_sources_file{ - my ($self,$id) = @_; - - $updated_source{$id} =1; - - my $dir = $self->core->dir(); - open (DEL, ">>$dir/cleanup.sql") || die "Could not open $dir/cleanup.sql\n"; - - if ($id =~ m/\w/){ - - if(defined($external_db_release{$id})){ - print DEL "UPDATE external_db "; - print DEL 'SET db_release = "'.$external_db_release{$id}; - print DEL '" WHERE external_db_id = '.$id."\n"; - } - - - print DEL "DELETE external_synonym "; - print DEL "FROM external_synonym, xref "; - print DEL "WHERE external_synonym.xref_id = xref.xref_id "; - print DEL "AND xref.external_db_id = $id\n"; - - print DEL "DELETE gx "; - print DEL "FROM xref x, object_xref ox LEFT JOIN go_xref gx "; - print DEL "ON ox.object_xref_id = gx.object_xref_id "; - print DEL "WHERE x.xref_id = ox.xref_id "; - print DEL "AND x.external_db_id = $id "; - print DEL "AND gx.linkage_type is not null\n"; - - print DEL "DELETE identity_xref "; - print DEL "FROM identity_xref, object_xref, xref "; - print DEL "WHERE identity_xref.object_xref_id = object_xref.object_xref_id "; - print DEL "AND object_xref.xref_id = xref.xref_id "; - print DEL "AND xref.external_db_id = $id \n"; - - - print DEL "DELETE object_xref "; - print DEL "FROM object_xref, xref "; - print DEL "WHERE object_xref.xref_id = xref.xref_id "; - print DEL "AND xref.external_db_id = $id\n"; - - - print DEL "DELETE FROM xref WHERE xref.external_db_id = $id \n"; - - print DEL "DELETE FROM unmapped_object WHERE type='xref' and external_db_id = $id \n"; - - } else { - warn("\nFound an empty id\n"); - } - close DEL; - -} - -# Upload .txt files and execute .sql files. - -sub do_upload { - - my ($self) = @_; - - my $ensembl = $self->core; - my $core_db = $ensembl->dbc; - # xref.txt etc - my $file = $ensembl->dir() . "/cleanup.sql"; - - print "Deleting existing data - using file $file\n"; - open(CLEAN,"<$file") || die "could not open $file for reading \n"; - while (<CLEAN>){ - chomp; - my $sth = $core_db->prepare($_); - $sth->execute() or die "Couldn't execute statement: " . $sth->errstr; - } - close CLEAN; - - $self->species_specific_cleanup(); - - foreach my $table ("go_xref", "interpro") { - my $file = $ensembl->dir() . "/" . $table . ".txt"; - - if(-s $file){ - my $sth = $core_db->prepare("DELETE FROM $table"); - print "Deleting existing data in $table\n"; - $sth->execute(); - } - } - - print "Uploading new data\n"; - foreach my $table ("xref", "object_xref", "identity_xref", "external_synonym", - "go_xref", "interpro") { - - my $file = $ensembl->dir() . "/" . $table . ".txt"; - - if(-s $file){ - my $sth = $core_db->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE $table"); - print "Uploading data in $file to $table\n"; - $sth->execute(); - } - else{ - print "NO file or zero size file, so not able to load file $file to $table\n"; - } - } -} - -sub genes_and_transcripts_attributes_set{ - # Runs build_transcript_and_gene_display_xrefs, - # new_build_gene_descriptions, build_gene_transcript_status and - # build_meta_timestamp, and, if "-upload" is set, uses the SQL files - # produced to update the core database. - - my ($self) = @_; - - my $ensembl = $self->core; - my $core_db = $ensembl->dbc; - - # Create the sql files needed to update the database - my @displayxref_files = $self->build_transcript_and_gene_display_xrefs(); - my @description_files = $self->new_build_gene_descriptions(); - my @status_files = $self->build_gene_transcript_status(); - my @timestamp_files = $self->build_meta_timestamp; - - unless( $self->upload ){ - print "[INFO] created the following files " - ."(load by hand or use the --upload flag)\n " - .join( "\n ", - @displayxref_files, - @description_files, - @status_files, - @timestamp_files ) - ."\n"; - - return; - } - - # Get the command needed to run mysql on the shell - my $mysql_command = $self->get_mysql_command($core_db); - - # Load the display_xrefs - print "Setting gene and transcript display_xrefs\n"; - foreach my $file( @displayxref_files ){ - system( "$mysql_command < $file" ) == 0 - or print( "ERROR: parsing $file in mysql\n" ); - } - - # Load the gene descriptions from the appropriate file - foreach my $file( @description_files ){ - system( "$mysql_command < $file" ) == 0 - or print( "ERROR: parsing $file in mysql\n" ); - } - - # update meta table with timestamp saying when xrefs were last updated - foreach my $file( @timestamp_files ){ - system( "$mysql_command < $file" ) == 0 - or print( "ERROR: parsing $file in mysql\n" ); - } - - # set gene and transcript statuses to KNOWN or NOVEL - print "Setting gene and transcript status from external_db KNOWN/NOVEL\n"; - foreach my $file( @status_files ){ - system( "$mysql_command < $file" ) == 0 - or print( "ERROR: parsing $file in mysql\n" ); - } - - return 1; -} - - - - -sub load_translation_to_transcript{ - my ($self) = @_; - - my $sth = $self->core->dbc->prepare("SELECT translation_id, transcript_id FROM translation"); - $sth->execute(); - - my ($translation_id, $transcript_id); - $sth->bind_columns(\$translation_id, \$transcript_id); - - while ($sth->fetch()) { - $translation_to_transcript{$translation_id} = $transcript_id; - $transcript_to_translation{$transcript_id} = $translation_id if ($translation_id); - } -} - - -sub new_build_gene_descriptions{ - my ($self) = @_; - - - print "Building gene Descriptions\n"; - - my $dir = $self->core->dir(); - - # Open the handles on files that will contain the sql - my @files = (); # Names of files returned from this method - my $gene_desc_file = "$dir/gene_description.sql"; - my $unset_gene_desc_file = "$dir/gene_unset_description.sql"; - open(GENE_DESCRIPTIONS,">$gene_desc_file") - || die "Could not open $gene_desc_file: $!"; - open(GENE_DESC_UNSET, ">$unset_gene_desc_file" ) - || die "Could not open $unset_gene_desc_file: $!"; - push @files, $unset_gene_desc_file, $gene_desc_file; - - # Write the file that unsets the descriptions - print GENE_DESC_UNSET qq(UPDATE gene g SET g.description=NULL;\n); - close(GENE_DESC_UNSET); - - my @regexps = $self->gene_description_filter_regexps(); - - if(scalar(@regexps) == 0){ - warn "no reg exps\n"; - } - my @presedence = $self->gene_description_sources(); - - - - if(!scalar(keys %translation_to_transcript)){ - $self->load_translation_to_transcript(); - } - - my %external_name_to_id; - my %ex_db_id_to_status; - my %ex_db_id_to_display_name; - my $sql1 = "SELECT external_db_id, db_name, status, db_display_name from external_db"; - - my $sth1 = $self->core->dbc->prepare($sql1) - || die "prepare failed for $sql1\n"; - $sth1->execute() || die "execute failed"; - my ($db_id, $name, $status, $display_name); - $sth1->bind_columns(\$db_id, \$name, \$status, \$display_name); - while($sth1->fetch()){ - $external_name_to_id{$name} = $db_id; - $ex_db_id_to_status{$db_id} = $status; - $ex_db_id_to_display_name{$db_id} = $display_name; - } - $sth1->finish; - - my $i=0; - my %level; - - foreach my $ord (reverse (@presedence)){ - $i++; - if(!defined($external_name_to_id{$ord})){ - print STDERR "Unknown external database name *$ord* being used\n"; - } - $level{$external_name_to_id{$ord}} = $i; - - } - - if(!scalar(keys %genes_to_transcripts)){ - $self->build_genes_to_transcripts(); - } - - - my $sql = (<<ESQL); - SELECT ox.xref_id, ix.xref_identity, ix.ensembl_identity, x.external_db_id, x.description, x.dbprimary_acc - FROM (object_xref ox, xref x) - LEFT JOIN identity_xref ix ON (ox.object_xref_id = ix.object_xref_id) - WHERE x.xref_id = ox.xref_id and ox.ensembl_object_type = ? - and ox.ensembl_id = ? and x.info_type = 'SEQUENCE_MATCH' -ESQL - - my $primary_sth = $self->core->dbc->prepare($sql) || die "prepare failed for $sql\n"; - - $sql = (<<ZSQL); - SELECT ox.xref_id, ix.xref_identity, ix.ensembl_identity, x.external_db_id, x.description, x.dbprimary_acc - FROM (object_xref ox, xref x) - LEFT JOIN identity_xref_temp ix ON (ox.object_xref_id = ix.object_xref_id) - WHERE x.xref_id = ox.xref_id and ox.ensembl_object_type = ? - and ox.ensembl_id = ? and x.info_type = 'DEPENDENT' -ZSQL - - my $dependent_sth = $self->core->dbc->prepare($sql) || die "prepare failed for $sql\n"; - - - $sql = (<<QSQL); - SELECT x.xref_id, x.external_db_id, x.description, x.dbprimary_acc - FROM object_xref o, xref x - WHERE x.xref_id = o.xref_id - and o.ensembl_object_type = ? and o.ensembl_id = ? and x.info_type = 'DIRECT' -QSQL - - my $direct_sth = $self->core->dbc->prepare($sql) || die "prepare failed for $sql\n"; - - $sql = (<<GSQL); - SELECT x.xref_id, x.external_db_id, x.description, x.dbprimary_acc - FROM object_xref o, xref x - WHERE x.xref_id = o.xref_id - and o.ensembl_object_type = 'Gene' and o.ensembl_id = ? -GSQL - - my $gene_sth = $self->core->dbc->prepare($sql) || die "prepare failed for $sql\n"; - - my $count =0; - - my ($xref_id, $qid, $tid, $ex_db_id, $description, $acc); - - - # Do not use the %id here just use the Best one using the @words to do this - my @words = qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein); - my $trembl_id = $external_name_to_id{"Uniprot/SPTREMBL"}; - - - my $checked = 0; - my $added = 0; - my $removed = 0; - - foreach my $gene_id (keys %genes_to_transcripts) { - - my %percent_id; - my %level_db; - my %parent; - my %ex_db; - my %xref_descriptions; - my %xref_accessions; - my @gene_xrefs = (); - my @transcript_xrefs = (); - - my $best_gene_xref = 0; # store xref - my $best_gene_level = 0; # store level - my $best_gene_percent = 0; # additoon of precentage ids - my $best_gene_length = 0; # best transcript for the genes length - - $gene_sth->execute($gene_id) || die "execute failed"; - $gene_sth->bind_columns(\$xref_id, \$ex_db_id, \$description, \$acc); - - while($gene_sth->fetch()){ - $checked++; - if ($description and defined($level{$ex_db_id})) { - my $filtered_description = $self->filter_by_regexp($description, \@regexps); - if ($filtered_description ne "") { - $xref_descriptions{$xref_id} = $description; - $xref_accessions{$xref_id} = $acc; - if($level{$ex_db_id} > $best_gene_level){ - $best_gene_xref = $xref_id; - $best_gene_level = $level{$ex_db_id}; - } - $added++; - } else { - $removed++; - } - } - } - - - my @transcripts = @{$genes_to_transcripts{$gene_id}}; - foreach my $transcript_id (@transcripts) { - foreach my $type("Transcript", "Translation"){ - my $ens_id; - if($type eq "Transcript"){ - $ens_id = $transcript_id; - } - else{ - if(defined($transcript_to_translation{$transcript_id})){ - $ens_id=$transcript_to_translation{$transcript_id}; - } - else{ - next; - } - } - - $primary_sth->execute($type, $ens_id) || die "execute failed"; - $primary_sth->bind_columns(\$xref_id, \$qid, \$tid, \$ex_db_id, \$description, \$acc); - while($primary_sth->fetch()){ - - if($level{$ex_db_id}){ - $checked++; - if ($description) { - my $filtered_description = $self->filter_by_regexp($description, \@regexps); - if ($filtered_description ne "") { - $xref_descriptions{$xref_id} = $description; - $xref_accessions{$xref_id} = $acc; - if(defined($qid)){ - $percent_id{$xref_id} = $qid + $tid; - } - else{ - print "WARN: xref_id $xref_id PRIMARY can't find percrnt id\n"; - $percent_id{$xref_id} = 50; - } - push @transcript_xrefs, $xref_id; # added ? - $percent_id{$xref_id} = $qid + $tid; - $ex_db{$xref_id} = $ex_db_id; - $level_db{$xref_id} = $level{$ex_db_id}; - $added++; - } else { - $removed++; - } - } - } - } - - $dependent_sth->execute($type, $ens_id) || die "execute failed"; - $dependent_sth->bind_columns(\$xref_id, \$qid, \$tid, \$ex_db_id, \$description, \$acc); - while($dependent_sth->fetch()){ - if($level{$ex_db_id}){ - $checked++; - if ($description) { - my $filtered_description = $self->filter_by_regexp($description, \@regexps); - if ($filtered_description ne "") { - $xref_descriptions{$xref_id} = $description; - $xref_accessions{$xref_id} = $acc; - push @transcript_xrefs, $xref_id; - if(defined($qid)){ - $percent_id{$xref_id} = $qid + $tid; - } - else{ -# print "WARN: xref_id $xref_id DEPEND can't find percrnt id. Type = $type, ens_id == $ens_id, acc =$acc\n"; - $percent_id{$xref_id} = 50; - } - $ex_db{$xref_id} = $ex_db_id; - $level_db{$xref_id} = $level{$ex_db_id}; - $added++; - } else { - $removed++; - } - } - } - } - - $direct_sth->execute($type, $ens_id) || die "execute failed"; - $direct_sth->bind_columns(\$xref_id, \$ex_db_id, \$description, \$acc); - while($direct_sth->fetch()){ - if($level{$ex_db_id}){ - $checked++; - if ($description) { - my $filtered_description = $self->filter_by_regexp($description, \@regexps); - if ($filtered_description ne "") { - $xref_descriptions{$xref_id} = $description; - $xref_accessions{$xref_id} = $acc; - push @transcript_xrefs, $xref_id; - $percent_id{$xref_id} = 200; - $ex_db{$xref_id} = $ex_db_id; - $level_db{$xref_id} = $level{$ex_db_id}; - $added++; - } else { - $removed++; - } - } - } - } - - } - - my $best_tran_xref = 0; # store xref - my $best_tran_level = 0; # store level - my $best_tran_percent = 0; # store best %id total - my $best_tran_length =0 ; # store length of the best - - foreach my $xref_id (@transcript_xrefs) { - if(defined($xref_descriptions{$xref_id})){ - if(defined($level_db{$xref_id} and $level_db{$xref_id})){ - if($level_db{$xref_id} < $best_tran_level){ - next; - } - if($level_db{$xref_id} == $best_tran_level){ - if($percent_id{$xref_id} < $best_tran_percent){ - next; - } - } - - $best_tran_percent = $percent_id{$xref_id}; - $best_tran_level = $level_db{$xref_id}; - $best_tran_xref = $xref_id; - $best_tran_length = $transcript_length{$transcript_id}; - } - } - } - - if($best_tran_level < $best_gene_level){ - next; - } - if($best_tran_level == $best_gene_level){ - if($best_tran_percent < $best_gene_percent){ - next; - } - elsif($best_tran_percent == $best_gene_percent){ - if($transcript_length{$transcript_id} < $best_gene_length){ - next; - } - } - - } - - $best_gene_percent = $best_tran_percent; - $best_gene_level = $best_tran_level; - $best_gene_xref = $best_tran_xref; - $best_gene_length = $transcript_length{$transcript_id}; - } - - if($best_gene_xref){ - my $description = $xref_descriptions{$best_gene_xref}; - my $acc = $xref_accessions{$best_gene_xref}; - - $description =~ s/\"//ig; # remove " as they will cause problems in .sql files - - my $desc = $description . " [Source:".$ex_db_id_to_display_name{$ex_db{$best_gene_xref}}.";Acc:$acc]"; - - print GENE_DESCRIPTIONS "UPDATE gene g SET g.description=\"$desc\" ". - "WHERE g.gene_id=$gene_id;\n" if ($description); - - } - } - close GENE_DESCRIPTIONS; - - my $sth = $self->core->dbc->prepare("drop table identity_xref_temp"); - print "dropping table identity_xref_temp\n"; - $sth->execute() || die "Could not drop table identity_xref_temp\n"; - print "finished dropping table\n"; - # Done - return @files; -} - -sub build_gene_transcript_status{ - # Creates the files that contain the SQL needed to (re)set the - # gene.status and transcript.status values - my $self = shift; - my @files; - my $dir = $self->core->dir(); - my $unset_file_t = $dir .'/%s_unset_status.sql'; - my $set_file_t = $dir .'/%s_set_status.sql'; - - - - foreach my $type ( 'gene','transcript' ){ - # Queries needed to unset status - my $unset_file = sprintf( $unset_file_t, $type ); - open( UNSET, ">$unset_file" ) or die( "Cannot open $unset_file: $!" ); - print UNSET qq(UPDATE $type SET status='NOVEL';\n); - close( UNSET ); - push @files, $unset_file; - - } - - - - - my $set_file = sprintf( $set_file_t, "gene" ); - open( GENE, ">$set_file" ) or die( "Cannot open $set_file: $!" ); - push @files, $set_file; - - $set_file = sprintf( $set_file_t, "transcript" ); - open( TRAN, ">$set_file" ) or die( "Cannot open $set_file: $!" ); - push @files, $set_file; - - - - my ($presedence, $ignore) = @{$self->transcript_display_xref_sources()}; - my $i=0; - my %list_extern; - foreach my $ord (reverse (@$presedence)){ - $list_extern{$ord} = 1; - } - - #create a hash known which ONLY has databases names of those that are KNOWN and KNOWNXREF - my %known; - my $sth = $self->core->dbc->prepare("select db_name from external_db where status in ('KNOWNXREF','KNOWN')"); - $sth->execute(); - my ($name); - $sth->bind_columns(\$name); - while($sth->fetch){ - if(defined($list_extern{$name})){ - $known{$name} = 1; - } - } - $sth->finish; - - - # loop throught the gene and all transcript until you find KNOWN/KNOWNXREF as a status - my $ensembl = $self->core; - my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $ensembl->dbc); - my $gene_adaptor = $db->get_GeneAdaptor(); - - my @genes = @{$gene_adaptor->fetch_all()}; - - while (my $gene = shift @genes){ - my $gene_found = 0; - my @dbentries = @{$gene->get_all_DBEntries()}; - foreach my $dbe (@dbentries){ - if(defined($known{$dbe->dbname})){ - $gene_found =1; - } - } - my $one_tran_found = 0; - foreach my $tr (@{$gene->get_all_Transcripts}){ - my $tran_found = 0; - foreach my $dbe (@{$tr->get_all_DBLinks}){ - if(defined($known{$dbe->dbname})){ - $tran_found = 1; - $one_tran_found = 1; - } - } - if($tran_found or $gene_found){ - print TRAN 'UPDATE transcript t set t.status = "KNOWN" where t.transcript_id = '.$tr->dbID.";\n"; - } - } - if($gene_found or $one_tran_found){ - print GENE 'UPDATE gene g set g.status = "KNOWN" where g.gene_id = '.$gene->dbID.";\n"; - } - } - close GENE; - close TRAN; - - return @files; -} - -sub build_meta_timestamp{ - # Creates a file that contains the SQL needed to (re)set the - # 'xref.timestamp' key of the meta table. - my $self = shift; - my $file = $self->core->dir() . "/meta_timestamp.sql"; - open (FILE, ">$file") or die( "Cannot open $file: $!" ); - print FILE qq(DELETE FROM meta WHERE meta_key='xref.timestamp'; -INSERT INTO meta (meta_key,meta_value) VALUES ('xref.timestamp', NOW());\n); - close(FILE); - return ($file); -} - - -# Check if any .err files exist that have non-zero size; -# this indicates that something has gone wrong with the exonerate run - -sub check_err { - - my ($self, $dir) = @_; - - foreach my $err (glob("$dir/*.err")) { - - print "\n\n*** Warning: $err has non-zero size; may indicate". - " problems with exonerate run\n\n\n" if (-s $err); - - } -} - -sub get_best { - my ($self,$refxref) = @_; - return $$refxref[-1]; -} - -sub strip{ - my ($self,$refarray) = @_; - my @ret; - - foreach my $arr (@$refarray){ - if(defined($xref_accessions{$arr})){ - push @ret, $arr; - } - } - return \@ret; -} - -# remove a list of patterns from a string -sub filter_by_regexp { - - my ($self, $str, $regexps) = @_; - - foreach my $regexp (@$regexps) { - $str =~ s/$regexp//ig; - } - - return $str; - -} - -# Regexp used for filter out useless text from gene descriptions -# Method can be overridden in species-specific modules -sub gene_description_filter_regexps { - - return (); - -} - - - -# move translation hits on to their transcripts if the same external_db_id -# has hits to both translation and transcript - - -sub cleanup_database { - my ($self) = @_; - my $ensembl_dbc = $self->core->dbc; - - # 1 make sure external_db do not lie on both translations and transcripts - - my $sql_check = (<<ESQL); - SELECT x.external_db_id, ox.ensembl_object_type, COUNT(*), e.db_name - FROM xref x, object_xref ox, external_db e - WHERE x.xref_id = ox.xref_id AND e.external_db_id = x.external_db_id - GROUP BY x.external_db_id, ox.ensembl_object_type -ESQL - - my $sth = $ensembl_dbc->prepare($sql_check); - $sth->execute(); - my $previous_id = -1; - my $previous_type =""; - while(my @row = $sth->fetchrow_array()){ - my $external_db_id = $row[0]; - if($external_db_id == $previous_id){ - $self->fix_mart_prob($row[3],$external_db_id,$row[1],$previous_type); - } - $previous_id = $external_db_id; - $previous_type = $row[1]; - } - - # now recheck just incase :-) - $sth->execute(); - $previous_id = -1; - $previous_type =""; - my $error =0; - while(my @row = $sth->fetchrow_array()){ - my $external_db_id = $row[0]; - if($external_db_id == $previous_id){ - print "Problem: Still have multiple associations with ".$row[3]."\n"; - $error++; - } - $previous_id = $external_db_id; - $previous_type = $row[1]; - } - $sth->finish(); - if(!$error){ - print "External databases only associate to one ensembl type (PASS)\n"; - } - else{ - print "External databases only associate to one ensembl type (FAIL)\n"; - } - -} - - -sub fix_mart_prob{ - my ($self,$db_name,$db_id,$type1,$type2) = @_; - my $ensembl_dbc = $self->core->dbc; - - my $dbh = $self->core->dbc->db_handle; - - - print "$db_name ($db_id) is associated with both $type1 and $type2 object types\n"; - - - my $to; - if($type1 eq "Gene" or $type2 eq "Gene"){ - $to = "Gene"; - } - else{ - $to = "Transcript"; - } - - print "Therefore moving all associations to the ".$to."s\n"; - - -# -# Get a copy of the qid, tid from the master and ??????????????? -# -# hash{MGI:123456} = qid+tid from temp_file -# -# load temp table -# do search and get hash -# remove temp table - - my %percent_id; - my @list = $self->get_list_of_sources_for_one_max_per_transcript; - my $temp_loaded = 0; - foreach my $item (@list){ - if($item eq $db_name){ - #load temp table - my $dir = $self->core->dir(); - my $file = $dir."/identity_xref_temp.txt"; - - my $sth = $self->core->dbc->prepare("create table identity_xref_temp like identity_xref"); - print "creating table identity_xref_temp\n"; - $sth->execute() || die "Could not \ncreate table identity_xref_temp like identity_xref\n"; - $sth->finish; - if(-s $file){ - my $temp_sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE identity_xref_temp"); - print "Uploading data in $file to identity_xref_temp\n"; - $temp_sth->execute(); - # $temp_sth->finish; - } - else{ - print "NO file or zero size file, so not able to load file $file to identity_xref_temp\n"; - } - $temp_loaded = 1; - # get all the db_name %'s - my $sql =(<<TIX); - SELECT i.ensembl_identity, i.xref_identity, x.dbprimary_acc - FROM identity_xref_temp i - INNER JOIN object_xref o ON i.object_xref_id = o.object_xref_id - INNER JOIN xref x ON x.xref_id = o.xref_id - WHERE x.external_db_id = $db_id -TIX - - $sth = $self->core->dbc->prepare($sql) || die; - $sth->execute() || die; - my ($target_id, $query_id, $acc); - $sth->bind_columns(\$target_id, \$query_id, \$acc); - if($sth->fetch()){ - $percent_id{$item.":".$acc} = $target_id + $query_id; - } - $sth->finish - } - } - if($temp_loaded){ - $self->core->dbc->do("DROP TABLE identity_xref_temp"); - } - -# -# End of getting percent_id hash -# - - - - -# Move translations onto the transcripts - my $sql =(<<EOF); - UPDATE IGNORE object_xref, translation, xref - SET object_xref.ensembl_object_type = 'Transcript', - object_xref.ensembl_id = translation.transcript_id - WHERE object_xref.ensembl_object_type = 'Translation' AND - object_xref.ensembl_id = translation.translation_id AND - xref.xref_id = object_xref.xref_id AND - xref.external_db_id = $db_id; -EOF - my $result = $ensembl_dbc->do($sql) ; - - - - $sql =(<<EOF2); - DELETE object_xref - FROM object_xref, xref - WHERE object_xref.ensembl_object_type = 'Translation' AND - xref.xref_id = object_xref.xref_id AND - xref.external_db_id = $db_id; -EOF2 - - $ensembl_dbc->do($sql); - - if($to eq "Gene"){ #move transcripts to the gene - my $sql =(<<GENE); - UPDATE IGNORE object_xref, transcript, xref - SET object_xref.ensembl_object_type = 'Gene', - object_xref.ensembl_id = transcript.gene_id - WHERE object_xref.ensembl_object_type = 'Transcript' AND - object_xref.ensembl_id = transcript.transcript_id AND - xref.xref_id = object_xref.xref_id AND - xref.external_db_id = $db_id; -GENE - $ensembl_dbc->do($sql); - - $sql =(<<GEN2); - DELETE object_xref - FROM object_xref, xref - WHERE object_xref.ensembl_object_type = 'Transcript' AND - xref.xref_id = object_xref.xref_id AND - xref.external_db_id = $db_id; -GEN2 - $ensembl_dbc->do($sql); - - } - - if($temp_loaded){ # So call removal routine - $self->one_per_gene(\%percent_id); - } - -} - - -sub one_per_gene{ - my $self = shift; - my $hashref = @_; - - print "DO i still need to do this??\n"; -# my $sql = "select - -} - -# list of sources to be used when building gene descriptions -# sorted into DEcreasing order of priority - -sub gene_description_sources { - - return ("RFAM", - "miRBase", - "IMGT/GENE_DB", - "Uniprot/SWISSPROT", - "RefSeq_peptide", - "RefSeq_dna", - "Uniprot/Varsplic", - "Uniprot/SPTREMBL"); - -} - -# load external_db (if it's empty) from ../external_db/external_dbs.txt - -sub upload_external_db { - my ($self,$force_upload) = @_; - - my $core_db = $self->core->dbc; - my $dbname = $core_db->dbname ; ; - $core_db->connect(); - my $row = @{$core_db->db_handle->selectall_arrayref("SELECT COUNT(*) FROM external_db")}[0]; - my $count = @{$row}[0]; - - my $upload_external_db = 0 ; - - if ($count > 0 ) { - print "external_db table has $count rows\n" ; - print " you may use -delete_external_db to delete all entries from external_db table and upload new data\n" unless $force_upload ; - } - if ($force_upload) { - print "WARNING: to delete all data from external_db table in $dbname enter \"yes\" to confirm\n"; - print "WARNING: any other key to continue without changing the table\n" ; - print "WARNING: Do you really want to delete all entries from external_db ? [ENTER = no | yes = delete ] " ; - - $| = 1; # flush stdout - my $p = <STDIN>; - chomp $p; - if ($p eq "yes") { - my $edb_sth = $core_db->prepare("DELETE FROM external_db ") ; - $edb_sth->execute(); - print "$count entries deleted from external_db table, now uploading new data\nusing file ./ensembl/misc_scripts/external_db/external_dbs.txt into $dbname \n" ; - $upload_external_db = 1 ; - } else { - print "\n$dbname.external_db will NOT be changed\n"; - } - } - - if ($count == 0 || $upload_external_db ) { - my $edb = $self->external_db_file; - print "external_db table is empty, uploading from $edb\n"; - my $edb_sth = $core_db->prepare("LOAD DATA LOCAL INFILE \'$edb\' IGNORE INTO TABLE external_db"); - $edb_sth->execute(); - } - -} - - -sub get_xref_descriptions{ - return \%xref_descriptions; -} - -sub get_xref_accessions{ - return \%xref_accessions; -} - -sub xref_id_offset{ - my $self = shift; - - $self->{'xref_id_offset'} = shift if( @_ ); - if( exists $self->{'xref_id_offset'} ) { - return $self->{'xref_id_offset'}; - } - return undef; -} - -sub add_missing_pairs{ - my ($self) = @_; - my $xref_id_offset = $self->xref_id_offset(); - - # - # add the pairs - # - # get current max object_xref_id - my $row = @{$self->core->dbc->db_handle->selectall_arrayref("SELECT MAX(object_xref_id) FROM object_xref")}[0]; - my $max_object_xref_id = @{$row}[0] + 1; - - my $row2 = @{$self->core->dbc->db_handle->selectall_arrayref("SELECT count(object_xref_id) FROM object_xref")}[0]; - my $entries_obj_xref = @{$row2}[0]; - - if (!defined $max_object_xref_id && $entries_obj_xref == 0 ) { - die ("No existing object_xref_ids, something very wrong\n"); - } - if (!defined $max_object_xref_id) { - print "No existing object_xref_ids, will start from 1\n"; - $max_object_xref_id = 1; - } else { - print "Maximum existing object_xref_id = $max_object_xref_id\n"; - } - - my $xref_sql = (<<EOS); - SELECT x1.xref_id, x2.xref_id, x1.accession, x2.accession, x1.source_id, x2.source_id - FROM pairs p, xref x1, xref x2 - WHERE p.accession1 = x1.accession - AND p.accession2 = x2.accession - AND p.source_id = x1.source_id -EOS - my $xref_sth = $self->xref->dbc->prepare($xref_sql); - $xref_sth->execute(); - my ($xref_id1,$xref_id2, $x1_acc, $x2_acc, $x1_source_id, $x2_source_id); - $xref_sth->bind_columns(\$xref_id1, \$xref_id2, \$x1_acc, \$x2_acc, \$x1_source_id, \$x2_source_id); - - my %master_acc; - my %good2missed=(); - my %good2missed_acc=(); - my $okay =0; - my $both = 0; - my $poss = 0; - - my @xref_list; - - while ($xref_sth->fetch()) { - $master_acc{$xref_id1} = $x1_acc; - $master_acc{$xref_id2} = $x2_acc; - # If either of these are a priority source then make sure they are the chosen one. - if(defined($priority_xref_source_id{$xref_id1})){ - my $key = $priority_source_id_to_name{$priority_xref_source_id{$xref_id1}}.":".$x1_acc; - if(!defined($xrefs_written{$xref_id1})){ - if(!defined($priority_seenit{$key})){ - push @xref_list, $xref_id1; - $priority_seenit{$key} = $x2_acc; - } - else{ - next; - } - } - else{ - next; - } - } - - if(defined($priority_xref_source_id{$xref_id2})){ - if(!defined($xrefs_written{$xref_id2})){ - my $key = $priority_source_id_to_name{$priority_xref_source_id{$xref_id2}}.":".$x2_acc; - if(!defined($priority_seenit{$key})){ - push @xref_list, $xref_id2; - $priority_seenit{$key} = $x1_acc; - } - else{ - next; - } - } - else{ - next; - } - } - - - if(!defined($xrefs_written{$xref_id1}) or !defined($xrefs_written{$xref_id2})){ - if (!defined($xrefs_written{$xref_id1}) and !defined($xrefs_written{$xref_id2})){ - $okay++; - } - elsif(!defined($xrefs_written{$xref_id2})){ - $poss++; - $good2missed{$xref_id1+$xref_id_offset} = $xref_id2+$xref_id_offset; - $good2missed_acc{$xref_id1+$xref_id_offset} = $x2_acc; - } - else{ - $poss++; - $good2missed{$xref_id2+$xref_id_offset} = $xref_id1+$xref_id_offset; - $good2missed_acc{$xref_id2+$xref_id_offset} = $x1_acc; - } - } - else{ - $both++; - } - } - - - #sql needed to get the dependent xrefs for those missed. - my $dep_sql = "SELECT x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, dx.linkage_annotation FROM dependent_xref dx, xref x WHERE x.xref_id=dx.dependent_xref_id AND master_xref_id = ?"; - my $dep_sth = $self->xref->dbc->prepare($dep_sql); - - - - - open(XREF2, ">".$self->core->dir()."/pairs_xref.txt") - || die "Could not open pairs_xref.txt"; - - if(scalar(@xref_list) > 1){ - - my $list = join(", ", @xref_list); - my $sql = "SELECT xref_id, accession, version, label, description, source_id from xref where xref_id in ($list)"; - my $sth = $self->xref->dbc->prepare($sql); - $sth->execute(); - my ($xref_id, $acc,$ver, $label, $desc, $source_id); - $sth->bind_columns(\$xref_id, \$acc, \$ver, \$label, \$desc, \$source_id); - my $count =0; - while($sth->fetch()){ - $count++; - $master_acc{$xref_id} = $acc; - print XREF2 ( $xref_id + $xref_id_offset ) . "\t" - . ( $source_to_external_db{$source_id} || '' ) . "\t" - . ( $acc || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $ver || 0 ) . "\t" - . ( $desc || '' ); - my $key = $priority_source_id_to_name{$priority_xref_source_id{$xref_id}}.":".$acc; - if(!defined($priority_seenit{$key})){ - print STDERR "PROBLEM: key =$key\n\txref = $xref_id\n\toffset is $xref_id_offset\n"; - } - print XREF2 "\t" - . "INFERRED_PAIR" . "\t" - . "Generated via its Pair " - . $priority_seenit{$key} . "\n"; - $xrefs_written{$xref_id} = 1; - - } - $sth->finish; - - } - - - -# create a hash to interchange between transcripts and translations - my $tran_sth = $self->core->dbc->prepare("SELECT translation_id, transcript_id from translation"); - my $transcript; - my $translation; - $tran_sth->execute(); - $tran_sth->bind_columns(\$translation,\$transcript); - my %transcript_2_translation; - my %translation_2_transcript; - while($tran_sth->fetch()){ - $transcript_2_translation{$transcript} = $translation; - $translation_2_transcript{$translation} = $transcript; - } - $tran_sth->finish(); - - open(OBJECT_XREF2, ">".$self->core->dir()."/pairs_object_xref.txt") - || die "Could not open pairs_object_xref.txt"; - - - open(GO_XREF2, ">".$self->core->dir()."/pairs_go_xref.txt") - || die "Could not open pairs_go_xref.txt"; - - my $i=0; - my $index; - my $added = 0; - my $added_transcript = 0; - my $sql = "SELECT o.xref_id, o.ensembl_id, o.ensembl_object_type, "; - $sql .= "x.dbprimary_acc, x.external_db_id "; - $sql .= "FROM object_xref o, xref x "; - $sql .= "WHERE x.xref_id = o.xref_id AND x.xref_id IN ("; - my ($goodxref, $ens_int_id, $type, $acc, $ex_db_id); - - # keep a list of the "master" xrefs for the new object xrefs. - # need a hash for the "master" xref to object xref so that we can add the fake - # identity xrefs to the identity_xref_temp file - my @new_list; - my %identity_master_xref_to_object_xref; - - my @list_all = keys %good2missed; - - - - while ( @list_all){ - my @list = splice(@list_all, 0,199); - my $sth_ob = $self->core->dbc->prepare($sql.(join(',',@list)).")") || die @_; - $sth_ob->execute(); - $sth_ob->bind_columns(\$goodxref,\$ens_int_id,\$type, \$acc, \$ex_db_id); - while($sth_ob->fetch()){ - if(($type =~ /Transcript/) and defined($transcript_2_translation{$ens_int_id})){ - $max_object_xref_id++; - $added++; - $object_succesfully_mapped{($good2missed{$goodxref}-$xref_id_offset)} = 1; - print OBJECT_XREF2 "$max_object_xref_id\t"; - print OBJECT_XREF2 $transcript_2_translation{$ens_int_id}."\tTranslation\t" ; - print OBJECT_XREF2 $good2missed{$goodxref}; - print OBJECT_XREF2 "\t\\N\n"; - - - push @new_list, $goodxref; - push @{$identity_master_xref_to_object_xref{$goodxref}}, $max_object_xref_id; - - $dep_sth->execute($good2missed{$goodxref}-$xref_id_offset); - my ($xref_id, $acc,$ver, $label, $desc, $source_id, $linkage_annotation); - $dep_sth->bind_columns(\$xref_id, \$acc, \$ver, \$label, \$desc, \$source_id, \$linkage_annotation); - - while($dep_sth->fetch){ - - if(!defined($priority_xref_source_id{$xref_id})){ - if(!defined($xrefs_written{$xref_id})){ - print XREF2 ( $xref_id + $xref_id_offset ) . "\t" - . ( $source_to_external_db{$source_id} || '' ) . "\t" - . ( $acc || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $ver || 0 ) . "\t" - . ( $desc || '' ) . "\t" - . "DEPENDENT" . "\t" - . "Generated via " - . $master_acc{ $good2missed{$goodxref} - $xref_id_offset } - . "\n"; - $xrefs_written{$xref_id} = 1; - } - - $max_object_xref_id++; - $object_succesfully_mapped{$xref_id} = 1; - print OBJECT_XREF2 "$max_object_xref_id\t"; - print OBJECT_XREF2 $transcript_2_translation{$ens_int_id}."\tTranslation\t" ; - print OBJECT_XREF2 $xref_id+$xref_id_offset; - print OBJECT_XREF2 "\t\\N\n"; - - if(defined($linkage_annotation) and $linkage_annotation ne ""){ - print GO_XREF2 "$max_object_xref_id\t$linkage_annotation\t\\N\n"; - } - push @new_list, $goodxref; - push @{$identity_master_xref_to_object_xref{$goodxref}}, $max_object_xref_id; - - } - } - } - elsif(($type =~ /Translation/) and defined($translation_2_transcript{$ens_int_id})){ - $max_object_xref_id++; - $added_transcript++; - $object_succesfully_mapped{($good2missed{$goodxref}-$xref_id_offset)} = 1; - print OBJECT_XREF2 "$max_object_xref_id\t"; - print OBJECT_XREF2 $translation_2_transcript{$ens_int_id}."\tTranscript\t" ; - print OBJECT_XREF2 $good2missed{$goodxref}; - print OBJECT_XREF2 "\t\\N\n"; - - push @new_list, $goodxref; - push @{$identity_master_xref_to_object_xref{$goodxref}}, $max_object_xref_id; - - $dep_sth->execute($good2missed{$goodxref}-$xref_id_offset); - my ($xref_id, $acc,$ver, $label, $desc, $source_id, $linkage_annotation); - $dep_sth->bind_columns(\$xref_id, \$acc, \$ver, \$label, \$desc, \$source_id, \$linkage_annotation); - while($dep_sth->fetch){ - - if(!defined($priority_xref_source_id{$xref_id})){ - if(!defined($xrefs_written{$xref_id})){ - print XREF2 ( $xref_id + $xref_id_offset ) . "\t" - . ( $source_to_external_db{$source_id} || '' ) . "\t" - . ( $acc || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $ver || 0 ) . "\t" - . ( $desc || '' ) . "\t" - . "DEPENDENT" . "\t" - . "Generated via " - . $master_acc{ $good2missed{$goodxref} - $xref_id_offset } . "\n"; - $xrefs_written{$xref_id} = 1; - } - - $max_object_xref_id++; - $object_succesfully_mapped{$xref_id} = 1; - print OBJECT_XREF2 "$max_object_xref_id\t"; - print OBJECT_XREF2 $translation_2_transcript{$ens_int_id}."\tTranscript\t" ; - print OBJECT_XREF2 $xref_id+$xref_id_offset; - print OBJECT_XREF2 "\t\\N\n"; - - if(defined($linkage_annotation) and $linkage_annotation ne ""){ - print GO_XREF2 "$max_object_xref_id\t$linkage_annotation\t\\N\n"; - } - - push @new_list, $good2missed{$goodxref}; - push @{$identity_master_xref_to_object_xref{$goodxref}}, $max_object_xref_id; - - } - } - - } - } - $sth_ob->finish(); - } - close XREF2; - close OBJECT_XREF2; - close GO_XREF2; - - # - # Now set the identity xref temp values. - # - - - if(@new_list){ - my $identity_sql = "select x.xref_id, i.* from xref x, object_xref o, identity_xref i"; - $identity_sql .= " where x.xref_id = o.xref_id and o.object_xref_id = i.object_xref_id and x.xref_id in ("; - $identity_sql .= join (", ", @new_list). ")"; - - my ($xref, $old_object_id, $qid, $tid, $hs, $he, $ts, $te, $cigar, $score, $eval, $anal_id); - - open(IDENTITY_XREF_P,">>".$self->core->dir."/identity_xref_temp.txt") || die "Could not open identity_xref_temp.txt"; - - my $sth_ob = $self->core->dbc->prepare($identity_sql) || die @_; - $sth_ob->execute(); - $sth_ob->bind_columns(\$xref, \$old_object_id, \$qid, \$tid, \$hs, \$he, \$ts, \$te, \$cigar, \$score, \$eval, \$anal_id); - while($sth_ob->fetch()){ - foreach my $object (@{$identity_master_xref_to_object_xref{$xref}}){ - print IDENTITY_XREF_P $object. "\t" . - ($qid || "0") . "\t" . - ($tid || "0") . "\t" . - ($hs || "0") . "\t" . - ($he || "0") . "\t" . - ($ts || "0") . "\t" . - ($te || "0") . "\t" . - ($cigar || ""). "\t" . - ($score || "0"). "\t" . - ($eval || "0") ."\t" . - ($anal_id || "") ."\n"; - } - } - - close IDENTITY_XREF_P; - } - - # - # Now load the data into the database. - # - - my $file = $self->core->dir()."/pairs_object_xref.txt"; - - # don't seem to be able to use prepared statements here - if(-s $file){ - my $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE object_xref"); - print "Uploading data in $file to object_xref\n"; - $sth->execute(); - - print "$added new object xrefs added based on the Pairs\n"; - print "$added_transcript transcripts\n"; - } - - $file = $self->core->dir()."/pairs_xref.txt"; - - if(-s $file){ - my $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE xref"); - print "Uploading data in $file to xref\n"; - $sth->execute(); - - } - - $file = $self->core->dir()."/pairs_go_xref.txt"; - - if(-s $file){ - my $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE go_xref"); - print "Uploading data in $file to go_xref\n"; - $sth->execute(); - - } - - -} - -sub dump_all_dependencies{ - my ($self, $master_id, $xref_id_offset, $type, $object_id) = @_; - my @return; - - # Now get the dependent xrefs for this xref and write them - - my $sql = (<<SQL); - SELECT DISTINCT(x.xref_id), dx.master_xref_id, x.accession, x.label, x.description, - x.source_id, x.version, dx.linkage_annotation - FROM dependent_xref dx, xref x - WHERE x.xref_id=dx.dependent_xref_id AND master_xref_id = $master_id -SQL - - my $dep_sth = $self->xref->dbc->prepare($sql); - $dep_sth->execute(); - - my ($xref_id, $accession, $version, $label, $description, $source_id, - $species_id, $master_xref_id, $linkage_annotation); - - $dep_sth->bind_columns(\$xref_id, \$master_xref_id, \$accession, \$label, \$description, - \$source_id, \$version, \$linkage_annotation); - while ($dep_sth->fetch()) { - - my $external_db_id = $source_to_external_db{$source_id}; - next if (!$external_db_id); - - - $label = $accession if (!$label); - - if (!$xrefs_written{$xref_id}) { - if(!defined($updated_source{$external_db_id})){ - $self->cleanup_sources_file($external_db_id,$source_id); - } - - - my $master_accession = $XXXxref_id_to_accession{$master_id}; - if(!defined($master_accession) or $master_accession eq ""){ - print "(dump_all_dependencies) No master_acc for master xref ($master_xref_id = $master_id)" - . " for $accession ($xref_id) $type\n"; - } - - if(!defined($priority_xref_source_id{$xref_id})){ - - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" - . "DEPENDENT" . "\t" - . "Generated via " - . $master_accession . "\n"; - - push @return, ($xref_id+$xref_id_offset); - $xrefs_written{$xref_id} = 1; - } - elsif(defined($type)){ - my $key = $priority_source_id_to_name{$source_id}.":".$priority_xref_acc{$xref_id}; - if(!defined($priority_xref_priority{$key})){ - - $priority_xref_extra_bit{$xref_id} = "\t" . "DEPENDENT" . "\t" . "Generated via $master_accession\n"; - $priority_xref{$key} = $xref_id; - $priority_xref_priority{$key} = $priority_source_id{$source_id}; - $priority_object_xref{$key} = "$type:$object_id"; - $priority_identity_xref{$key} = undef; - $priority_xref_state{$key} = "dependent"; - next; - } - if($priority_xref_priority{$key} - > $priority_source_id{$source_id}){ - - $priority_xref_extra_bit{$xref_id} = "\t" . "DEPENDENT" . "\t" . "Generated via $master_accession\n"; - $priority_xref{$key} = $xref_id; - $priority_xref_priority{$key} = $priority_source_id{$source_id}; - $priority_object_xref{$key} = "$type:$object_id"; - $priority_identity_xref{$key} = undef; - $priority_xref_state{$key} = "dependent"; - next; - } - } - } - - } - return \@return; -} - - -sub get_mysql_command{ - my $self = shift; - my $dbc = shift; - - UNIVERSAL::isa( $dbc, 'Bio::EnsEMBL::DBSQL::DBConnection' ) - || die( "Need a Bio::EnsEMBL::DBSQL::DBConnection not a " . ref($dbc) ); - - my $host = $dbc->host; - my $port = $dbc->port; - my $dbname = $dbc->dbname; - my $user = $dbc->username; - my $pass = $dbc->password; - my $str = join( ' ', - 'mysql', - ( $host ? ( '-h', $host ) : () ), - ( $port ? ( '-P', $port ) : () ), - ( $user ? ( '-u', $user ) : () ), - ( $pass ? ( "-p'".$pass."'" ) : () ), - $dbname ); - return $str; -} - -sub dump_xref_with_no_triage_data { - my ($self) = @_; - - print "Dumping xrefs\n"; - my $xref_id_offset = $self->xref_id_offset(); - my $batch_size=200; - - my $primary_sql= (<<PSQL); - SELECT DISTINCT(s.source_id), px.sequence_type - FROM source s, primary_xref px, xref x - WHERE x.xref_id = px.xref_id - AND s.source_id = x.source_id -PSQL - - my $psth = $self->xref->dbc->prepare($primary_sql) || die "prepare failed"; - $psth->execute() || die "execute failed"; - - my @primary_sources =(); - my %source_2_seqtype=(); - - my ($prim,$seq_type); - $psth->bind_columns(\$prim,\$seq_type); - while($psth->fetch()){ - push @primary_sources, $prim; - $source_2_seqtype{$prim} = $seq_type; - } - - open (XREF, ">" . $self->core->dir() . "/xref_no_triage.txt"); - - foreach my $source (@primary_sources){ - - - if(defined($priority_source_name{$source})){ - next; - } - my $sql = "select x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, ". - "x.species_id from xref x where x.source_id = $source"; - my $sth = $self->xref->dbc->prepare($sql); - $sth->execute(); - - my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id); - $sth->bind_columns(\$xref_id, \$accession, \$version, \$label, - \$description, \$source_id, \$species_id); - while($sth->fetch()){ - if (!$xrefs_written{$xref_id}) { - my $external_db_id = $source_to_external_db{$source_id}; - if(!defined($updated_source{$external_db_id})){ - $self->cleanup_sources_file($external_db_id); - } - if(!defined($priority_xref_source_id{$xref_id})){ - - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" . "MISC" . "\t" - . "No match\n"; - -#dump out dependencies aswell - - if(!defined($accession) or $accession eq ""){ - print "No accession for ".($xref_id+$xref_id_offset)."\t4\n"; - } - $XXXxref_id_to_accession{$xref_id} = $accession; - $XXXxref_id_to_source_id{$xref_id} = $source_id; - $self->dump_all_dependencies($xref_id, $xref_id_offset); - } - - } - } - $sth->finish; - } - close(XREF); - - my $file = $self->core->dir() . "/xref_no_triage.txt"; - - if(-s $file){ - my $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE xref"); - print "Uploading data in $file to xref\n"; - $sth->execute(); - } - else{ - print "NO file or zero size file, so not able to load file $file to xref\n"; - } - -} - -sub count_unmapped_reasons{ - # Returns the number of entries in the unmapped_reason table - my $self = shift; - my $sth = $self->core->dbc->prepare - ("select count(*) from unmapped_reason" ); - $sth->execute(); - my $count = ($sth->fetchrow_array)[0]; - $sth->finish; - return $count; -} - -sub unmapped_data_for_prioritys{ - - my $self= shift; - my $xref_id_offset = $self->xref_id_offset(); - my $ensembl = $self->core; - my $xref = $self->xref; - my $dir = $ensembl->dir(); - my %cutoff_2_failed_id=(); - - my $summary_failed = "Failed to match at thresholds"; - my $summary_missed = "Failed to match"; - my $description_missed = "Unable to match to any ensembl entity at all"; - - - #unmapped objects. Need to write these for the priority_xrefs. - # use priority_seenit and priority_failed - - open(UNMAPPED_OBJECT,">$dir/unmapped_object_priority.txt") - || die "Could not open unmapped_object_priority.txt"; - - open(XREF_P,">$dir/unmapped_xref_priority.txt") - || die "Could not open unmapped_xref_priority.txt"; - - #for each priority source that is primary :- - # get all xref,acc for this - # if seentit ignore - # else if priotrity_failed - #write unmapped object. - - my $sth = $self->core->dbc->prepare("select unmapped_reason_id from unmapped_reason where full_description like '".$description_missed."'"); - $sth->execute(); - my $xref_missed_id; - $sth->bind_columns(\$xref_missed_id); - $sth->fetch; - if(!defined($xref_missed_id)){ - print STDERR "Could not find the description:\n"; - print STDERR $description_missed,"\n"; - print STDERR "In the directory ensembl/misc-scripts/unmapped_reason you "; - print STDERR "can add the new reason to the unmapped_reason.txt file "; - print STDERR "and run the update_unmapped_reasons.pl script to update "; - print STDERR "your core database\n"; - print STDERR "Alterntively do not add the triage data and add -notriage to the command line.\n"; - die(); - } - $sth->finish; - - - $sth = $self->core->dbc->prepare("select MAX(unmapped_object_id) ". - "from unmapped_object"); - $sth->execute(); - my $max_unmapped_object_id; - $sth->bind_columns(\$max_unmapped_object_id); - $sth->fetch; - if(!defined($max_unmapped_object_id)){ - $max_unmapped_object_id = 1; - } - $sth->finish; - - my $primary_sql= (<<PSQL); - SELECT DISTINCT(s.source_id), px.sequence_type - FROM source s, primary_xref px, xref x - WHERE x.xref_id = px.xref_id - AND s.source_id = x.source_id -PSQL - - my $psth = $self->xref->dbc->prepare($primary_sql) || die "prepare failed"; - $psth->execute() || die "execute failed"; - - my @primary_sources =(); - my %source_2_seqtype=(); - - my ($prim, $seq_type); - $psth->bind_columns(\$prim,\$seq_type); - while($psth->fetch()){ - if(defined($priority_source_id_to_name{$prim})){ - push @primary_sources, $prim; - $source_2_seqtype{$prim} = $seq_type; - } - } - - my $xref_PROT_analysis = $self->get_analysis_id("translation"); - my $xref_DNA_analysis = $self->get_analysis_id("transcript"); - - foreach my $source_id (@primary_sources){ #priority primary sources - my $old_source_name = $priority_source_id_to_name{$source_id}; - my $external_db_id = $source_to_external_db{$source_id}; - my $xref_sql = "SELECT xref_id, accession, version, label, description from xref where xref.source_id = $source_id"; - my $sth = $self->xref->dbc->prepare($xref_sql) || die "prepare failed"; - $sth->execute() || die "execute failed"; - my ($xref_id, $acc, $ver, $lab, $desc); - $sth->bind_columns(\$xref_id, \$acc, \$ver, \$lab, \$desc); - while($sth->fetch()){ - my $key = $old_source_name.":".$acc; - if(defined($priority_seenit{$key})){ - next; - } - $priority_seenit{$key} = 1; - if(defined($priority_failed{$key})){ - my ($ensembl_type,$ensembl_id,$q_perc,$t_perc,$q_cut,$t_cut) = - split(/\|/,$priority_failed{$key}); - - if(!defined($cutoff_2_failed_id{$q_cut."_".$t_cut})){ - $cutoff_2_failed_id{$q_cut."_".$t_cut} = $self->get_failed_id($q_cut, $t_cut, $summary_failed); - } - $max_unmapped_object_id++; - - print UNMAPPED_OBJECT $max_unmapped_object_id."\txref\t"; - - if($ensembl_type =~ /Translation/){ - print UNMAPPED_OBJECT $xref_PROT_analysis."\t"; - } - elsif($ensembl_type =~ /Transcript/){ - print UNMAPPED_OBJECT $xref_DNA_analysis."\t"; - } - else{ - die "type=*".$ensembl_type."*\n".$failed_xref_mappings{$xref_id}."\n"; - } - print UNMAPPED_OBJECT $external_db_id."\t".$acc."\t".$cutoff_2_failed_id{$q_cut."_".$t_cut}."\t"; - print UNMAPPED_OBJECT $q_perc."\t".$t_perc."\t"; - print UNMAPPED_OBJECT $ensembl_id."\t".$ensembl_type."\n"; - print XREF_P ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $acc || '' ) . "\t" - . ( $lab || '' ) . "\t" - . ( $ver || 0 ) . "\t" - . ( $desc || '' ) . "\t" . "MISC" . "\t" - . "No match over threshold\n"; - } - else{ - $max_unmapped_object_id++; - print UNMAPPED_OBJECT $max_unmapped_object_id."\txref\t"; - if($source_2_seqtype{$source_id} =~ /peptide/){ - print UNMAPPED_OBJECT $xref_PROT_analysis."\t"; - } - elsif($source_2_seqtype{$source_id} =~ /dna/){ - print UNMAPPED_OBJECT $xref_DNA_analysis."\t"; - } - print UNMAPPED_OBJECT $external_db_id."\t".$acc."\t"; - print UNMAPPED_OBJECT $xref_missed_id."\t0\t0\t0\t\\N\n"; - print XREF_P ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $acc || '' ) . "\t" - . ( $lab || '' ) . "\t" - . ( $ver || 0 ) . "\t" - . ( $desc || '' ) . "\t" . "MISC" . "\t" - . "No match\n"; - } - } - } - close UNMAPPED_OBJECT; - close XREF_P; - - my $file = $dir."/unmapped_xref_priority.txt"; - - if(-s $file){ - my $sth = $ensembl->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE xref"); - print "Uploading data in $file to xref\n"; - $sth->execute(); - } - $file = $dir."/unmapped_object_priority.txt"; - - if(-s $file){ - my $sth = $ensembl->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE unmapped_object"); - print "Uploading data in $file to unmapped_object\n"; - $sth->execute(); - } - - -} - - - - -=head2 write_dependent_unmapped_objects - - Description: wrote and load the unmapped object for the dependent xrefs - Exceptions : none - Returntype : none - -=cut - -sub write_dependent_unmapped_objects{ - my $self = shift; - my $xref_id_offset = $self->xref_id_offset(); - my $ensembl = $self->core; - my $dir = $ensembl->dir(); - - open(UNMAPPED_OBJECT,">$dir/unmapped_object_dependents.txt") - || die "Could not open unmapped_object_dependents.txt"; - - open(XREF,">$dir/dependent_xref.txt") - || die "Could not open dependent_xref.txt"; - - - my $description_missed = "Unable to match as parent xref was not mapped"; - - my $sth = $self->core->dbc->prepare("select unmapped_reason_id from unmapped_reason where full_description like '".$description_missed."'"); - $sth->execute(); - my $xref_missed_id; - $sth->bind_columns(\$xref_missed_id); - $sth->fetch; - if(!defined($xref_missed_id)){ - print STDERR "Could not find the description:\n"; - print STDERR $description_missed,"\n"; - print STDERR "In the directory ensembl/misc-scripts/unmapped_reason you "; - print STDERR "can add the new reason to the unmapped_reason.txt file "; - print STDERR "and run the update_unmapped_reasons.pl script to update "; - print STDERR "your core database\n"; - print STDERR "Alterntively do not add the triage data and add -notriage to the command line.\n"; - die(); - } - $sth->finish; - - $sth = $self->core->dbc->prepare("select MAX(unmapped_object_id) ". - "from unmapped_object"); - $sth->execute(); - my $max_unmapped_object_id; - $sth->bind_columns(\$max_unmapped_object_id); - $sth->fetch; - if(!defined($max_unmapped_object_id)){ - $max_unmapped_object_id = 1; - } - $sth->finish; - - my $primary_sql= (<<PSQL); - SELECT DISTINCT(s.source_id), px.sequence_type - FROM source s, primary_xref px, xref x - WHERE x.xref_id = px.xref_id - AND s.source_id = x.source_id -PSQL - - my $psth = $self->xref->dbc->prepare($primary_sql) || die "prepare failed"; - $psth->execute() || die "execute failed"; +=head2 new - my %source_2_seqtype=(); + Description: Constructor for BasicMapper. + Returntype : BasicMapper + Exceptions : none + Caller : general - my ($prim, $seq_type); - $psth->bind_columns(\$prim,\$seq_type); - while($psth->fetch()){ - $source_2_seqtype{$prim} = $seq_type; - } +=cut - my $xref_PROT_analysis = $self->get_analysis_id("translation"); - my $xref_DNA_analysis = $self->get_analysis_id("transcript"); +sub new{ + my($class, @args) = @_; - # $unmapped_primary_xref{$xref_id} = "QZ1234"; + my $self ={}; + bless $self,$class; + return $self; +} - my $sql = (<<SQL); - SELECT DISTINCT(x.xref_id), x2.source_id, dx.master_xref_id, x.accession, x.label, - x.description, x.source_id,x.version, dx.linkage_annotation - FROM dependent_xref dx, xref x, xref x2 - WHERE x2.xref_id = dx.master_xref_id AND x.xref_id=dx.dependent_xref_id AND master_xref_id = ?"; -SQL - my $dep_sth = $self->xref->dbc->prepare($sql); +=head2 xref - foreach my $key (keys %unmapped_primary_xref){ - $dep_sth->execute($key); - my ($xref_id, $master_source, $accession, $version, $label, $description, $source_id, - $species_id, $master_xref_id, $linkage_annotation); + Arg [1] : (optional) + Example : $mapper->core($new_core); + Description: Getter / Setter for the core. + info for the xref database. + Returntype : XrefMapper::db + Exceptions : none - $dep_sth->bind_columns(\$xref_id, \$master_source, \$master_xref_id, \$accession, \$label, - \$description, \$source_id, \$version, \$linkage_annotation); +=cut - while ($dep_sth->fetch()) { +sub xref{ + my ($self, $arg) = @_; + (defined $arg) && + ($self->{_xref} = $arg ); + return $self->{_xref}; +} - if(defined($priority_xref_source_id{$xref_id})){ #priority ones already done - next; - } +=head2 core - my $external_db_id = $source_to_external_db{$source_id}; - next if (!$external_db_id); + Arg [1] : (optional) + Example : $mapper->core($new_core); + Description: Getter / Setter for the core. + info for the ensembl core database. + Returntype : XrefMapper::db + Exceptions : none +=cut - if (!$xrefs_written{$xref_id}) { +sub core{ + my ($self, $arg) = @_; - print XREF ( $xref_id + $xref_id_offset ) . "\t" - . ( $external_db_id || '' ) . "\t" - . ( $accession || '' ) . "\t" - . ( $label || '' ) . "\t" - . ( $version || 0 ) . "\t" - . ( $description || '' ) . "\t" - . "DEPENDENT" . "\t" - . "No mapping " - . $unmapped_primary_xref{$xref_id} . "\n"; + (defined $arg) && + ($self->{_core} = $arg ); + return $self->{_core}; +} - $xrefs_written{$xref_id} = 1; - } - if(!defined($object_succesfully_mapped{$xref_id})){ - $max_unmapped_object_id++; - print UNMAPPED_OBJECT $max_unmapped_object_id."\txref\t"; - if($source_2_seqtype{$master_source} =~ /peptide/){ - if (not defined $xref_PROT_analysis) { - $xref_PROT_analysis = $self->get_analysis_id("translation"); - } - print UNMAPPED_OBJECT $xref_PROT_analysis."\t"; - } - elsif($source_2_seqtype{$master_source} =~ /dna/){ - if (not defined $xref_DNA_analysis) { - $xref_DNA_analysis = $self->get_analysis_id("transcript"); - } - print UNMAPPED_OBJECT $xref_DNA_analysis."\t"; - } - else{ - die "COULD NOT get an analysis for source id = $master_source ".$unmapped_primary_xref{$master_xref_id}."\n"; - } - print UNMAPPED_OBJECT $external_db_id."\t".$accession."\t"; - print UNMAPPED_OBJECT $xref_missed_id."\t0\t0\t0\t\\N\t". - $unmapped_primary_xref{$master_xref_id}. "\n"; - } - } - } - close UNMAPPED_OBJECT; - close XREF; - my $file = $dir."/dependent_xref.txt"; - if(-s $file){ - my $sth = $ensembl->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE xref"); - print "Uploading data in $file to xref\n"; - $sth->execute(); - } - $file = $dir."/unmapped_object_dependents.txt"; +sub xref_latest_status { + my $self = shift; + my $verbose = shift || 0; - if(-s $file){ - my $sth = $ensembl->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE unmapped_object"); - print "Uploading data in $file to unmapped_object\n"; - $sth->execute(); + my $sth = $self->xref->dbc->prepare("select id, status, date from process_status order by id"); + + $sth->execute(); + my ($xref_id, $acc); + my ($id, $status, $date); + $sth->bind_columns(\$id, \$status,\$date); + while($sth->fetch){ + print "$status\t$date\n" if($verbose); } + return $status; } -sub get_list_of_sources_for_one_max_per_transcript{ +sub process_file { my $self = shift; + my $file = shift; - my @list; - return @list; -} + open(FILE, $file) or die ("\nCannot open input file '$file':\n $!\n"); + + my $xref=undef; + my $ensembl=undef; + my $type; + + my %xref_hash=(); + my %species_hash=(); + + while( my $line = <FILE> ) { + + chomp($line); + next if $line =~ /^#/; + next if !$line; + + # print $line."\n"; + my ($key, $value) = split("=",$line); + if($key eq "species"){ + $type = "species"; + $species_hash{'species'} = $value; + } + elsif($key eq "xref"){ + $type = "xref"; + } + elsif($type eq "species"){ # processing species data + $species_hash{lc($key)} = $value; + } + elsif($type eq "xref"){ # processing xref data + $xref_hash{lc($key)} = $value; + } + } + -sub check_special_sources { - my $self = shift; + my $value = $species_hash{'species'}; + if ($value !~ /_/) { + print STDERR "\'$value\' is not a recognised species - please use full species name (e.g. homo_sapiens) in $file\n"; + exit(1); + } + + my $mapper; + my $module; + my $class = "XrefMapper/$value.pm"; + eval { + require $class; + }; + if($@) { + if ($@ =~ /Can\'t locate $class/) { + warn("Did not find a specific mapping module XrefMapper::$value - using XrefMapper::BasicMapper instead\n"); + require XrefMapper::BasicMapper; + $module = "BasicMapper"; + } else { + die "$@"; + } + + } else{ + $module = $value; + } + + + $mapper = "XrefMapper::$module"->new(); - print "WARNING check_special_sources not ran need to sort this out\n"; - return; - # 1) get the external_db_id; - my @list = $self->get_list_of_sources_for_one_max_per_transcript(); + if(defined($xref_hash{host})){ + my ($host, $user, $dbname, $pass, $port); + $host = $xref_hash{'host'}; + $user = $xref_hash{'user'}; + $dbname = $xref_hash{'dbname'}; + if(defined($xref_hash{'password'})){ + $pass = $xref_hash{'password'}; + } + else{ + $pass = ''; + } + if(defined($xref_hash{'port'})){ + $port = $xref_hash{'port'}; + } + else{ + $port = 3306; + } + + $xref = new XrefMapper::db(-host => $host, + -port => $port, + -user => $user, + -pass => $pass, + -group => 'core', + -dbname => $dbname); + + $mapper->xref($xref); + if(defined($xref_hash{'dir'})){ + $xref->dir($xref_hash{'dir'}); + } + + } + else{ + die "No host name given for xref database\n"; + } + + + if(defined($species_hash{'species'})){ - my $ex_db_sql = 'SELECT e.external_db_id, e.db_name from external_db e where e.db_name like ?'; + my ($host, $port, $user, $dbname, $pass); + $host = $species_hash{'host'}; + $user = $species_hash{'user'}; + $dbname = $species_hash{'dbname'}; + if(defined($species_hash{'password'})){ + $pass = $species_hash{'password'}; + } + else{ + $pass = ''; + } + if(defined($species_hash{'port'})){ + $port = $species_hash{'port'}; + } + else{ + $port = ''; + } + my $core = new XrefMapper::db(-host => $host, + -port => $port, + -user => $user, + -pass => $pass, + -group => 'core', + -dbname => $dbname); + + $mapper->core($core); - my $mult_sql = (<<MULT); - SELECT ox.ensembl_id, ox.ensembl_object_type, count(*) AS count - FROM object_xref ox, xref x - WHERE x.xref_id = ox.xref_id AND - x.external_db_id = ? - GROUP BY ox.ensembl_id, ox.ensembl_object_type HAVING count >= 2; -MULT + if(defined($species_hash{'dir'})){ + $core->dir($species_hash{'dir'}); + } + + $core->species($value); + } - my $xref_sql = (<<XREFS); - SELECT x.xref_id, ox.object_xref_id, x.info_type - FROM xref x, object_xref ox - WHERE x.xref_id = ox.xref_id AND - ox.ensembl_object_type = ? AND - ox.ensembl_id = ? AND - x.external_db_id = ? -XREFS + - my $primary_sql = (<<PRIMARY); - SELECT i.ensembl_identity, i.xref_identity - FROM identity_xref i - WHERE i.object_xref_id = ? -PRIMARY + return $mapper; +} - my $dependent_sql = (<<DEPEND); - SELECT i.ensembl_identity, i.xref_identity - FROM identity_xref_temp i - WHERE i.object_xref_id = ? -DEPEND +=head2 dumpcheck + Arg [1] : (optional) + Example : $mapper->dumpcheck("yes"); + Description: Getter / Setter for dumpcheck. + If set the mapper will not dump fasta files + if they exist already. + Returntype : scalar + Exceptions : none - my $temp_loaded = 0; - my $total_transcripts_altered = 0; - my $total_object_xrefs_removed = 0; +=cut - my $outfile = $self->core->dir."/delete_multi_source.sql"; - open (DOI, ">".$outfile) || die "Could not open $outfile\nAHHHH\n"; +sub dumpcheck { + my ($self, $arg) = @_; - foreach my $source (@list){ - my $transcripts_altered = 0; - my $object_xrefs_removed = 0; - my $ex_db=0; + (defined $arg) && + ($self->{_dumpcheck} = $arg ); + return $self->{_dumpcheck}; +} - print "Looking up source $source with sql $ex_db_sql\n"; - my $ex_db_sth = $self->core->dbc->prepare($ex_db_sql); - $ex_db_sth->execute($source) || die "problem executing $ex_db_sql"; - my $name; - $ex_db_sth->bind_columns(\$ex_db,\$name) || die "problem binding $ex_db_sql\n"; - - $ex_db_sth->fetch(); - $ex_db_sth->finish; - if(!defined($ex_db) or $ex_db == 0){ - die "Could not find external_db_id for $source\n"; - } - else{ - print "making sure one $source ($ex_db) per transcript\n"; - } - # 2) get list of ensembl entities to check - my @mult_list; - my $type; - - my $mult_sth = $self->core->dbc->prepare($mult_sql); - my ($ens_id, $junk); - $mult_sth->execute($ex_db); - $mult_sth->bind_columns(\$ens_id, \$type, \$junk); - - while ($mult_sth->fetch()) { - push @mult_list, $ens_id; - } - $mult_sth->finish; - +sub species_id { + my ($self, $arg) = @_; - print "\tFound ".scalar(@mult_list)." ".$type."'s with more than one $source ($ex_db) xref\n"; + (defined $arg) && + ($self->{_species_id} = $arg ); + return $self->{_species_id}; +} + +sub get_id_from_species_name { + my ($self, $species_name) = @_; + + my $sql = "select species_id from species where name = '".$species_name."'"; + my $sth = $self->xref->dbc->prepare($sql); + $sth->execute(); + my @row = $sth->fetchrow_array(); + my $species_id; + if (@row) { + $species_id = $row[0]; + } else { + print STDERR "Couldn't get ID for species ".$species_name."\n"; + print STDERR "It must be one of :-\n"; + $sql = "select name from species"; + $sth = $self->dbc->prepare($sql); + $sth->execute(); + while(my @row = $sth->fetchrow_array()){ + print STDERR $row[0]."\n"; + } + die("Please try again :-)\n"); + } + $sth->finish(); + + return $species_id; - # 3) for each ensembl object to that has more than one xref of this type, try to remove the - # weakest link. - - - # get the priority for the sources (may be used for test criteria) NOT SURE YET - - - my $depend_sth; - my $seq_sth; -# my $i =0; - foreach my $ens (@mult_list){ - - ########################################### - # get all the xrefs of this ex_db with %ids - ########################################### -# if($i < 5){ -# print "processing ensembl $type:$ens\n"; -# } - my $xref_sth = $self->core->dbc->prepare($xref_sql); - $xref_sth->execute($type,$ens,$ex_db); - my ($xref_id, $object_xref_id, $xref_info_type); - $xref_sth->bind_columns(\$xref_id, \$object_xref_id, \$xref_info_type); - my %object_xref; - my %info_type; - my %percent_id_total; - while($xref_sth->fetch()){ - $object_xref{$xref_id} = $object_xref_id; - $info_type{$xref_id} = $xref_info_type - } - $xref_sth->finish; - my $last = 0; - foreach my $xref (keys %object_xref){ - if($info_type{$xref} eq "PROJECTION"){ - $percent_id_total{$xref} = 0; - } - elsif($info_type{$xref} eq "DIRECT"){ - $percent_id_total{$xref} = 200; - } - elsif($info_type{$xref} eq "SEQUENCE_MATCH"){ - if($last != 1){ - $seq_sth = $self->core->dbc->prepare($primary_sql); - $last = 1; - } - $seq_sth->execute($object_xref{$xref}); - my ($target_id, $query_id); - $seq_sth->bind_columns(\$target_id, \$query_id); - while($seq_sth->fetch()){ - $percent_id_total{$xref_id} = $target_id + $query_id; - } -# $seq_sth->finish - } - elsif($info_type{$xref} eq "DEPENDENT"){ - if(!$temp_loaded){ - my $dir = $self->core->dir(); - my $file = $dir."/identity_xref_temp.txt"; - - my $sth = $self->core->dbc->prepare("create table identity_xref_temp like identity_xref"); - print "creating table identity_xref_temp\n"; - $sth->execute() || die "Could not \ncreate table identity_xref_temp like identity_xref\n"; - $sth->finish; - if(-s $file){ - my $temp_sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE identity_xref_temp"); - print "Uploading data in $file to identity_xref_temp\n"; - $temp_sth->execute(); -# $temp_sth->finish; - } - else{ - print "NO file or zero size file, so not able to load file $file to identity_xref_temp\n"; - } - $temp_loaded = 1; - } - if($last != 2){ - $depend_sth = $self->core->dbc->prepare($dependent_sql); - $last = 2; - } - $depend_sth->execute($object_xref{$xref}); - my ($target_id, $query_id); - $depend_sth->bind_columns(\$target_id, \$query_id); - if($depend_sth->fetch()){ - $percent_id_total{$xref} = $target_id + $query_id; - } - else{ - print "No percent_id for object_xref ".$object_xref{$xref}."\n"; - } - } - else{ - print STDERR "hmm ".$info_type{$xref}." not on list??\n"; - } - } # end foreach my $xref - - - #################### - # find the best one - #################### - - my $best =0; - foreach my $key (keys %object_xref){ - if($best < $percent_id_total{$key}){ - $best = $percent_id_total{$key}; - } - } - - # go through others again and if < best remove from object_xref and identity_xref - - my $alt=0; - foreach my $key (keys %object_xref){ - if($best > $percent_id_total{$key}){ - $alt = 1; - $object_xrefs_removed++; - print DOI "delete o from object_xref o where o.object_xref_id = ".$object_xref{$key}."\n"; - print DOI "delete i from identity_xref i where i.object_xref_id = ".$object_xref{$key}."\n"; - } - } - if($alt){ - $transcripts_altered++; - } - - - }# end foreach my $ens multi_list - print "For source $source:\n"; - print "\t$object_xrefs_removed object_xrefs removed\n"; - print "\t$transcripts_altered transcripts affected by this\n"; - $total_transcripts_altered += $transcripts_altered; - $total_object_xrefs_removed += $object_xrefs_removed; - } # end of foreach source - if($temp_loaded){ - $self->core->dbc->do("DROP TABLE identity_xref_temp"); - } - close DOI; - # process new sql file - - if($total_object_xrefs_removed){ # no point doing owt if file is empty - #process the sql file - } } + + # # official naming # +sub get_official_name { + return undef; +} + sub official_naming { my $self = shift; - my $dbname = $self->get_canonical_name(); - - + my $dbname = $self->get_official_name(); + + if(!defined($dbname)){ + return; + } print "Official naming started. Copy $dbname from gene to canonical transcript\n"; my ($max_object_xref_id, $max_xref_id); - my $sth = $self->core->dbc->prepare("SELECT MAX(object_xref_id) FROM object_xref"); + + $self->species_id($self->get_id_from_species_name($self->core->species)); + + my $sth = $self->xref->dbc->prepare("SELECT MAX(object_xref_id) FROM object_xref"); $sth->execute(); $sth->bind_columns(\$max_object_xref_id); $sth->fetch; - - $sth = $self->core->dbc->prepare("SELECT MAX(xref_id) FROM xref"); + + + $sth = $self->xref->dbc->prepare("SELECT MAX(xref_id) FROM xref"); $sth->execute(); $sth->bind_columns(\$max_xref_id); $sth->fetch; - my $object_xref_id = $max_object_xref_id + 1; - if($object_xref_id == 1){ + +######################################################## +# Copy $dbname from gene to the canonical transcripts. # +######################################################## + + # remove the ignore later on after testing + my $sth_add_ox = $self->xref->dbc->prepare("insert ignore into object_xref (object_xref_id, xref_id, ensembl_id, ensembl_object_type, linkage_type, ox_status) values(?, ?, ?, 'Transcript', ?, ?)"); + +# my $object_xref_id = $max_object_xref_id + 1; + + $max_object_xref_id++; + + if($max_object_xref_id == 1){ die "max_object_xref_id should not be 1\n"; } - my $sql = "select gene_id, canonical_transcript_id from gene"; - my $object_sql = (<<FSQL); -select x.xref_id, o.ensembl_id - from xref x, external_db e, object_xref o - where x.external_db_id = e.external_db_id and - e.db_name like "$dbname" and +select x.xref_id, o.ensembl_id, o.linkage_type, o.ox_status + from xref x, source s, object_xref o + where x.source_id = s.source_id and + o.ox_status = "DUMP_OUT" and + s.name like "$dbname" and o.xref_id = x.xref_id and o.ensembl_object_type = "Gene"; FSQL - - my $file = $self->core->dir()."/".$dbname."_transcript_data.sql"; - - open(OFDN_TRAN,">$file") || die "Could not open $file"; - + my $sql = "select gene_id, canonical_transcript_id from gene"; $sth = $self->core->dbc->prepare($sql); $sth->execute(); my ($gene_id, $tran_id); $sth->bind_columns(\$gene_id,\$tran_id); - my %gene_to_tran; + my %gene_to_tran_canonical; while ($sth->fetch){ - $gene_to_tran{$gene_id} = $tran_id; + $gene_to_tran_canonical{$gene_id} = $tran_id; } $sth->finish; - $sth = $self->core->dbc->prepare($object_sql); + print "Pre copy all HGNC from gene to canonical transcripts\n"; + + $self->biomart_test(); + + $sth = $self->xref->dbc->prepare($object_sql); $sth->execute(); - my ($xref_id); - $sth->bind_columns(\$xref_id, \$gene_id); + my ($xref_id, $linkage_type, $ox_status); + $sth->bind_columns(\$xref_id, \$gene_id, \$linkage_type, \$ox_status); + while ($sth->fetch){ - if(defined($gene_to_tran{$gene_id})){ - print OFDN_TRAN $object_xref_id++,"\t", - $gene_to_tran{$gene_id}, - "\tTranscript\t", - $xref_id,"\\N\n"; + if(defined($gene_to_tran_canonical{$gene_id})){ + $sth_add_ox->execute($max_object_xref_id++, $xref_id, $gene_to_tran_canonical{$gene_id}, $linkage_type, $ox_status) || print "(Gene id - $gene_id) Could not add $max_object_xref_id, .".$gene_to_tran_canonical{$gene_id}.", $xref_id, $linkage_type, $ox_status to object_xref\n"; } else{ print STDERR "Could not find canonical for gene $gene_id\n"; @@ -5376,69 +383,41 @@ FSQL } $sth->finish; - close (OFDN_TRAN); + print "Post copy all HGNC from gene to canonical transcripts\n"; -# -# import data -# - $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$file\' IGNORE INTO TABLE object_xref"); - print "Uploading data in $file to object_xref\n"; - $sth->execute() || die "error loading file $file\n"; + $self->biomart_test(); ########################################################################################## # HGNC/MGI synonyms are special as they are only on some prioritys so copy all the synonyms -# from the xref database to the core database. Granted some will already be there but use +# from the xref database to the xref database. Granted some will already be there but use # update ignore just in case. ########################################################################################### -#get the xref synonyms store as a hash of arrays. - - - $sql = 'select distinct(s.synonym), x.accession from xref x, synonym s, source so where so.source_id = x.source_id and x.xref_id = s.xref_id and so.name like "'.$dbname.'"'; - -#+--------------------------+-----------+ -#| synonym | accession | -#+--------------------------+-----------+ -#| FWP007 | 7 | -#| S863-7 | 7 | -#| CPAMD5 | 7 | -#| AACT | 16 | -#| ACT | 16 | - - - $sth = $self->xref->dbc->prepare($sql); - - $sth->execute(); - my ($syn, $acc); - $sth->bind_columns(\$syn,\$acc); - my %acc_to_syn; - while ($sth->fetch){ - push @{$acc_to_syn{$acc}}, $syn; - } - $sth->finish; + #get the xref synonyms store as a hash of arrays. + my $syn_hash = $self->get_official_synonyms(); + $sql = "insert into synonym (xref_id, synonym) values (?, ?)"; + my $add_syn_sth = $self->xref->dbc->prepare($sql); + # - # get the OFDN (HGNC/MGI) xrefs in the xrefs and add the synonyms, plus cretae hash to get id from the display name + # get the OFDN (HGNC/MGI) xrefs in the xrefs and add the synonyms, plus create hash to get id from the display name # This is becouse not all MGI.HGNC's are loaded as these are priority xrefs. # my %display_label_to_id; - - # NOTE synonyms done first so that if the same name is in synonym and display_label later is kept. - $sql = 'select x.accession, sy.synonym from synonym sy, xref x, source so where x.xref_id = sy.xref_id and so.source_id = x.source_id and so.name like "'.$dbname.'"'; $sth = $self->xref->dbc->prepare($sql); $sth->execute(); - my ($display_label); - $sth->bind_columns(\$acc,\$display_label); + my ($display_label, $acc, $syn,); + $sth->bind_columns(\$acc,\$display_label); while($sth->fetch){ $display_label_to_id{$display_label} = $acc; } @@ -5457,43 +436,33 @@ FSQL } $sth->finish; - # oh yes and synonyms aswell while we are at it. - my $odn_file = $self->core->dir()."/".$dbname."_syn.txt"; - open(SYN, ">$odn_file" ) || die "Could not open file $odn_file"; - - $sql = 'select x.dbprimary_acc, x.xref_id, x.display_label from xref x, external_db e where e.external_db_id = x.external_db_id and e.db_name like "'.$dbname.'"'; - - - $sth = $self->core->dbc->prepare($sql); - + my %synonym; + $sth = $self->xref->dbc->prepare('select es.synonym, x.label from synonym es, xref x, source s where x.xref_id = es.xref_id and x.source_id = s.source_id and + s.name = "'.$dbname.'"' ); $sth->execute(); - $sth->bind_columns(\$acc,\$xref_id,\$display_label); + my ($name); + $sth->bind_columns(\$syn,\$name); while($sth->fetch){ - $display_label_to_id{$display_label} = $acc; - if(defined($acc_to_syn{$acc})){ - foreach my $a ( @{$acc_to_syn{$acc}} ){ - print SYN "$xref_id\t$a\n"; - } - } + $synonym{$syn} = $name; + } + $sth->finish; + + $sth = $self->xref->dbc->prepare('select es.synonym, x.label from synonym es, xref x, source s where x.xref_id = es.xref_id and x.source_id = s.source_id and + s.name = "EntrezGene"' ); + $sth->execute(); + $sth->bind_columns(\$syn,\$name); + while($sth->fetch){ + $synonym{$syn} = $name; } $sth->finish; - - close SYN; - # - # import HGNC/MGI synonyms - # - $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$odn_file\' IGNORE INTO TABLE external_synonym"); - print "Uploading data in $odn_file to external_synonym\n"; - $sth->execute() || die "error loading file $odn_file\n"; - $sth->finish; - + ####################### #Do the naming bit now. @@ -5502,18 +471,13 @@ FSQL # get the vega external_sources - my $xref_file = $self->core->dir()."/xref_vega_names.txt"; - my $object_xref_file = $self->core->dir()."/object_xref_vega_names.txt"; - - open(XREF, ">$xref_file" ) || die "Could not open file $xref_file"; - open(OBJECT_XREF, ">$object_xref_file") || die "Could not open file $object_xref_file"; - - my ($odn_curated_gene_id, $odn_automatic_gene_id, $clone_based_vega_gene_id, $clone_based_ensembl_gene_id); my ($odn_curated_tran_id, $odn_automatic_tran_id, $clone_based_vega_tran_id, $clone_based_ensembl_tran_id); + my $odn_id; - $sth = $self->core->dbc->prepare("select external_db_id from external_db where db_name like ?"); + $sth = $self->xref->dbc->prepare("select source_id from source where name like ?"); + $sth->execute($dbname."_curated_gene"); $sth->bind_columns(\$odn_curated_gene_id); $sth->fetch; @@ -5544,8 +508,6 @@ FSQL } - - $sth->execute($dbname."_curated_transcript"); $sth->bind_columns(\$odn_curated_tran_id); $sth->fetch; @@ -5582,14 +544,19 @@ FSQL ########################### - my $del_vega_sql = "delete o from object_xref o, xref x where x.xref_id = o.xref_id and x.external_db_id in ( $odn_curated_gene_id, $odn_automatic_gene_id, $clone_based_vega_gene_id, $clone_based_ensembl_gene_id,$odn_automatic_tran_id, $clone_based_ensembl_tran_id)"; + my $del_vega_sql = "delete o from object_xref o, xref x where x.xref_id = o.xref_id and x.source_id in ( $odn_curated_gene_id, $odn_automatic_gene_id, $clone_based_vega_gene_id, $clone_based_ensembl_gene_id,$odn_automatic_tran_id, $clone_based_ensembl_tran_id)"; - $sth = $self->core->dbc->prepare($del_vega_sql); + $sth = $self->xref->dbc->prepare($del_vega_sql); $sth->execute(); - $del_vega_sql = "delete x from xref x where x.external_db_id in ($odn_curated_gene_id, $odn_automatic_gene_id, $clone_based_vega_gene_id, $clone_based_ensembl_gene_id,$odn_automatic_tran_id, $clone_based_ensembl_tran_id)"; + $del_vega_sql = "delete x from xref x where x.source_id in ($odn_curated_gene_id, $odn_automatic_gene_id, $clone_based_vega_gene_id, $clone_based_ensembl_gene_id,$odn_automatic_tran_id, $clone_based_ensembl_tran_id)"; - $sth = $self->core->dbc->prepare($del_vega_sql); + $sth = $self->xref->dbc->prepare($del_vega_sql); + $sth->execute(); + + $del_vega_sql = "delete x from xref x where x.source_id = $clone_based_vega_tran_id and x.info_type = 'MISC'"; # original ones added have info type of "DIRECT" + + $sth = $self->xref->dbc->prepare($del_vega_sql); $sth->execute(); @@ -5599,142 +566,133 @@ FSQL ###################################################### - my $ensembl = $self->core; - my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $ensembl->dbc); - my $gene_adaptor = $db->get_GeneAdaptor(); - - if(!defined($max_xref_id) or $max_xref_id == 0){ - die "Sorry dont belives there are no xrefs max = 0\n"; - } - if(!defined($max_object_xref_id) or $max_object_xref_id == 0){ - die "Sorry dont belives there are no object_xrefs max = 0\n"; - } - - - ##################################### - # get synonyms to test NONE problems - ##################################### + my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $self->core->dbc); + my $ga = $db->get_GeneAdaptor(); + +# if(!defined($max_xref_id) or $max_xref_id == 0){ +# die "Sorry i dont believe there are no xrefs max xref = 0\n"; +# } +# if(!defined($max_object_xref_id) or $max_object_xref_id == 0){ +# die "Sorry i dont believe there are no object_xrefs max object_xref = 0\n"; +# } - # - # Maybe i should use the xref synonyms here as there are more??? - # - my %synonym; - $sth = $self->core->dbc->prepare('select es.synonym, x.display_label from external_synonym es, xref x, external_db e where x.xref_id = es.xref_id and x.external_db_id = e.external_db_id and e.db_name = "'.$dbname.'"' ); - $sth->execute(); - my ($name); - $sth->bind_columns(\$syn,\$name); - while($sth->fetch){ - $synonym{$syn} = $name; - } - $sth->finish; - #Some hgnc/mgi synonyms are missing so use entrez gene instead a hack i know but will be fixed later. - $sth = $self->core->dbc->prepare('select es.synonym, x.display_label from external_synonym es, xref x, external_db e where x.xref_id = es.xref_id and x.external_db_id = e.external_db_id and e.db_name = "EntrezGene"' ); - $sth->execute(); - $sth->bind_columns(\$syn,\$name); - while($sth->fetch){ - $synonym{$syn} = $name; - } - $sth->finish; ########################### # Process each Gene ########################### - my @genes = @{$gene_adaptor->fetch_all()}; + + my %gene_to_transcripts; + my %gene_id_to_stable_id; + my %tran_id_to_stable_id; + + $sth = $self->xref->dbc->prepare("select gtt.gene_id, gtt.transcript_id, gsi.stable_id, tsi.stable_id + from gene_transcript_translation gtt, gene_stable_id gsi, transcript_stable_id tsi + where gtt.gene_id = gsi.internal_id and gtt.transcript_id = tsi.internal_id"); + + $sth->execute; + my $gsi; + my $tsi; + $sth->bind_columns(\$gene_id, \$tran_id, \$gsi, \$tsi); + while ($sth->fetch){ + push @{$gene_to_transcripts{$gene_id}}, $tran_id; + $gene_id_to_stable_id{$gene_id} = $gsi; + $tran_id_to_stable_id{$tran_id} = $tsi; + } + + my $total_gene_vega = 0; my $total_gene = 0; my $total_clone_name = 0; my $odn_count = 0; + + my $dbentrie_sth = $self->xref->dbc->prepare("select x.label from xref x, object_xref ox, source s where x.xref_id = ox.xref_id and + x.source_id = s.source_id and s.name = ? and ox.ox_status = 'DUMP_OUT' and ox.ensembl_id = ? and ox.ensembl_object_type = ? "); + + + + $sql = "insert into xref (xref_id, source_id, accession, label, version, species_id, info_type, dumped) values (?, ?, ?, ?, 0, ".$self->species_id.", 'MISC', 0)"; + my $ins_xref_sth = $self->xref->dbc->prepare($sql); + + + # important or will crash and burn!!! + my %xref_added; # store those added already $xref_added{$accession:$source_id} = $xref_id; + + + my $ins_object_xref_sth = $self->xref->dbc->prepare("insert into object_xref (object_xref_id, ensembl_id, ensembl_object_type, xref_id, linkage_type, ox_status) values (?, ?, ?, ?, 'MISC', 'DUMP_OUT')"); - while (my $gene = shift @genes){ - # if($gene->biotype ne "protein_coding"){ - # $types{$gene->biotype}++; - # next; - # } - my @dbentries = @{$gene->get_all_DBEntries()}; + foreach my $gene_id (keys %gene_to_transcripts){ + my @ODN=(); my @VEGA_NAME=(); my $CLONE_NAME = undef; - foreach my $dbe (@dbentries){ - if($dbe->dbname eq $dbname){ - if($dbe->info_text =~ /via ccds/){ # put ccds one as the best one - unshift @ODN , $dbe->display_id; - } - else{ - push @ODN, $dbe->display_id; - } - } + my $display; + $dbentrie_sth->execute($dbname, $gene_id, "Gene"); + $dbentrie_sth->bind_columns(\$display); + while($dbentrie_sth->fetch){ + push @ODN, $display; } + if(scalar(@ODN)){ $odn_count++; } - + my %no_vega; # hash now as we want to sort by stable id and hence need a key value pair my $vega_count = 0; - - - - - # - # Here do sql rather than create transcript and then get dbEntries - # - # Reasons - # 1) faster access (no creation of trasncript objects) - # 2) get only the dbentries we want - - - - - foreach my $tr (@{$gene->get_all_Transcripts}){ + + + my $count = 0; + foreach my $tran_id ( @{$gene_to_transcripts{$gene_id}} ){ my $VEGA = undef; + $count = 0 ; - my $count = 0; - - foreach my $dbe (@{$tr->get_all_DBEntries}){ - if($dbe->dbname eq $dbname."_curated_transcript" or $dbe->dbname eq "Clone_based_vega_transcript"){ - my($hgnc_bit, $num) = split(/-0\d\d/,$dbe->display_id); - if($hgnc_bit =~ /[.]/){ - $CLONE_NAME = $hgnc_bit; - } - else{ - $VEGA = $hgnc_bit; - $vega_count++; - my $multiple = 1; - if(scalar(@VEGA_NAME)){ - foreach my $name (@VEGA_NAME){ - if($name ne $VEGA){ - if(defined($synonym{$name}) and uc($synonym{$name}) eq $VEGA){ - $multiple = 0; - } - if(defined($synonym{$VEGA}) and uc($synonym{$VEGA}) eq uc($name)){ - $multiple = 0; - } - } - else{ - $multiple = 0; - } + $dbentrie_sth->execute($dbname."_curated_transcript", $tran_id, "Transcript"); + $dbentrie_sth->bind_columns(\$display); + while($dbentrie_sth->fetch){ + my($hgnc_bit, $num) = split(/-0\d\d/,$display); + $VEGA = $hgnc_bit; + $vega_count++; + my $multiple = 1; + if(scalar(@VEGA_NAME)){ + foreach my $name (@VEGA_NAME){ + if($name ne $VEGA){ + if(defined($synonym{$name}) and uc($synonym{$name}) eq $VEGA){ + $multiple = 0; } - if($multiple){ - push @VEGA_NAME , $hgnc_bit; + if(defined($synonym{$VEGA}) and uc($synonym{$VEGA}) eq uc($name)){ + $multiple = 0; } } else{ - push @VEGA_NAME , $hgnc_bit; + $multiple = 0; } - } - $count++; - } - } # end of dbentries fro this transcript - + } + if($multiple){ + push @VEGA_NAME , $hgnc_bit; + } + } + else{ + push @VEGA_NAME , $hgnc_bit; + } + $count++; + } + + $dbentrie_sth->execute("Clone_based_vega_transcript", $tran_id, "Transcript"); + $dbentrie_sth->bind_columns(\$display); + while($dbentrie_sth->fetch){ + my($hgnc_bit, $num) = split(/-0\d\d/,$display); + $CLONE_NAME = $hgnc_bit; + $count++; + } if($count == 0){ - $no_vega{$tr->stable_id} = $tr->dbID; + $no_vega{$tran_id_to_stable_id{$tran_id}} = $tran_id; } if($count > 1){ - print "Problem: ".$tr->stable_id." has more than one vega_transcript\n"; + print "Problem: ".$tran_id_to_stable_id{$tran_id}." has more than one vega_transcript\n"; } if($count == 1){ if(defined($VEGA) and scalar(@ODN)){ @@ -5748,10 +706,9 @@ FSQL } } if(!$found){ - print "Problem: ".$gene->stable_id." linked to ".$dbname." (".join(', ',@ODN).") BUT ".$tr->stable_id." linked to vega_transcript $VEGA????\n"; + print "Problem: ".$gene_id_to_stable_id{$gene_id}." linked to ".$dbname." (".join(', ',@ODN).") BUT ".$tran_id_to_stable_id{$tran_id}." linked to vega_transcript $VEGA????\n"; } } -# push @has_vega, $tr->dbID; } } # end for each transcript #################################################################################### @@ -5760,30 +717,42 @@ FSQL # loop through no_vega array and set vega_transcript_like for each starting at 201 #################################################################################### if(scalar(@VEGA_NAME) > 1){ - print "Warning: gene ".$gene->stable_id." has more than one vega_transcript these are (".join(', ',@VEGA_NAME).")\n"; + print "Warning: gene ".$gene_id_to_stable_id{$gene_id}." has more than one vega_transcript these are (".join(', ',@VEGA_NAME).")\n"; } if($vega_count){ foreach my $name (@VEGA_NAME){ - $max_xref_id++; - $max_object_xref_id++; my $id = $display_label_to_id{$name}; if(!defined($id)){ $id = $name; print "Warning Could not find id for $name\n"; } - print XREF $max_xref_id . "\t" - . $odn_curated_gene_id. "\t" . $id . "\t".$name."\t" . "0" . "\t". "\n" ; - print OBJECT_XREF "$max_object_xref_id\t".$gene->dbID."\tGene\t" .$max_xref_id . "\t\\N\n"; + if(!defined($xref_added{$id.":".$odn_curated_gene_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $odn_curated_gene_id, $id, $name); + $xref_added{$id.":".$odn_curated_gene_id} = $max_xref_id; + if(defined($syn_hash->{$name})){ + foreach my $syn (@{$syn_hash->{$name}}){ + $add_syn_sth->execute($xref_id, $syn); + } + } + + } + $max_object_xref_id++; + $ins_object_xref_sth->execute($max_object_xref_id, $gene_id, 'Gene', $xref_added{$id.":".$odn_curated_gene_id}); } my $name = $VEGA_NAME[0]; my $tran_name_ext = 201; foreach my $tran (sort keys %no_vega){ - $max_xref_id++; + my $id = $name."-".$tran_name_ext; + if(!defined($xref_added{$id.":".$odn_automatic_tran_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $odn_automatic_tran_id, $id, $id); + $xref_added{$id.":".$odn_automatic_tran_id} = $max_xref_id; + } $max_object_xref_id++; - print XREF $max_xref_id . "\t" - . $odn_automatic_tran_id. "\t" . $name."-".$tran_name_ext . "\t".$name."-".$tran_name_ext."\t" . "0" . "\t". "\n" ; - print OBJECT_XREF "$max_object_xref_id\t".$no_vega{$tran}."\tTranscript\t" .$max_xref_id . "\t\\N\n"; + $ins_object_xref_sth->execute($max_object_xref_id, $no_vega{$tran}, 'Transcript', $xref_added{$id.":".$odn_automatic_tran_id}); + $tran_name_ext++; } } @@ -5795,26 +764,39 @@ FSQL #################################################################################### elsif(scalar(@ODN)){ foreach my $name (@ODN){ - $max_xref_id++; - $max_object_xref_id++; my $id = $display_label_to_id{$name}; if(!defined($id)){ $id = $name; print "Warning Could not find id for $name\n"; } - print XREF $max_xref_id . "\t" - . $odn_automatic_gene_id. "\t" . $id . "\t".$name."\t" . "0" . "\t". "\n" ; - print OBJECT_XREF "$max_object_xref_id\t".$gene->dbID."\tGene\t" .$max_xref_id . "\t\\N\n"; + + if(!defined($xref_added{$id.":".$odn_automatic_gene_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $odn_automatic_gene_id, $id, $name); + $xref_added{$id.":".$odn_automatic_gene_id} = $max_xref_id; + if(defined($syn_hash->{$name})){ + foreach my $syn (@{$syn_hash->{$name}}){ + $add_syn_sth->execute($xref_id, $syn); + } + } + } + $max_object_xref_id++; + $ins_object_xref_sth->execute($max_object_xref_id, $gene_id, 'Gene', $xref_added{$id.":".$odn_automatic_gene_id}); + + } my $name = $ODN[0]; my $tran_name_ext = 201; foreach my $tran (sort keys %no_vega){ - $max_xref_id++; + my $id = $name."-".$tran_name_ext; + if(!defined($xref_added{$id.":".$odn_automatic_tran_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $odn_automatic_tran_id, $id, $id); + $xref_added{$id.":".$odn_automatic_tran_id} = $max_xref_id; + } $max_object_xref_id++; - print XREF $max_xref_id . "\t" - . $odn_automatic_tran_id. "\t" . $name."-".$tran_name_ext . "\t".$name."-".$tran_name_ext."\t" . "0" . "\t". "\n" ; - print OBJECT_XREF "$max_object_xref_id\t".$no_vega{$tran}."\tTranscript\t" .$max_xref_id . "\t\\N\n"; + $ins_object_xref_sth->execute($max_object_xref_id, $no_vega{$tran}, 'Transcript', $xref_added{$id.":".$odn_automatic_tran_id}); $tran_name_ext++; } } @@ -5827,25 +809,37 @@ FSQL #################################################################################### else{ if(defined($CLONE_NAME)){ - $max_xref_id++; + my $id = $CLONE_NAME; + if(!defined($xref_added{$id.":".$clone_based_vega_gene_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $clone_based_vega_gene_id, $id, $id); + $xref_added{$id.":".$clone_based_vega_gene_id} = $max_xref_id; + } + $max_object_xref_id++; - print XREF $max_xref_id . "\t" - . $clone_based_vega_gene_id. "\t" . $CLONE_NAME . "\t".$CLONE_NAME."\t" . "0" . "\t". "\n" ; - print OBJECT_XREF "$max_object_xref_id\t".$gene->dbID."\tGene\t" .$max_xref_id . "\t\\N\n"; - + $ins_object_xref_sth->execute($max_object_xref_id, $gene_id, 'Gene', $xref_added{$id.":".$clone_based_vega_gene_id}); + + my $tran_name_ext = 201; foreach my $tran (sort keys %no_vega){ - $max_xref_id++; + my $id = $CLONE_NAME."-".$tran_name_ext; + if(!defined($xref_added{$id.":".$clone_based_vega_tran_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $clone_based_vega_tran_id, $id, $id); + $xref_added{$id.":".$clone_based_vega_tran_id} = $max_xref_id; + } + $max_object_xref_id++; - print XREF $max_xref_id . "\t" - . $clone_based_vega_tran_id. "\t" . $CLONE_NAME."-".$tran_name_ext . "\t".$CLONE_NAME."-".$tran_name_ext."\t" . "0" . "\t". "\n" ; - print OBJECT_XREF "$max_object_xref_id\t".$no_vega{$tran}."\tTranscript\t" .$max_xref_id . "\t\\N\n"; + $ins_object_xref_sth->execute($max_object_xref_id, $no_vega{$tran}, 'Transcript', $xref_added{$id.":".$clone_based_vega_tran_id}); + + $tran_name_ext++; } } else{ #get the clone name + my $gene= $ga->fetch_by_dbID($gene_id); my $new_gene = $gene->transform('clone'); my $new_clone_name = undef; if(defined($new_gene)){ @@ -5879,19 +873,30 @@ FSQL } } # store the data - $max_xref_id++; - $max_object_xref_id++; - print XREF $max_xref_id . "\t" - . $clone_based_ensembl_gene_id. "\t" . $new_clone_name . "\t".$new_clone_name."\t" . "0" . "\t". "\n" ; - print OBJECT_XREF "$max_object_xref_id\t".$gene->dbID."\tGene\t" .$max_xref_id . "\t\\N\n"; + + if(!defined($xref_added{$new_clone_name.":".$clone_based_ensembl_gene_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $clone_based_ensembl_gene_id, $new_clone_name, $new_clone_name); + $xref_added{$new_clone_name.":".$clone_based_ensembl_gene_id} = $max_xref_id; + } + $max_object_xref_id++; + $ins_object_xref_sth->execute($max_object_xref_id, $gene->dbID, 'Gene', $xref_added{$new_clone_name.":".$clone_based_ensembl_gene_id}); + my $tran_name_ext = 201; foreach my $tran (sort keys %no_vega){ - $max_xref_id++; + my $id = $new_clone_name."-".$tran_name_ext; + if(!defined($xref_added{$id.":".$clone_based_ensembl_tran_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $clone_based_ensembl_tran_id, $id, $id); + $xref_added{$id.":".$clone_based_ensembl_tran_id} = $max_xref_id; + } + $max_object_xref_id++; - print XREF $max_xref_id . "\t" - . $clone_based_ensembl_tran_id. "\t" . $new_clone_name."-".$tran_name_ext . "\t".$new_clone_name."-".$tran_name_ext."\t" . "0" . "\t". "\n" ; - print OBJECT_XREF "$max_object_xref_id\t".$no_vega{$tran}."\tTranscript\t" .$max_xref_id . "\t\\N\n"; + $ins_object_xref_sth->execute($max_object_xref_id, $no_vega{$tran}, 'Transcript', $xref_added{$id.":".$clone_based_ensembl_tran_id}); + + + $tran_name_ext++; } @@ -5908,80 +913,300 @@ FSQL $total_gene++; # print "Finished Gene ".$gene->stable_id."\t$ODN\t $vega_count\n" } + $add_syn_sth->finish; - close XREF; - close OBJECT_XREF; - - - -# -# import data -# - $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$xref_file\' IGNORE INTO TABLE xref"); - print "Uploading data in $xref_file to xref\n"; - $sth->execute() || die "error loading file $xref_file\n"; - - $sth = $self->core->dbc->prepare("LOAD DATA LOCAL INFILE \'$object_xref_file\' IGNORE INTO TABLE object_xref"); - print "Uploading data in $object_xref_file to object_xref\n"; - $sth->execute() || die "error loading file $object_xref_file\n"; - # # Now check for duplicate clone names as gene names. # - my $clone_file = $self->core->dir()."/clone_names.sql"; - - open(CLONE, ">$xref_file" ) || die "Could not open file $xref_file"; - my $xref_list_sql = 'select x.xref_id, x.dbprimary_acc from external_db e, xref x, gene g, gene_stable_id gsi where e.external_db_id =x.external_db_id and e.db_name like "Clone_based_ensembl_gene" and g.gene_id = gsi.gene_id and g.display_xref_id = x.xref_id and x.display_label = ? order by gsi.stable_id'; - - my $gene_list_sql = 'SELECT DISTINCT(x.display_label), COUNT(*) AS count FROM gene g, xref x, external_db e WHERE e.external_db_id=x.external_db_id AND g.biotype = "protein_coding" AND x.xref_id=g.display_xref_id and e.db_name like "Clone_based_ensembl_gene" GROUP BY x.display_label, e.db_name HAVING COUNT > 1'; + my $xref_list_sql = 'select x.xref_id, x.label, count(*) as count from source s, xref x, object_xref ox where s.source_id = x.source_id and s.name like "Clone_based_ensembl_gene" and x.xref_id = ox.xref_id and ox.ensembl_object_type = "Gene" group by xref_id having count >1'; - my @xref_name_list; - - - $sth = $self->core->dbc->prepare($gene_list_sql); + # may need to add gsi to get the order the same each time + my $object_xref_list = "select object_xref_id, ensembl_id from object_xref where xref_id = ? order by ensembl_id"; # Can only be linked to a gene so no point checking really. + my $ox_list_sth = $self->xref->dbc->prepare($object_xref_list); + + my $update_first_xref = "update xref set accession = ?, label = ? where xref_id = ?"; # just rename xref; + my $update_first_sth = $self->xref->dbc->prepare($update_first_xref); + + my $change_object_xref = "update object_xref set xref_id = ? where xref_id = ? and ensembl_id = ?"; + my $change_ox_sth = $self->xref->dbc->prepare($change_object_xref); + + $sth = $self->xref->dbc->prepare($xref_list_sql); + $sth->execute; + my ($xref, $count); + $sth->bind_columns(\$xref, \$name, \$count); + while($sth->fetch){ + $ox_list_sth->execute($xref); + my ($object_xref_id, $ensembl_id); + $count = 1; + $ox_list_sth->bind_columns(\$object_xref_id, \$ensembl_id); + while($ox_list_sth->fetch){ + my $new_name = $name."-".$count; + if($count == 1){ + $update_first_sth->execute($new_name, $new_name, $xref); + $xref_added{$new_name.":".$clone_based_ensembl_gene_id} = $xref; + } + else{ + #add new one + if(!defined($xref_added{$new_name.":".$clone_based_ensembl_gene_id})){ + $max_xref_id++; + $ins_xref_sth->execute($max_xref_id, $clone_based_ensembl_gene_id, $new_name, $new_name); + $xref_added{$new_name.":".$clone_based_ensembl_gene_id} = $max_xref_id; + } + else{ + print "$new_name already exists????\n"; + } + $change_ox_sth->execute($max_xref_id, $xref, $ensembl_id); + } + $count++; + + } + } + $change_ox_sth->finish; + $update_first_sth->finish; + $ox_list_sth->finish; + + my $sth_stat = $self->xref->dbc->prepare("insert into process_status (status, date) values('official_naming_done',now())"); + $sth_stat->execute(); + $sth_stat->finish; +} + +sub biomart_fix{ + my ($self, $db_name, $type1, $type2) = @_; + my $xref_dbc = $self->xref->dbc; + + print "$db_name is associated with both $type1 and $type2 object types\n"; - $sth->execute(); - my ($xref_label, $count); - $sth->bind_columns(\$xref_label,\$count); - while($sth->fetch()){ - push @xref_name_list, $xref_label; + my $to; + my $from; + my $to_id; + my $from_id; + if($type1 eq "Gene" or $type2 eq "Gene"){ + $to = "Gene"; + $to_id = "gene_id"; + if($type1 eq "Translation" or $type2 eq "Translation"){ + $from = "Translation"; + $from_id = "translation_id" + } + else{ + $from = "Transcript"; + $from_id = "transcript_id"; + } + } + else{ + $to = "Transcript"; + $to_id = "transcript_id"; + $from = "Translation"; + $from_id = "translation_id"; } - $sth->finish; + print "Therefore moving all associations from $from to ".$to."\n"; - $sth = $self->core->dbc->prepare($xref_list_sql); + + my $sql =(<<EOF); + UPDATE IGNORE object_xref, gene_transcript_translation, xref, source + SET object_xref.ensembl_object_type = "$to", + object_xref.ensembl_id = gene_transcript_translation.$to_id + WHERE object_xref.ensembl_object_type = "$from" AND + object_xref.ensembl_id = gene_transcript_translation.$from_id AND + xref.xref_id = object_xref.xref_id AND + xref.source_id = source.source_id AND + object_xref.ox_status = "DUMP_OUT" AND + source.name = "$db_name"; +EOF + my $result = $xref_dbc->do($sql) ; + $result->finish; +# print "\n$sql\n"; + + $sql =(<<EOF2); + DELETE object_xref + FROM object_xref, xref, source + WHERE object_xref.ensembl_object_type = "$from" AND + xref.xref_id = object_xref.xref_id AND + xref.source_id = source.source_id AND + object_xref.ox_status = "DUMP_OUT" AND + source.name = "$db_name"; +EOF2 + + $result = $xref_dbc->do($sql); + $result->finish; +# print "\n$sql\n"; +} + + + +sub biomart_testing{ + my ($self) = @_; + + my $sql = 'SELECT ox.ensembl_object_type, COUNT(*), s.name FROM xref x, object_xref ox, source s WHERE x.xref_id = ox.xref_id AND s.source_id = x.source_id and ox.ox_status = "DUMP_OUT" GROUP BY s.name, ox.ensembl_object_type'; + + + my $again = 1; + while ($again){ + $again = 0; + + my $sth = $self->xref->dbc->prepare($sql); + $sth->execute(); + my ($type, $count, $name); + my ($last_type, $last_count, $last_name); + $sth->bind_columns(\$type,\$count,\$name); + $last_name = "DEFAULT"; + while (!$again and $sth->fetch){ + if($last_name eq $name){ + $again = 1; + $self->biomart_fix($name,$last_type, $type); +# $again = 0; # remove this line after testing + } + $last_name = $name; + $last_type= $type; + $last_count = $count; + } + $sth->finish; + } +} + + + +sub biomart_test{ + my ($self) = @_; + + my $sql = 'SELECT ox.ensembl_object_type, COUNT(*), s.name FROM xref x, object_xref ox, source s WHERE x.xref_id = ox.xref_id AND s.source_id = x.source_id and ox.ox_status = "DUMP_OUT" GROUP BY s.name, ox.ensembl_object_type'; + + + my $sth = $self->xref->dbc->prepare($sql); - foreach my $name (@xref_name_list){ - my $count = 201; - $sth->execute($name); - $sth->bind_columns(\$xref_id,\$acc); - while($sth->fetch()){ - my $new = $acc."-".$count; - print CLONE 'update xref set dbprimary_acc = "'.$new.'" where xref_id = '.$xref_id.";\n"; - print CLONE 'update xref set display_label = "'.$new.'" where xref_id = '.$xref_id.";\n"; - $count++; + $sth->execute(); + my ($type, $count, $name); + my ($last_type, $last_count, $last_name); + $sth->bind_columns(\$type,\$count,\$name); + $last_name = "ARSE"; + my $first = 1; + while ($sth->fetch){ + if($last_name eq $name){ + if($first){ + print "\nProblem Biomart test fails\n"; + $first=0; + } + print "$last_name\t$last_count\t$last_type\n"; + print "$name\t$count\t$type\n"; } + $last_name = $name; + $last_type= $type; + $last_count = $count; } $sth->finish; - close CLONE; - my $mysql_command = $self->get_mysql_command($self->core->dbc); +} + +# remove a list of patterns from a string +sub filter_by_regexp { + + my ($self, $str, $regexps) = @_; - # Load the new clone xrefs from the appropriate file - system( "$mysql_command < $clone_file" ) == 0 - or print( "ERROR: parsing $clone_file in mysql\n" ); + foreach my $regexp (@$regexps) { + $str =~ s/$regexp//ig; + } + return $str; } +sub gene_description_filter_regexps { + + return (); -sub species_specific_pre_attributes_set{ } -1; +sub gene_description_sources { + + return ("RFAM", + "miRBase", + "IMGT/GENE_DB", + "Uniprot/SWISSPROT", + "RefSeq_peptide", + "RefSeq_dna", + "Uniprot/Varsplic", + "Uniprot/SPTREMBL"); + +} + + +sub transcript_display_xref_sources { + my @list = qw(miRBase + RFAM + HGNC_curated_gene + HGNC_automatic_gene + MGI_curated_gene + MGI_automatic_gene + Clone_based_vega_gene + Clone_based_ensembl_gene + HGNC_curated_transcript + HGNC_automatic_transcript + MGI_curated_transcript + MGI_automatic_transcript + Clone_based_vega_transcript + Clone_based_ensembl_transcript + IMGT/GENE_DB + HGNC + SGD + MGI + flybase_symbol + Anopheles_symbol + Genoscope_annotated_gene + Uniprot/SWISSPROT + Uniprot/Varsplic + RefSeq_peptide + RefSeq_dna + Uniprot/SPTREMBL + EntrezGene + IPI); + + my %ignore; + $ignore{"EntrezGene"}= 'FROM:RefSeq_[pd][en][pa].*_predicted'; + + return [\@list,\%ignore]; + +} + +sub get_official_synonyms{ + my $self = shift; + my %hgnc_syns; + my %seen; # can be in more than one for each type of hgnc. + my $dbname = $self->get_official_name(); + + my $sql = (<<SYN); +SELECT x.accession, x.label, sy.synonym + FROM xref x, source so, synonym sy + WHERE x.xref_id = sy.xref_id + AND so.source_id = x.source_id + AND so.name like "$dbname" +SYN + + my $sth = $self->xref->dbc->prepare($sql); + + $sth->execute; + my ($acc, $label, $syn); + $sth->bind_columns(\$acc, \$label, \$syn); + + my $count = 0; + while($sth->fetch){ + if(!defined($seen{$acc.":".$syn})){ + push @{$hgnc_syns{$acc}}, $syn; + push @{$hgnc_syns{$label}}, $syn; + $count++; + } + $seen{$acc.":".$syn} = 1; + } + $sth->finish; + + return \%hgnc_syns; + +} + + +1;