Commit 199ec320 authored by Glenn Proctor's avatar Glenn Proctor
Browse files

Merged branch-mysql5 back into HEAD

parent 2b39b9b7
Generating the Schema Descriptions
The tables.txt and tables.html files are generated automatically from
tables.xml. This requires an XML processor called Saxon, which can
be downloaded from http://saxon.sourceforge.net/.
When Saxon has been downloaded and installed, edit process-saxon.sh
and make sure that the -jar argument in each of the two Java calls
points to the location of your saxon7.jar file.
Then run the process-saxon.sh shell script and the .txt and .html
files will be created.
Editing the Schema Description
If you need to edit the schema description, edit tables.xml (according
to the rules in schema_description.dtd) and re-run the processing
stage as above to regenerate the .html and .txt files.
The XML stylesheets for creating the HTML and text (Wiki format) from
the XML are in xml2html.xsl and xml2wiki.xsl - you shouldn't need to
edit these unless you are adding a new feature to the processing step.
......@@ -50,7 +50,7 @@ sub parse {
my %feature;
my ($group, $seq, $method, $feature, $chr, $start, $end, $str, $phase, $score, $type, $id_ignore, $id) = split;
my ($group, $seq, $method, $feature, $chr, $start, $end, $str, $phase, $score, $pvalue, $type, $id_ignore, $id) = split;
my $strand = ($str =~ /\+/ ? 1 : -1);
$id =~ s/[\"\']//g; # strip quotes
......
......@@ -68,6 +68,8 @@ Usage: perl $0 <options>
-port Database port to connect to
-dbname Database name to use
-user Database username
-pass Password for user
......
......@@ -90,6 +90,7 @@ for my $dbname ( @dbnames ) {
} elsif( $expression =~ /^\s*select/i ||
$expression =~ /^\s*show/i ||
$expression =~ /^\s*desc/i ) {
print "### HERE";
my $res = $db->selectall_arrayref( $expression );
my @results = map { join( " ", @$_ ) } @$res ;
my $db_name_off = 0 ;
......
......@@ -363,7 +363,6 @@ sub dump_subset{
$final_clause = " AND ((" . join(") OR (", @or_list) . "))" unless ($use_all) ;
for my $sequence_type ('dna', 'peptide') {
my $filename = $xref->dir() . "/xref_" . $index . "_" . $sequence_type . ".fasta";
......
......@@ -166,6 +166,7 @@ EOF
if (($reader = open(BSUB_READER, '-|'))) {
while (<BSUB_READER>) {
if (/^Job <(\d+)> is submitted/) {
$jobid = $1;
print "LSF job ID for main mapping job: $jobid (job array with $num_jobs jobs)\n"
......
......@@ -12,9 +12,10 @@ sub get_set_lists {
return [["ExonerateGappedBest1", ["mus_musculus","*"]]];
}
# Name of the consortium xref source used by this species' mapping
# configuration. Always returns the string "MarkerSymbol".
sub consortium {
  return "MarkerSymbol";
}
......
......@@ -381,10 +381,33 @@ sub get_source_id_for_source_name {
return $source_id;
}
# --------------------------------------------------------------------------------
# Get a set of source IDs matching a source name pattern.
#
# Arg [1]    : invocant (not used)
# Arg [2]    : string $source_name - text to look for anywhere inside
#              source.name; the comparison is done on upper-cased values,
#              so it is case-insensitive
# Returntype : list of source_id values, empty if nothing matches
sub get_source_ids_for_source_name_pattern {
  my ($self, $source_name) = @_;

  # The pattern is passed as a bind value rather than being interpolated
  # into the statement, so a quote character in $source_name can neither
  # break nor alter the SQL.
  my $sql = "SELECT source_id FROM source WHERE upper(name) LIKE ?";
  my $sth = dbi()->prepare($sql);
  $sth->execute('%' . uc($source_name) . '%');

  my @sources;
  while (my @row = $sth->fetchrow_array()) {
    push @sources, $row[0];
  }
  $sth->finish;

  return @sources;
}
sub get_source_name_for_source_id {
my ($self, $source_id) = @_;
my $source_name;
my $sql = "SELECT name FROM source WHERE source_id= '" . $source_id. "'";
my $sth = dbi()->prepare($sql);
$sth->execute();
......@@ -537,6 +560,39 @@ sub get_valid_codes{
}
# --------------------------------------------------------------------------------
# --------------------------------------------------------------------------------
# Fetch the dependent-xref relationships that already exist between two
# sources for one species.
#
# Arg [1]    : invocant (class name or object)
# Arg [2]    : string $from_source_name - source of the dependent xrefs
# Arg [3]    : string $to_source_name   - source of the master xrefs
# Arg [4]    : int    $species_id       - applied to both sides of the join
# Returntype : hashref; key is dependent_xref_id, value is the accession of
#              that dependent xref
sub get_existing_mappings {
  my ($self, $from_source_name, $to_source_name, $species_id) = @_;

  my %mappings;

  # Invoked as methods, in line with the other subs in this package, which
  # all unpack ($self, ...) from @_.
  my $from_source = $self->get_source_id_for_source_name($from_source_name);
  my $to_source   = $self->get_source_id_for_source_name($to_source_name);

  print "from source: $from_source_name id $from_source\t\tto source: $to_source_name id $to_source\n";

  my $sql = "SELECT dx.dependent_xref_id, x1.accession as dependent, dx.master_xref_id, x2.accession as master FROM dependent_xref dx, xref x1, xref x2 WHERE x1.xref_id=dx.dependent_xref_id AND x2.xref_id=dx.master_xref_id AND x2.source_id=? AND x1.source_id=? AND x1.species_id=? AND x2.species_id=?";

  my $sth = dbi()->prepare($sql);

  # The placeholders compare against source_id columns, so the numeric IDs
  # resolved above must be bound here, not the source names.
  $sth->execute($to_source, $from_source, $species_id, $species_id);

  while (my @row = $sth->fetchrow_array()) {
    # NOTE(review): keyed by dependent_xref_id with the dependent accession
    # as value. The GOParser caller looks entries up by accession and passes
    # the value on as a master xref id - confirm whether this should be
    # $mappings{$row[1]} = $row[2] instead.
    $mappings{$row[0]} = $row[1];
  }
  $sth->finish;

  print "Got " . scalar(keys(%mappings)) . " $from_source_name -> $to_source_name mappings\n";

  return \%mappings;
}
# --------------------------------------------------------------------------------
# Upload xrefs to the database
sub upload_xref_object_graphs {
......
package XrefParser::GOParser;
use strict;
......@@ -43,6 +45,12 @@ sub run {
my (%swiss) = %{XrefParser::BaseParser->get_valid_codes("uniprot",$species_id)};
my (%refseq) = %{XrefParser::BaseParser->get_valid_codes("refseq",$species_id)};
# complication with GO xrefs from JAX - linked to MGI symbols, which are themselves
# dependent, so we need to get the MGI->Uniprot mapping and store the *Uniprot*
# as the master xref
my (%mgi_to_uniprot) = %{XrefParser::BaseParser->get_existing_mappings("MarkerSymbol", "Uniprot/Swissprot", $species_id)};
my %worm;
my %worm_label;
my $wormset;
......@@ -127,6 +135,14 @@ sub run {
$count++;
}
}
elsif($array[0] =~ /MGI/){
if($mgi_to_uniprot{$array[1]}){
XrefParser::BaseParser->add_to_xrefs($mgi_to_uniprot{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id);
$count++;
}
}
elsif(!defined($wrongtype{$array[0]})){
print STDERR "WARNING: unknown type ".$array[0]."\n";
$wrongtype{$array[0]} = 1;
......
......@@ -55,14 +55,15 @@ sub run {
#ZDB-GENE-000112-34 couptf4 O42534
my $count =0;
my $spcount =0;
my $rscount =0;
my $mismatch=0;
while (<SWISSPROT>) {
chomp;
my ($zfin, $label, $acc) = split (/\s+/,$_);
if(defined($swiss{$acc})){
XrefParser::BaseParser->add_to_xrefs($swiss{$acc},$zfin,'',$label,'','',$source_id,$species_id);
$count++;
$spcount++;
}
else{
$mismatch++;
......@@ -82,14 +83,14 @@ sub run {
my ($zfin, $label, $acc) = split (/\s+/,$_);
if(defined($refseq{$acc})){
XrefParser::BaseParser->add_to_xrefs($refseq{$acc},$zfin,'',$label,'','',$source_id,$species_id);
$count++;
$rscount++;
}
else{
$mismatch++;
}
}
close REFSEQ;
print "\t$count xrefs succesfully loaded\n";
print "\t$spcount xrefs from Swissprot and $rscount xrefs from RefSeq succesfully loaded\n";
print "\t$mismatch xrefs ignored\n";
return 0;
}
......
......@@ -285,7 +285,7 @@ INSERT INTO source_url (source_id, species_id, url, checksum, file_modified_date
## UniGene
INSERT INTO source_url (source_id, species_id, url, checksum, file_modified_date, upload_date, parser) VALUES (6, 9606,'ftp://ftp.ncbi.nih.gov/repository/UniGene/Homo_sapiens/Hs.seq.uniq.gz ftp://ftp.ncbi.nih.gov/repository/UniGene/Homo_sapiens/Hs.data.gz', '', now(), now(), "UniGeneParser");
## ncRNA's presently inhouse.
## ncRNAs presently inhouse.
INSERT INTO source_url (source_id, species_id, url, checksum, file_modified_date, upload_date, parser) VALUES (4000, 9606,'LOCAL:ncRNA/ncRNA.txt', '', now(), now(), "ncRNAParser");
# --------------------------------------------------------------------------------
......@@ -308,7 +308,7 @@ INSERT INTO source_url (source_id, species_id, url, checksum, file_modified_date
INSERT INTO source_url (source_id, species_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1080, 10090,'ftp://ftp.informatics.jax.org/pub/reports/MRK_SwissProt_TrEMBL.rpt ftp://ftp.informatics.jax.org/pub/reports/MRK_Synonym.sql.rpt', '', now(), now(), "MGDParser");
## GO
INSERT INTO source_url (source_id, species_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1070, 10090,'ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/MOUSE/gene_association.goa_mouse.gz', '', now(), now(), "GOParser");
INSERT INTO source_url (source_id, species_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1070, 10090,'http://www.geneontology.org/cgi-bin/downloadGOGA.pl/gene_association.mgi.gz', '', now(), now(), "GOParser");
## IPI
INSERT INTO source_url (source_id, species_id, url, checksum, file_modified_date, upload_date, parser) VALUES (5, 10090,'ftp://ftp.ebi.ac.uk/pub/databases/IPI/current/ipi.MOUSE.fasta.gz', '', now(), now(), "IPIParser");
......
xref
host=ecs4
port=3350
dbname=glenn_test_xref
dbname=gp1_mouse_xrefs
user=ensadmin
password=ensembl
dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/xref
species=homo_sapiens
host=ecs2
port=3364
dbname=homo_sapiens_core_26_35
user=ensro
password=
dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/test
species=mus_musculus
host=ecs4
port=3350
dbname=gp1_mus_musculus_core_37_34e
user=ensadmin
password=ensembl
dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/mouse
#species=gallus_gallus
#host=ecs4
#port=3350
#dbname=glenn_gallus_gallus_core_31_1g
#user=ensadmin
#password=ensembl
#dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/chicken
#species=mus_musculus
#host=ecs4
#port=3350
#dbname=ianl_mus_musculus_core_31_33g
#user=ensadmin
#password=ensembl
#dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/mouse
#species=drosophila_melanogaster
#host=ia64g
#port=3306
#dbname=glenn_drosophila_melanogaster_core_NEW
#user=ensadmin
#password=ensembl
#dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/drosophila
......@@ -58,8 +58,8 @@ foreach my $to_species (@to_multi) {
my $to_ga = Bio::EnsEMBL::Registry->get_adaptor($to_species, 'core', 'Gene');
my $to_dbea = Bio::EnsEMBL::Registry->get_adaptor($to_species, 'core', 'DBEntry');
delete_names() if ($delete_names);
delete_go_terms() if ($delete_go_terms);
delete_names($to_ga) if ($delete_names);
delete_go_terms($to_ga) if ($delete_go_terms);
my $mlss = $mlssa->fetch_by_method_link_type_registry_aliases($method_link_type, [$from_species, $to_species]);
......@@ -130,14 +130,13 @@ sub project_display_names {
my ($to_ga, $to_dbea, $ma, $from_member, $to_member, %db_to_type) = @_;
my $to_gene = $to_ga->fetch_by_stable_id($to_member->stable_id());
my $from_gene = $from_ga->fetch_by_stable_id($from_member->stable_id());
my $dbEntry = $from_gene->display_xref();
my $to_source = $to_gene->display_xref()->dbname() if ($to_gene->display_xref());
my $from_source = $from_gene->display_xref()->dbname() if ($from_gene->display_xref());
# if no display name set, do the projection
if (!$to_gene->external_name()) {
my $from_gene = $from_ga->fetch_by_stable_id($from_member->stable_id());
my $dbEntry = $from_gene->display_xref();
# TODO only do this for certain types of DBEntry?
if (check_overwrite_display_xref($to_gene, $from_source, $to_source)) {
if ($dbEntry) {
......@@ -275,6 +274,9 @@ sub get_stats {
$count = count_rows($to_ga, "SELECT COUNT(*) FROM gene g, xref x WHERE g.display_xref_id=x.xref_id AND x.display_label LIKE '%[from%'");
$str .= sprintf(" projected %d (%3.1f\%)" , $count, (100 * $count / $total_genes));
$count = count_rows($to_ga, "SELECT COUNT(*) FROM gene g, xref x, external_db e WHERE g.display_xref_id=x.xref_id AND x.external_db_id=e.external_db_id AND e.db_name IN ('RefSeq_dna_predicted', 'RefSeq_peptide_predicted')");
$str .= sprintf(" predicted %d (%3.1f\%)" , $count, (100 * $count / $total_genes));
$count = count_rows($to_ga, "SELECT COUNT(*) FROM gene g WHERE display_xref_id IS NOT NULL");
$str .= sprintf(" total genes with names %d (%3.1f\%)" , $count, (100 * $count / $total_genes));
......@@ -359,6 +361,37 @@ sub delete_go_terms {
# ----------------------------------------------------------------------
# Work out whether the target gene's display name may be replaced by the
# projected one. Returns 1 (overwrite) when either
#   - the target gene has no external name at all, or
#   - the target's current display xref comes from RefSeq_dna_predicted or
#     RefSeq_peptide_predicted AND the source gene's display xref comes from
#     the preferred naming source for the source species (HUGO when
#     $from_species is "human", MarkerSymbol when it is "mouse").
# Returns 0 in every other case.
#
# $from_species is read from the enclosing script, not passed in.
sub check_overwrite_display_xref {
  my ($to_gene, $from_dbname, $to_dbname) = @_;

  # An unnamed target gene can always take the projected name.
  unless ($to_gene->external_name()) {
    return 1;
  }

  # Only predicted RefSeq names are candidates for replacement.
  my $target_is_predicted = ($to_dbname eq "RefSeq_dna_predicted"
                             || $to_dbname eq "RefSeq_peptide_predicted");
  return 0 unless $target_is_predicted;

  # The replacement must come from the species' preferred naming source.
  my $source_is_preferred = (($from_species eq "human" && $from_dbname eq "HUGO")
                             || ($from_species eq "mouse" && $from_dbname eq "MarkerSymbol"));

  return $source_is_preferred ? 1 : 0;
}
# ----------------------------------------------------------------------
sub usage {
print << "EOF";
......
......@@ -112,7 +112,7 @@ sub get_all_AffyProbes {
if( $self->adaptor() && $self->dbID() ) {
my $probeAdaptor = $self->adaptor()->db()->get_AffyProbeAdaptor();
my $probes = $probeAdaptor->fetch_all_by_AffyArray( $self );
my $probes = $probeAdaptor->fetch_by_AffyArray( $self );
return $probes;
} else {
warning( "Need database connection to retrieve Probes" );
......
......@@ -149,6 +149,7 @@ sub primary_id {
if( defined $arg ) {
$self->{primary_id} = $arg;
}
return $self->{primary_id};
}
......
......@@ -276,6 +276,7 @@ sub store {
$sth->bind_param(4,$exObj->description,SQL_VARCHAR);
$sth->bind_param(5,$dbRef,SQL_INTEGER);
$sth->execute();
$dbX = $sth->{'mysql_insertid'};
$sth->finish();
#
......@@ -859,13 +860,13 @@ sub _type_by_external_id{
}
my @queries = (
"select $ID_sql
from $from_sql xref, object_xref as oxr
from $from_sql xref, object_xref as oxr
where $where_sql xref.dbprimary_acc = ? and
xref.xref_id = oxr.xref_id and oxr.ensembl_object_type= ?",
xref.xref_id = oxr.xref_id and oxr.ensembl_object_type= ?",
"select $ID_sql
from $from_sql xref, object_xref as oxr
from $from_sql xref, object_xref as oxr
where $where_sql xref.display_label = ? and
xref.xref_id = oxr.xref_id and oxr.ensembl_object_type= ?",
xref.xref_id = oxr.xref_id and oxr.ensembl_object_type= ?",
"select $ID_sql
from $from_sql object_xref as oxr, external_synonym as syn
where $where_sql syn.synonym = ? and
......@@ -880,6 +881,7 @@ sub _type_by_external_id{
my @result = ();
foreach( @queries ) {
my $sth = $self->prepare( $_ );
$sth->bind_param(1,"$name",SQL_VARCHAR);
$sth->bind_param(2,$ensType,SQL_VARCHAR);
......
......@@ -373,7 +373,7 @@ sub store {
my( $self, $feature, $ensObjs, $influence, $evidence ) = @_;
if( ref( $ensObjs ) ne 'ARRAY' ){
warning( "Use of sralar args is deprecated - please use a listref" );
warning( "Use of scalar args is deprecated - please use a listref" );
$ensObjs = [[$ensObjs, $influence, $evidence]];
}
......
......@@ -125,7 +125,7 @@ sub new {
Args : none
Example : none
Description: returns true if this gene has a display_xref
Description: returns true if this gene has a status of KNOWN
Returntype : 0,1
Exceptions : none
Caller : general
......
......@@ -69,9 +69,9 @@ use Bio::EnsEMBL::ProjectionSegment;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::DBSQL::MergedAdaptor;
use Bio::EnsEMBL::StrainSlice;
use Bio::EnsEMBL::IndividualSlice;
use Bio::EnsEMBL::IndividualSliceFactory;
#use Bio::EnsEMBL::StrainSlice;
#use Bio::EnsEMBL::IndividualSlice;
#use Bio::EnsEMBL::IndividualSliceFactory;
use Bio::EnsEMBL::Mapper::RangeRegistry;
use Data::Dumper;
......
......@@ -193,6 +193,7 @@ $count = $db->dbc->db_handle->selectall_arrayref
ok($count == 1);
@attribs = @{$aa->fetch_all_by_Slice($slice)};
print "attribs: " . scalar(@attribs) . "\n";
ok(@attribs == 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment