Skip to content
Snippets Groups Projects
Commit 8e0d3423 authored by Glenn Proctor's avatar Glenn Proctor
Browse files

Script for dumping Ensembl information in XML format for EBI search page

parent 939535ce
No related branches found
No related tags found
No related merge requests found
# Dump gene and xref information to an XML file for indexing by the EBI's
# search engine.
use strict;
use DBI;
use Getopt::Long;
use IO::Zlib;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use CGI qw(escapeHTML);
# Command-line options; usage() describes what each one means.
my ( $host, $user, $pass, $port, $dbpattern, $max_genes, $gzip );

# GetOptions returns false when it meets an unknown or malformed option (it
# has already warned on STDERR by then), so show the usage text and stop
# rather than start dumping with a half-parsed command line.
GetOptions(
    "host=s"              => \$host,
    "user=s"              => \$user,
    "pass=s"              => \$pass,
    "port=i"              => \$port,
    "dbpattern|pattern=s" => \$dbpattern,
    "gzip!"               => \$gzip,
    "max_genes=i"         => \$max_genes,
    "help"                => \&usage,
) or usage();

# A host and a database name pattern are both required.
if ( !$host || !$dbpattern ) {
    usage();
}

# Number of <entry> elements written; incremented by content() and reported
# by footer().
my $entry_count = 0;

# IO::Zlib handle used for output when -gzip is given; uncompressed output
# goes through the FILE handle instead.
my $fh;

run();
# Connect to the MySQL server, list its databases, and dump every database
# whose name matches $dbpattern to its own XML file: <dbname>.xml, or
# <dbname>.xml.gz when -gzip is on. Takes no arguments; dies if the server
# cannot be reached, the database list cannot be read, or an output file
# cannot be opened.
sub run {

    my $dsn = "DBI:mysql:host=$host";
    $dsn .= ";port=$port" if ($port);

    # DBI->connect returns undef on failure; stop here with the driver's
    # message instead of failing later on a method call on undef.
    my $db = DBI->connect( $dsn, $user, $pass )
      or die "Can't connect to $dsn: $DBI::errstr";

    my $rows = $db->selectall_arrayref("show databases")
      or die "Can't list databases on $host: " . $db->errstr;
    my @dbnames = map { $_->[0] } @{$rows};

    # This connection is only needed for the database list; each dump below
    # goes through its own DBAdaptor.
    $db->disconnect();

    # loop over databases
    for my $dbname (@dbnames) {

        # $dbpattern is a user-supplied regexp, so it is deliberately not quoted
        next if ( $dbname !~ /$dbpattern/ );

        # Every database gets its own file and its own <entry_count>, and the
        # -max_genes limit applies per file, so the counter restarts here.
        $entry_count = 0;

        my $file = $dbname . ".xml";
        $file .= ".gz" if ($gzip);

        if ($gzip) {
            $fh = IO::Zlib->new();
            $fh->open( $file, "wb9" ) || die("Can't open compressed stream to $file");
        }
        else {
            # three-argument open, so the file name can never be read as a mode
            open( FILE, '>', $file ) || die "Can't open $file: $!";
        }

        print "Dumping $dbname to $file\n";

        my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
            '-host'   => $host,
            '-port'   => $port,
            '-user'   => $user,
            '-pass'   => $pass,
            '-dbname' => $dbname
        );

        header( $dba, $dbname );
        content($dba);
        footer();
    }
}
# -------------------------------------------------------------------------------
# Write the start of the XML document for one database: the XML declaration,
# the opening <database> tag with <name>, <description>, <release> and
# <release_date>, then the opening <entries> tag.
#
# Arguments, in order:
#   the database adaptor (only get_MetaContainer() is called on it)
#   the database name, written into <name>
sub header {
    my ( $adaptor, $name ) = @_;

    p(qq{<?xml version="1.0" encoding="ISO-8859-1"?>});
    p(qq{<database>});
    p(qq{<name>$name</name>});

    # Every value below is the first one stored under its meta key.
    my $meta        = $adaptor->get_MetaContainer();
    my $first_value = sub { return $meta->list_value_by_key( $_[0] )->[0] };

    my $common_name = $first_value->('species.common_name');
    p(qq{<description>Ensembl $common_name database</description>});

    my $schema_version = $first_value->('schema_version');
    my $timestamp      = $first_value->('xref.timestamp');    # near enough for now
    p(qq{<release>$schema_version</release>});
    p(qq{<release_date>$timestamp</release_date>});

    p(qq{});
    p(qq{<entries>});
}
# -------------------------------------------------------------------------------
# Escape a value with CGI's escapeHTML so it can be placed in XML text or in
# a double-quoted attribute. An undefined value becomes the empty string.
sub _xml_escape {
    my ($value) = @_;
    return '' unless defined $value;
    return escapeHTML($value);
}

# Write one <entry> element per gene fetched through the adaptor's
# GeneAdaptor: name, description, dates, cross-references, and the stable ids
# of its transcripts and their translations. Stops once $entry_count reaches
# $max_genes (when that option is set) and increments $entry_count for every
# entry written.
#
# Every value taken from the database is escaped before it is written, so an
# '&', '<' or '"' in an id, name or xref cannot break the XML.
#
#   $dba - database adaptor; only get_GeneAdaptor() is called on it
sub content {
    my ($dba) = @_;

    my $gene_adaptor = $dba->get_GeneAdaptor();

    foreach my $gene ( @{ $gene_adaptor->fetch_all() } ) {

        last if ( $max_genes && $entry_count >= $max_genes );

        # general gene info
        p("");
        p( '<entry id="' . _xml_escape( $gene->stable_id() ) . '">' );
        p( '<name>' . _xml_escape( $gene->display_id() ) . '</name>' );
        p( '<description>' . _xml_escape( $gene->description() ) . '</description>' );

        p("<dates>");

        # TODO - date formatting
        my $created = $gene->created_date();
        if ($created) {    # don't always have creation date
            p( '<date type="creation" value="' . _xml_escape($created) . '"/>' );
        }
        p( '<date type="last_modification" value="' . _xml_escape( $gene->modified_date() ) . '"/>' );
        p("</dates>");

        # xrefs
        p("<cross-references>");
        foreach my $xref ( @{ $gene->get_all_DBLinks() } ) {
            p(    '<ref dbname="'
                . _xml_escape( $xref->dbname() )
                . '" dbkey="'
                . _xml_escape( $xref->display_id() )
                . '"/>' );
        }
        p("</cross-references>");

        # additional fields - transcript, translation etc
        p("<additional_fields>");
        foreach my $transcript ( @{ $gene->get_all_Transcripts() } ) {
            p( '<field name="transcript">' . _xml_escape( $transcript->stable_id() ) . '</field>' );

            # a transcript may have no translation
            my $translation = $transcript->translation();
            if ($translation) {
                p( '<field name="translation">' . _xml_escape( $translation->stable_id() ) . '</field>' );
            }
        }
        p("</additional_fields>");

        # close tag
        p("</entry>");

        $entry_count++;
    }
}
# -------------------------------------------------------------------------------
# Finish the XML document: close <entries>, write the number of entries
# dumped, close <database>, report the count on standard output and close
# the output handle that is in use.
sub footer {
    for my $line ( "</entries>", "<entry_count>$entry_count</entry_count>", "</database>" ) {
        p($line);
    }

    print "Dumped $entry_count entries\n";

    # -gzip output goes through the IO::Zlib object, plain output through FILE
    return $gzip ? $fh->close() : close(FILE);
}
# -------------------------------------------------------------------------------
# Write one line of output with a trailing newline added: to the compressed
# stream in $fh when -gzip is in effect, to FILE otherwise.
sub p {
    my ($line) = @_;

    # TODO - encoding
    my $out = $gzip ? $fh : \*FILE;
    return print {$out} $line . "\n";
}
# -------------------------------------------------------------------------------
# -------------------------------------------------------------------------------
# Print the command-line help on standard output and exit with status 0.
# Also serves as the handler for the -help option.
sub usage {
    my $help_text = <<"EOF";
Usage: perl $0 <options>
-host Database host to connect to.
-port Database port to connect to.
-dbpattern Database name regexp
-user Database username.
-pass Password for user.
-gzip Compress output as it's written.
-max_genes Only dump this many genes for testing.
-help This message.
EOF

    print $help_text;
    exit(0);
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment