Skip to content
Snippets Groups Projects
Commit b8bcec30 authored by Magali Ruffier's avatar Magali Ruffier
Browse files

link_out directory moved to ensembl-production

removing ensembl-core copy
parent 298eee92
No related branches found
No related tags found
No related merge requests found
For each release, regenerate the resource files by running the script generate_LinkOut.pl:
perl generate_LinkOut.pl -dbpattern 'core_67' -config_file linkOut_config.txt
Upload new resource files to NCBI:
It's best to do it on the first day of the release. NCBI links are regenerated every day based on the resource files in each provider’s directory.
Host: ftp-private.ncbi.nlm.nih.gov
Username: ensem
Password: xxxx - check confluence page for full instructions and password: http://www.ebi.ac.uk/seqdb/confluence/display/ENS/NCBI+LinkOut+files
Instructions:
http://www.ncbi.nlm.nih.gov/books/NBK3807/#files.Transferring_Files_via_FTP
From a standard ftp client:
1.
At a command prompt type: ftp ftp-private.ncbi.nlm.nih.gov and press enter.
2.
Type your login name at the login prompt and press enter. (See how to obtain ftp account from NCBI.)
3.
Type your password at the password prompt and press enter.
4.
You should now be logged into the ftp server. If you receive an error message, check your login information, type 'bye' followed by enter, and retry steps 1-3.
5.
Type 'bin' and press enter. This changes your ftp server to BINARY mode.
6.
Type 'cd holdings' and press enter. This changes your current directory.
7.
Type the 'put' command, followed by your pathname or drive and filename and press enter. (For example, 'put C:\filename' or 'put /home/testfiles/journalv6n3'.)
8.
Type 'dir' and press enter to display the files in the current directory.
9.
Type 'bye' to disconnect from the server and close the ftp session.
# Generate LinkOut resource file for NCBI website.
# Author: Monika Komorowska
# Date : 06.04.2011
use strict;
use DBI;
use Getopt::Long;
# Forward declaration: new_file() is defined at the bottom of the script.
sub new_file;

# Command-line options:
#   $dbpattern   - regex that database names must match (required)
#   $out_file    - prefix of the generated resource files (optional)
#   $config_file - file listing the database hosts to query (optional)
my ( $dbpattern, $out_file, $config_file );
GetOptions(
    "dbpattern|pattern=s" => \$dbpattern,
    "out_file=s"          => \$out_file,
    "config_file=s"       => \$config_file,
) or usage();    # unknown or malformed options: show usage instead of carrying on

# -dbpattern is mandatory.
if ( !$dbpattern ) {
    usage();
}
if ( !$config_file ) {
    $config_file = "linkOut_config.txt";
}

# Three-argument open with a lexical handle: the file name can never be
# misread as an open mode, and the handle cannot clash with other globals.
open( my $config_fh, '<', $config_file )
    or die("Can't open $config_file: $!\n");

# One entry per host line ("host user [port]"); blank lines carry no host
# information and are dropped here.
my @hosts = grep { /\S/ } <$config_fh>;
close($config_fh);
#delete the old resource files
# glob() expands the wildcard. The file test -e "resources*" looked for a file
# literally named 'resources*', so the clean-up never ran; and exec() would
# have replaced this process with rm, so nothing would have been generated
# afterwards. Only plain files are removed; directories are left untouched.
my @old_resource_files = grep { -f $_ } glob("resources*");
foreach my $old_file (@old_resource_files) {
    unlink($old_file) or warn("Can't delete $old_file: $!\n");
}

# Default prefix for the generated files.
if ( !$out_file ) {
    $out_file = "resources";
}

# Size in bytes of the file currently being written, and the number of the
# current file (used by new_file() to build the file name).
my $file_size;
my $number_of_files = 1;

# XML prologue written at the top of every resource file.
my $header = <<HEADER;
<?xml version="1.0"?>
<!DOCTYPE LinkSet PUBLIC "-//NLM//DTD LinkOut 1.0//EN"
"http://www.ncbi.nlm.nih.gov/entrez/linkout/doc/LinkOut.dtd"
[<!ENTITY base.url "http://www.ensembl.org/id/">]>
<LinkSet>
HEADER

# Header length in bytes (not characters); every new file starts at this size.
my $header_size;
{
    use bytes;
    $header_size = length($header);
}
# Open the first resource file; further files are started on demand when the
# size limit is reached.
new_file();

# Running <LinkId> counter, shared by all hosts, databases and output files.
my $link_no = 0;

# Distinct (RefSeq accession, Ensembl stable id) pairs for transcripts.
my $nucleotide_sql = "SELECT dbprimary_acc, stable_id FROM object_xref o INNER JOIN xref x on o.xref_id = x.xref_id INNER JOIN external_db e on e.external_db_id =x.external_db_id INNER JOIN transcript on ensembl_id = transcript_id WHERE db_name in ('RefSeq_dna', 'RefSeq_dna_predicted', 'RefSeq_mRNA', 'RefSeq_mRNA_predicted', 'RefSeq_ncRNA', 'RefSeq_ncRNA_predicted') GROUP BY dbprimary_acc, stable_id";

# Distinct (RefSeq accession, Ensembl stable id) pairs for translations.
my $protein_sql = "SELECT dbprimary_acc, stable_id FROM object_xref o INNER JOIN xref x on o.xref_id = x.xref_id INNER JOIN external_db e on e.external_db_id =x.external_db_id INNER JOIN translation on ensembl_id = translation_id WHERE db_name in ('RefSeq_peptide', 'RefSeq_peptide_predicted') group by dbprimary_acc, stable_id";

foreach my $host_line (@hosts) {
    # Config lines look like "host user [port]". Lines that do not match are
    # skipped so that stale or undefined capture variables are never used.
    next unless $host_line =~ /([^\s]+)\s+([^\s]+)\s*(\d*)/;
    my ( $host, $user, $port ) = ( $1, $2, $3 );

    my $dsn = "DBI:mysql:host=$host";
    if ( $port =~ /\d+/ ) {
        $dsn .= ";port=$port";
    }

    my $db = DBI->connect( $dsn, $user );
    if ( !defined $db ) {
        # An unreachable host is reported and skipped; the remaining hosts
        # are still processed.
        my $message = "Can't connect to host: $host, port: ";
        if ( $port =~ /\d+/ ) {
            $message .= $port;
        }
        else {
            $message .= 'default';
        }
        $message .= ", user: $user\n";
        print STDOUT $message;
        next;
    }

    my @dbnames = map { $_->[0] } @{ $db->selectall_arrayref("show databases") };
    for my $dbname (@dbnames) {
        # Only databases whose name matches the -dbpattern regex are used.
        if ($dbpattern) {
            if ( $dbname !~ /$dbpattern/ ) {
                next;
            }
        }
        $db->do("use $dbname");

        #get nucleotide data
        _write_links( $db, $dbname, "Nucleotide", $nucleotide_sql );

        #get protein data
        _write_links( $db, $dbname, "Protein", $protein_sql );
    }
    $db->disconnect();
}

# Terminate and close the last resource file once, after ALL hosts have been
# processed. Doing this inside the host loop closed the handle after the first
# host, so links from any further host were written to a closed file.
print FH "</LinkSet>";
close(FH) or die("Can't close the last resource file: $!\n");

# Runs $sql on the currently selected database and writes one <Link> element
# per returned (RefSeq accession, Ensembl stable id) row to the open resource
# file, starting a new file whenever the size limit would be reached.
#   $db        - connected DBI handle with the target database selected
#   $dbname    - database name, used in progress messages only
#   $entrez_db - value of the <Database> element ("Nucleotide" or "Protein")
#   $sql       - query returning (dbprimary_acc, stable_id) rows
# Returns the number of links written.
sub _write_links {
    my ( $db, $dbname, $entrez_db, $sql ) = @_;

    my $label         = lc($entrez_db);
    my $first_file_no = $number_of_files;

    my $sth = $db->prepare($sql);
    $sth->execute();
    print STDOUT "Writing out $label links for database $dbname\n";

    my $links_written = 0;
    while ( my ( $ref_seq_accession, $ensembl_stable_id ) = $sth->fetchrow_array() ) {
        $link_no++;
        my $link = join( "\n",
            ' <Link>',
            "<LinkId>$link_no</LinkId>",
            '<ProviderId>7853</ProviderId>',
            '<ObjectSelector>',
            "<Database>$entrez_db</Database>",
            '<ObjectList>',
            "<Query>$ref_seq_accession</Query>",
            '</ObjectList>',
            '</ObjectSelector>',
            '<ObjectUrl>',
            '<Base>&base.url;</Base>',
            "<Rule>$ensembl_stable_id</Rule>",
            '</ObjectUrl>',
            '</Link>',
            '' );
        my $byte_size = do { use bytes; length($link) };

        #each file has a limit of 20Mb
        if ( $file_size + $byte_size >= 19900000 ) {
            $number_of_files++;
            new_file();
        }
        print FH $link;

        # Counted after a possible rollover so the link is charged to the
        # file it was actually written to.
        $file_size += $byte_size;
        $links_written++;
    }
    $sth->finish();

    my $message = "Written out $links_written $label links for database $dbname";
    if ( $links_written > 0 ) {
        $message .= " in file(s):\n";
        for my $file_no ( $first_file_no .. $number_of_files ) {
            # Same naming scheme as new_file(), so the reported names are the
            # names of the files on disk.
            $message .= $out_file . $file_no . ".xml\n";
        }
    }
    else {
        $message .= "\n";
    }
    print STDOUT $message;

    return $links_written;
}
# Prints the command-line help to STDERR and terminates the script.
# It is only called when the invocation is invalid, so the exit status is
# non-zero to let calling shells and pipelines detect the failure.
sub usage {
    print STDERR <<EOF;
Usage: generate_LinkOut options
-dbpattern database name pattern
-out_file output resource file name, default 'resources'
-config_file should contain one or more lines with: host user port(optional), e.g. ens-staging1 ensro
EOF
    exit(1);
}
# Starts the next resource file on the global handle FH.
# When a previous file is open (any file after the first), it is terminated
# with the closing </LinkSet> tag and closed. The new file is named
# <out_file><number_of_files>.xml, receives the XML header, and the running
# byte count is reset to the size of that header.
# Dies if the new file cannot be opened for writing.
sub new_file
{
    if ( $number_of_files > 1 ) {
        print FH "</LinkSet>";
        close FH;
    }
    my $file_name = $out_file . $number_of_files . '.xml';

    # Three-argument open keeps the mode separate from the file name; the
    # bareword FH is kept because the rest of the script prints to it.
    open( FH, '>', $file_name ) or die("Can't open $file_name: $!\n");
    print FH $header;
    $file_size = $header_size;
    return;
}
ens-staging1 ensro
ens-staging2 ensro
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment