diff --git a/misc-scripts/link_out/README b/misc-scripts/link_out/README deleted file mode 100644 index aa3a0f409c03c5c094c9420be6fbf1b7a90b64a1..0000000000000000000000000000000000000000 --- a/misc-scripts/link_out/README +++ /dev/null @@ -1,43 +0,0 @@ -For each release, regenerate the resource files by running the script generate_LinkOut.pl: - - perl generate_LinkOut.pl -dbpattern 'core_67' -config_file linkOut_config.txt - -Upload new resource files to NCBI: - -It's best to upload them on the first day of the release. NCBI links are regenerated every day based on the resource files in each provider’s directory. - -Host: ftp-private.ncbi.nlm.nih.gov -Username: ensem -Password: xxxx - check the Confluence page for full instructions and the password: http://www.ebi.ac.uk/seqdb/confluence/display/ENS/NCBI+LinkOut+files - -Instructions: -http://www.ncbi.nlm.nih.gov/books/NBK3807/#files.Transferring_Files_via_FTP - -From a standard ftp client: - -1. -At a command prompt type: ftp ftp-private.ncbi.nlm.nih.gov and press enter. - -2. -Type your login name at the login prompt and press enter. (See how to obtain an ftp account from NCBI.) - -3. -Type your password at the password prompt and press enter. - -4. -You should now be logged into the ftp server. If you receive an error message, check your login information, type 'bye' followed by enter, and retry steps 1-3. - -5. -Type 'bin' and press enter. This switches the ftp session to BINARY transfer mode. - -6. -Type 'cd holdings' and press enter. This changes your current directory. - -7. -Type the 'put' command, followed by your pathname or drive and filename and press enter. (For example, 'put C:\filename' or 'put /home/testfiles/journalv6n3'.) - -8. -Type 'dir' and press enter to display the files in the current directory. - -9. -Type 'bye' to disconnect from the server and close the ftp session. 
# Generate LinkOut resource files for the NCBI website.
# Author: Monika Komorowska
# Date  : 06.04.2011
#
# Reads a list of MySQL servers from a config file, visits every database on
# those servers whose name matches -dbpattern, and writes one <Link> element
# per (RefSeq accession, Ensembl stable ID) pair.  Output goes to a series of
# XML files named <out_file>1.xml, <out_file>2.xml, ...; a new file is started
# before the current one would reach the size limit.
#
# Usage:
#   perl generate_LinkOut.pl -dbpattern 'core_67' -config_file linkOut_config.txt
#
# Config file (default linkOut_config.txt): one server per line, in the form
# "host user [port]".  Blank lines and lines starting with '#' are ignored.
# Example:
#   ens-staging1 ensro
#   ens-staging2 ensro

use strict;
use warnings;
use DBI;
use File::Glob qw(bsd_glob);
use Getopt::Long;

# Each resource file has a limit of 20Mb; roll over slightly below that.
my $MAX_FILE_BYTES = 19_900_000;

# Provider id written into every <Link>.
my $PROVIDER_ID = 7853;

# Closing tag appended to every resource file.
my $FOOTER = "</LinkSet>";

# The two kinds of links that are exported for every matching database.
my @LINK_TYPES = (
    {
        label     => 'nucleotide',
        entrez_db => 'Nucleotide',
        sql       => "SELECT dbprimary_acc, stable_id FROM object_xref o INNER JOIN xref x on o.xref_id = x.xref_id INNER JOIN external_db e on e.external_db_id =x.external_db_id INNER JOIN transcript on ensembl_id = transcript_id WHERE db_name in ('RefSeq_dna', 'RefSeq_dna_predicted', 'RefSeq_mRNA', 'RefSeq_mRNA_predicted', 'RefSeq_ncRNA', 'RefSeq_ncRNA_predicted') GROUP BY dbprimary_acc, stable_id",
    },
    {
        label     => 'protein',
        entrez_db => 'Protein',
        sql       => "SELECT dbprimary_acc, stable_id FROM object_xref o INNER JOIN xref x on o.xref_id = x.xref_id INNER JOIN external_db e on e.external_db_id =x.external_db_id INNER JOIN translation on ensembl_id = translation_id WHERE db_name in ('RefSeq_peptide', 'RefSeq_peptide_predicted') group by dbprimary_acc, stable_id",
    },
);

my ( $dbpattern, $out_file, $config_file );

GetOptions(
    "dbpattern|pattern=s", \$dbpattern,
    "out_file=s",          \$out_file,
    "config_file=s",       \$config_file,
) or usage();

if ( !$dbpattern ) {
    usage();
}
if ( !$config_file ) {
    $config_file = "linkOut_config.txt";
}
if ( !$out_file ) {
    $out_file = "resources";
}

# Compile once; an invalid pattern fails here rather than deep inside the loop.
my $dbname_re = qr/$dbpattern/;

my @servers = read_config($config_file);

# Delete the resource files left over from a previous run.  Only files that
# follow this script's own naming scheme are removed, and the script keeps
# running afterwards.
for my $stale_file ( bsd_glob( $out_file . '[0-9]*.xml' ) ) {
    unlink $stale_file
        or warn "Can't delete old resource file $stale_file: $!\n";
}

my $header = <<'HEADER';
<?xml version="1.0"?>
<!DOCTYPE LinkSet PUBLIC "-//NLM//DTD LinkOut 1.0//EN"
"http://www.ncbi.nlm.nih.gov/entrez/linkout/doc/LinkOut.dtd"
[<!ENTITY base.url "http://www.ensembl.org/id/">]>

<LinkSet>
HEADER

my $header_size = byte_length($header);

# Output state shared by new_file(), close_current_file() and write_links().
my $out_fh;                  # handle of the resource file currently being written
my $file_size       = 0;     # bytes written to the current file so far
my $number_of_files = 0;     # index of the current file (1-based once opened)
my $link_no         = 0;     # running <LinkId>, unique across all files

# The first file is always created, even when no links are found.
new_file();

SERVER:
for my $server (@servers) {
    my ( $host, $user, $port ) = @{$server}{qw(host user port)};

    my $dsn = "DBI:mysql:host=$host";
    if ( defined $port ) {
        $dsn .= ";port=$port";
    }

    # DBI reports the low-level reason on STDERR (PrintError); return values
    # are checked below so that one bad server or database does not abort the
    # whole run.
    my $db = DBI->connect( $dsn, $user, undef,
        { PrintError => 1, RaiseError => 0 } );
    if ( !defined $db ) {
        my $port_text = defined $port ? $port : 'default';
        print STDOUT
            "Can't connect to host: $host, port: $port_text, user: $user\n";
        next SERVER;
    }

    my $dbnames = $db->selectcol_arrayref("show databases");
    if ( !defined $dbnames ) {
        warn "Can't list databases on host $host\n";
        $dbnames = [];
    }

    DATABASE:
    for my $dbname ( @{$dbnames} ) {
        next DATABASE if $dbname !~ $dbname_re;

        if ( !$db->do( "use " . $db->quote_identifier($dbname) ) ) {
            warn "Can't switch to database $dbname on host $host\n";
            next DATABASE;
        }

        for my $link_type (@LINK_TYPES) {
            write_links( $db, $dbname, $link_type );
        }
    }

    $db->disconnect();
}

# Terminate the last file exactly once, after every server has been handled.
# Closing it per server would lose all links from the following servers.
close_current_file();

# Print the command-line help to STDERR and exit with a non-zero status,
# because this is only reached when the arguments are missing or invalid.
sub usage {
    print STDERR <<'EOF';

  Usage: generate_LinkOut options
    -dbpattern    database name pattern
    -out_file     output resource file name prefix, default 'resources'
                  (files are written as <out_file>1.xml, <out_file>2.xml, ...)
    -config_file  default 'linkOut_config.txt'; should contain one or more
                  lines with: host user port(optional), e.g. ens-staging1 ensro
EOF
    exit 1;
}

# Parse the server list.
#   $path - config file to read
# Returns a list of hash refs { host, user, port }; port is undef when the
# line does not specify one.  Dies if the file cannot be opened.
sub read_config {
    my ($path) = @_;

    open( my $config_fh, '<', $path ) or die("Can't open $path: $!\n");

    my @parsed;
    while ( my $line = <$config_fh> ) {
        next if $line =~ /^\s*(?:#.*)?$/;    # blank line or comment

        if ( $line =~ /^\s*(\S+)\s+(\S+)(?:\s+(\d+))?/ ) {
            push @parsed, { host => $1, user => $2, port => $3 };
        }
        else {
            chomp $line;
            warn "Skipping malformed line $. in $path: '$line'\n";
        }
    }
    close $config_fh;

    return @parsed;
}

# Name of the n-th resource file; the single place that defines the naming
# scheme, so the files opened and the files reported always agree.
sub file_name_for {
    my ($file_no) = @_;
    return $out_file . $file_no . '.xml';
}

# Length of a string in bytes (not characters), as it will occupy on disk.
sub byte_length {
    my ($string) = @_;
    use bytes;
    return length($string);
}

# Escape the characters that are not allowed verbatim in XML text content.
sub xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Build the XML for one link.
#   $id        - value for <LinkId>
#   $entrez_db - NCBI database name ('Nucleotide' or 'Protein')
#   $accession - RefSeq accession used as the query
#   $stable_id - Ensembl stable ID appended to the base URL
sub format_link {
    my ( $id, $entrez_db, $accession, $stable_id ) = @_;

    my $query = xml_escape($accession);
    my $rule  = xml_escape($stable_id);

    return <<"LINK";
  <Link>
    <LinkId>$id</LinkId>
    <ProviderId>$PROVIDER_ID</ProviderId>
    <ObjectSelector>
      <Database>$entrez_db</Database>
      <ObjectList>
        <Query>$query</Query>
      </ObjectList>
    </ObjectSelector>
    <ObjectUrl>
      <Base>&base.url;</Base>
      <Rule>$rule</Rule>
    </ObjectUrl>
  </Link>
LINK
}

# Export all links of one type from the currently selected database.
#   $db        - connected DBI handle with $dbname already selected
#   $dbname    - database name, used for progress messages only
#   $link_type - entry of @LINK_TYPES (label, entrez_db, sql)
# Returns the number of links written.  A failing query is reported and
# treated as "no links" so that the remaining databases are still processed.
sub write_links {
    my ( $db, $dbname, $link_type ) = @_;
    my $label = $link_type->{label};

    my $sth = $db->prepare( $link_type->{sql} );
    if ( !$sth || !$sth->execute() ) {
        warn "Can't fetch $label links from database $dbname\n";
        return 0;
    }

    print STDOUT "Writing out $label links for database $dbname\n";

    my $first_file_no;       # file that received the first link of this batch
    my $links_written = 0;

    while ( my ( $accession, $stable_id ) = $sth->fetchrow_array() ) {

        # A link without a query or a target is useless to NCBI.
        next if !defined $accession || !defined $stable_id;

        $link_no++;
        my $link = format_link( $link_no, $link_type->{entrez_db},
            $accession, $stable_id );
        my $link_size = byte_length($link);

        # Roll over *before* writing so the link is counted against the file
        # it actually lands in.  Never roll over an empty file.
        if (   $file_size > $header_size
            && $file_size + $link_size >= $MAX_FILE_BYTES )
        {
            new_file();
        }

        print {$out_fh} $link
            or die( "Can't write to "
                . file_name_for($number_of_files)
                . ": $!\n" );
        $file_size += $link_size;

        $first_file_no = $number_of_files if !defined $first_file_no;
        $links_written++;
    }
    $sth->finish();

    my $message =
        "Written out $links_written $label links for database $dbname";
    if ( $links_written > 0 ) {
        $message .= " in file(s):\n";
        for my $file_no ( $first_file_no .. $number_of_files ) {
            $message .= file_name_for($file_no) . "\n";
        }
    }
    else {
        $message .= "\n";
    }
    print STDOUT $message;

    return $links_written;
}

# Write the closing tag to the current resource file and close it.
# Does nothing when no file is open.  Dies if the data cannot be flushed.
sub close_current_file {
    return if !defined $out_fh;

    my $file_name = file_name_for($number_of_files);
    print {$out_fh} $FOOTER or die("Can't write to $file_name: $!\n");
    close $out_fh           or die("Can't close $file_name: $!\n");
    undef $out_fh;
    return;
}

# Finish the current resource file (if any) and start the next one, writing
# the XML header and resetting the size counter.
sub new_file {
    close_current_file();

    $number_of_files++;
    my $file_name = file_name_for($number_of_files);
    open( $out_fh, '>', $file_name ) or die("Can't open $file_name: $!\n");
    print {$out_fh} $header or die("Can't write to $file_name: $!\n");
    $file_size = $header_size;
    return;
}