From 4283b9b5eb537f82579403ec071e3dd091b8ea1d Mon Sep 17 00:00:00 2001
From: Monika Komorowska <mk8@sanger.ac.uk>
Date: Wed, 20 Apr 2011 13:03:28 +0000
Subject: [PATCH] *** empty log message ***

---
 misc-scripts/link_out/generate_LinkOut.pl | 226 ++++++++++++++++++++++
 misc-scripts/link_out/linkOut_config.txt  |   2 +
 2 files changed, 228 insertions(+)
 create mode 100644 misc-scripts/link_out/generate_LinkOut.pl
 create mode 100644 misc-scripts/link_out/linkOut_config.txt

diff --git a/misc-scripts/link_out/generate_LinkOut.pl b/misc-scripts/link_out/generate_LinkOut.pl
new file mode 100644
index 0000000000..f2a5707c1b
--- /dev/null
+++ b/misc-scripts/link_out/generate_LinkOut.pl
@@ -0,0 +1,226 @@
+# Generate LinkOut resource file for NCBI website.
+# Author: Monika Komorowska
+# Date : 06.04.2011
+
+# Upload generated files:
+# http://www.ncbi.nlm.nih.gov/books/NBK3807/#files.Transferring_Files_via_FTP
+ 
+
+use strict;
+use DBI;
+use Getopt::Long;
+
+sub new_file;
+
+# Command-line settings; see usage() for their meaning.
+my ( $dbpattern, $out_file, $config_file );
+
+# Stop with the usage text on unparsable options instead of carrying on.
+GetOptions( "dbpattern|pattern=s", \$dbpattern,
+	    "out_file=s", \$out_file,
+	    "config_file=s", \$config_file,
+	  ) or usage();
+
+# A database name pattern is mandatory.
+usage() if !$dbpattern;
+
+$config_file = "linkOut_config.txt" if !$config_file;
+
+# Each config line holds "host user [port]". Three-argument open with a
+# lexical handle keeps a file name that starts with a mode character from
+# changing how the file is opened; $! reports why the open failed.
+open( my $config_fh, '<', $config_file )
+  or die("Can't open $config_file: $!\n");
+
+# Blank lines carry no host entry, so drop them here.
+my @hosts = grep { /\S/ } <$config_fh>;
+close $config_fh;
+
+$out_file = "resources" if !$out_file;
+
+# Output state shared with new_file(): bytes written to the current file
+# and the 1-based index of the file being written.
+my ( $file_size, $number_of_files ) = ( undef, 1 );
+
+# Prolog written at the top of every resource file.
+my $header = <<"HEADER";
+<?xml version="1.0"?>
+<!DOCTYPE LinkSet PUBLIC "-//NLM//DTD LinkOut 1.0//EN"
+"http://www.ncbi.nlm.nih.gov/entrez/linkout/doc/LinkOut.dtd"
+[<!ENTITY base.url "http://www.ensembl.org/id/">]>
+
+<LinkSet>
+HEADER
+
+# Size of the prolog in bytes rather than characters.
+my $header_size = do { use bytes; length($header) };
+
+# Running id shared by all links across every output file.
+my $link_no = 0;
+new_file();
+
+# Walk every configured server and every database on it whose name matches
+# -dbpattern, writing one <Link> element per RefSeq accession / Ensembl
+# stable id pair returned by the xref queries below.
+foreach my $host_line (@hosts) {
+  # Skip lines that do not hold at least "host user"; an unmatched line would
+  # otherwise leave host and user undefined for the connection attempt.
+  next unless $host_line =~ /([^\s]+)\s+([^\s]+)\s*(\d*)/;
+  my ( $host, $user, $port ) = ( $1, $2, $3 );
+
+  my $dsn = "DBI:mysql:host=$host";
+  if( $port =~ /\d+/) {
+    $dsn .= ";port=$port";
+  }
+  my $db = DBI->connect( $dsn, $user);
+  if (!defined $db) {
+    my $port_label = ( $port =~ /\d+/ ) ? $port : 'default';
+    print STDOUT "Can't connect to host: $host, port: $port_label, user: $user\n";
+    next;
+  }
+
+  # names of all databases the server lists for this connection
+  my @dbnames = map {$_->[0] } @{ $db->selectall_arrayref( "show databases" ) };
+
+  for my $dbname ( @dbnames ) {
+    next if $dbpattern && $dbname !~ /$dbpattern/;
+
+    # select the database so the unqualified table names in the queries
+    # below resolve against it
+    $db->do( "use $dbname" );
+    #get nucleotide data
+    my ($entrez_db, $ref_seq_accession,$ensembl_stable_id);
+    # file in use when this kind of link starts; first one named in the report
+    my $current_file_no = $number_of_files;
+    $entrez_db = "Nucleotide";
+
+    my $sth  = $db->prepare("SELECT dbprimary_acc,  stable_id FROM object_xref o INNER JOIN xref x on o.xref_id = x.xref_id INNER JOIN external_db e on e.external_db_id =x.external_db_id INNER JOIN transcript_stable_id on ensembl_id = transcript_id WHERE db_name in ('RefSeq_dna', 'RefSeq_dna_predicted') GROUP BY dbprimary_acc,  stable_id");
+    $sth->execute();
+    print STDOUT "Writing out nucleotide links for database $dbname\n";
+    my $nucleotide_links = 0;
+    while ( ($ref_seq_accession,$ensembl_stable_id) = $sth->fetchrow_array() )
+    {
+      $link_no ++;
+my $link = " <Link>
+  <LinkId>$link_no</LinkId>
+  <ProviderId>7853</ProviderId>
+  <ObjectSelector>
+    <Database>$entrez_db</Database>
+    <ObjectList>
+      <Query>$ref_seq_accession</Query>
+    </ObjectList>
+  </ObjectSelector>
+  <ObjectUrl>
+    <Base>&base.url;</Base>
+    <Rule>$ensembl_stable_id</Rule>
+  </ObjectUrl>
+ </Link>\n";
+      my $byte_size = do { use bytes; length($link) };
+      # Each file has a limit of 20Mb. Start a new file before writing when
+      # this link would cross the threshold, and only then add the link's
+      # size, so it is counted against the file it is written to
+      # (new_file() resets $file_size to the header size).
+      if ($file_size + $byte_size >= 19900000) {
+        $number_of_files ++;
+        new_file();
+      }
+      $file_size += $byte_size;
+      print FH $link;
+      $nucleotide_links ++;
+    }
+
+    $sth->finish();
+    my $message = "Written out $nucleotide_links nucleotide links for database $dbname";
+    if ($nucleotide_links > 0) {
+      $message .= " in file(s):\n";
+      # list the names new_file() actually creates: <out_file><n>.xml
+      for my $i ( $current_file_no .. $number_of_files ) {
+        $message .= $out_file . $i . ".xml\n";
+      }
+    } else {
+      $message .= "\n";
+    }
+    print STDOUT $message;
+
+    #get protein data
+    $current_file_no = $number_of_files;
+    $entrez_db = "Protein";
+    $sth  = $db->prepare("SELECT dbprimary_acc,  stable_id FROM object_xref o INNER JOIN xref x on o.xref_id = x.xref_id INNER JOIN external_db e on e.external_db_id =x.external_db_id INNER JOIN translation_stable_id on ensembl_id = translation_id WHERE db_name in ('RefSeq_peptide', 'RefSeq_peptide_predicted') group by dbprimary_acc,  stable_id");
+    $sth->execute();
+    print STDOUT "Writing out protein links for database $dbname\n";
+    my $protein_links = 0;
+    while ( ($ref_seq_accession,$ensembl_stable_id) = $sth->fetchrow_array() )
+    {
+      $link_no ++;
+my $link = " <Link>
+  <LinkId>$link_no</LinkId>
+  <ProviderId>7853</ProviderId>
+  <ObjectSelector>
+    <Database>$entrez_db</Database>
+    <ObjectList>
+      <Query>$ref_seq_accession</Query>
+    </ObjectList>
+  </ObjectSelector>
+  <ObjectUrl>
+    <Base>&base.url;</Base>
+    <Rule>$ensembl_stable_id</Rule>
+  </ObjectUrl>
+ </Link>\n";
+      my $byte_size = do { use bytes; length($link) };
+      # same rotation rule as for the nucleotide links above
+      if ($file_size + $byte_size >= 19900000) {
+        $number_of_files ++;
+        new_file();
+      }
+      $file_size += $byte_size;
+      print FH $link;
+      $protein_links ++;
+    }
+
+    $sth->finish();
+    $message = "Written out $protein_links protein links for database $dbname";
+    if ($protein_links > 0) {
+      $message .= " in file(s):\n";
+      for my $i ( $current_file_no .. $number_of_files ) {
+        $message .= $out_file . $i . ".xml\n";
+      }
+    } else {
+      $message .= "\n";
+    }
+    print STDOUT $message;
+  }
+
+  $db->disconnect();
+}
+
+# FH is shared by all hosts and only rotated by new_file(), so the closing
+# tag is written once here, after the last host. Closing inside the host
+# loop would leave later hosts printing to a closed handle, and would be
+# skipped whenever a connection attempt failed.
+print FH "</LinkSet>";
+close FH or die("Can't close output file: $!\n");
+# Print the help text to STDERR and exit non-zero: only called on bad arguments.
+sub usage {
+  print STDERR <<EOF;
+
+             Usage: generate_LinkOut options
+	 	    -dbpattern database name pattern
+		    -out_file output resource file name, default 'resources'
+		    -config_file should contain one or more lines with: host user port(optional), e.g. ens-staging1 ensro
+EOF
+  exit 1;
+}
+
+# Close the previous output file, if any, and start file $number_of_files.
+sub new_file {
+  if ($number_of_files > 1) {
+    print FH "</LinkSet>";
+    close FH or die("Can't close output file: $!\n");
+  }
+  my $file_name = $out_file . $number_of_files . '.xml';
+  open( FH, '>', $file_name ) or die("Can't open $file_name: $!\n");
+  print FH $header;
+  $file_size = $header_size;  # only the header has been written so far
+}
+
+
diff --git a/misc-scripts/link_out/linkOut_config.txt b/misc-scripts/link_out/linkOut_config.txt
new file mode 100644
index 0000000000..6d0bb9e41f
--- /dev/null
+++ b/misc-scripts/link_out/linkOut_config.txt
@@ -0,0 +1,2 @@
+ens-staging1 ensro
+ens-staging2 ensro
-- 
GitLab