Deprecated script

2bed7638 · Abel Ureta-Vidal · 722d1469 · 722d1469 · 722d1469 · 722d1469
Commit 2bed7638 authored 21 years ago by Abel Ureta-Vidal
--- a/scripts/clonecompara.pl
+++ b/scripts/clonecompara.pl
-#!/usr/local/bin/perl
-
-use strict;
-use Bio::EnsEMBL::Pipeline::RunnableDB::CrossComparer;
-use Bio::EnsEMBL::DBLoader;
-use Getopt::Long;
-
-$| = 1;
-
-
-my $dbtype = 'rdb';
-my $host   = 'ensrv3';
-my $port   = '';
-my $dbname = 'homo_sapiens_core_110';
-my $dbuser = 'ensro';
-my $dbpass = '';
-my $module = 'Bio::EnsEMBL::DBSQL::DBAdaptor';
-
-my $dbtype2 = 'rdb';
-my $host2   = 'ensrv3';
-my $port2   = '';
-my $dbname2 = 'homo_sapiens_core_110';
-my $dbuser2 = 'ensro';
-my $dbpass2 = '';
-my $module2 = 'Bio::EnsEMBL::DBSQL::DBAdaptor';
-my $contig;
-my $contig2;
-
-&GetOptions ( 
-	      'dbtype:s' => \$dbtype,
-	      'host:s'   => \$host,
-	      'port:n'   => \$port,
-	      'dbname:s' => \$dbname, 
-	      'dbuser:s' => \$dbuser,
-	      'dbpass:s' => \$dbpass,
-	      'module:s' => \$module,
-	      'dbtype2:s' => \$dbtype2,
-	      'host2:s'   => \$host2,
-	      'port2:n'   => \$port2,
-	      'dbname2:s' => \$dbname2, 
-	      'dbuser2:s' => \$dbuser2,
-	      'dbpass2:s' => \$dbpass2,
-	      'module2:s' => \$module2,
-	      'contig1:s' => \$contig,
-	      'contig2:s' => \$contig2
-	      );
-$contig || die("Need to specify contigs to compare (got $contig and $contig2)");
-$contig2 || die("Need to specify contigs to compare (got $contig and $contig2)");
-
-my $locator = "$module/host=$host;port=$port;dbname=$dbname;user=$dbuser;pass=$dbpass";
-print STDERR "Using $locator for db 1\n";
-
-my $locator2 = "$module2/host=$host2;port=$port2;dbname=$dbname2;user=$dbuser2;pass=$dbpass2";
-print STDERR "Using $locator for db 2\n";
-
-my $querydb =  Bio::EnsEMBL::DBLoader->new($locator);
-my $targetdb =  Bio::EnsEMBL::DBLoader->new($locator2);
-
-my $input_id = "$locator-$contig~$locator2-$contig2";
-
-my $crosscomparer = Bio::EnsEMBL::Pipeline::RunnableDB::CrossComparer->new(
-									 -dbobj => $querydb,
-									 -input_id => $input_id,
-									 -min_score => 100
-									 );
-
-$crosscomparer->fetch_input;
-$crosscomparer->run;
-$crosscomparer->write_output;
-
-
-
-
--- a/scripts/desc-from-fam.sh
+++ b/scripts/desc-from-fam.sh
-#!/bin/sh -x
-# -*- mode: sh; -*-
-# $Id$
-
-# Script to merge existing gene_descriptions with those from the families
-# (where known) into a new table called $new_gene_description, which is a
-# complete replacement for the old table.  The script assumes that the family
-# and ensembl-core database live in the same server (so you can do joins).
-
-# The new gene_description table is created inside the family  database!
-
-usage="Usage: $0 core_database family_database DBNAME -h host -u user -ppass"
-if [ $# -lt 4 ] ; then
-    echo $usage
-    exit 1
-fi
-
-# where mysql lives (or how it can be found, provided PATH is ok):
-mysql=mysql; mysql_extra_flags='--batch'
-# mysql=cat  # for debugging
-
-# Database to read existing descriptions from:
-core_db=$1; shift
-
-# family database:
-fam_db=$1; shift;
-# check the name contains 'fam', to avoid trouble:
-if echo "$fam_db" | grep -i 'fam' > /dev/null 2>&1 ; then
-    : # OK
-else
-    echo "arg 2:  '$fam_db' does not match 'fam'" >&2
-    exit 2
-fi
-
-# inside the family database, which database to read additional descriptions
-# from (e.g, ENSG, ENSMUSG etc.; it's the prefix of the IDs)
-echo "warning: hard coded stuff ahead ... ! " >&2
-db_name=$1; shift;
-# db_name='ENSEMBLGENE';
-
-# check if the db_name makes sense:
-should_match='^ENS.*GENE$' # egrep pattern
-if echo "$db_name" | egrep $should_match > /dev/null 2>&1 ; then
-    : # OK
-else
-    echo "arg 2: database name to use: '$db_name' does not match $should_match" >&2
-    exit 2
-fi
-
-# Name of the table having the existing descriptions (only changes when the
-# schema does, really, but also useful during testing/debugging)
-core_gene_desc_table='gene_description'
-
-# the table with new descriptions to be created (in the family database):
-merged_gene_desc_table='gene_description'
-# merged_gene_desc_table='merged_gene_description'
-
-# now produce the SQL (with the $variables  being replaced with their values)
-# and pipe this as input into mysql:
-(cat <<EOF
-SELECT COUNT(*) AS all_old_descriptions
-FROM $core_db.$core_gene_desc_table gd
-WHERE gd.description IS NOT NULL
-  AND gd.description NOT IN ('', 'unknown', 'UNKNOWN');
-
-# creating new local gene_description table with same layout+types as main one:
-CREATE TABLE $merged_gene_desc_table
-SELECT *
-FROM  $core_db.$core_gene_desc_table gd 
-WHERE gd.gene_id IS NULL;
-
-ALTER TABLE $merged_gene_desc_table ADD PRIMARY KEY(gene_id);
-
-# insert existing desc's (ie. the ones coming from swissprot/sptrembl)
-INSERT INTO $merged_gene_desc_table 
-  SELECT *
-  FROM $core_db.$core_gene_desc_table;
-
-# delete anything that looks remotely unknown:
-DELETE FROM $merged_gene_desc_table WHERE description is null;
-DELETE FROM $merged_gene_desc_table WHERE description = '';
-DELETE FROM $merged_gene_desc_table WHERE description = 'unknown';
-DELETE FROM $merged_gene_desc_table WHERE description = 'UNKNOWN';
-DELETE FROM $merged_gene_desc_table WHERE description REGEXP '^[ \t]*$';
-DELETE FROM $merged_gene_desc_table WHERE description REGEXP '^.$';
-
-# selecting all genes from gene, this time properly as 'unknown'
-# This will fail on the knowns, as they should; only the 'unknowns' do get
-# in. This is so that all the unknowns are recognizable by the string 'unknown'
-#
-INSERT INTO $merged_gene_desc_table
-  SELECT gene_id, 'unknown'
-  FROM $core_db.gene;
-
-# give stats:
-SELECT COUNT(*) AS unknown_old_descriptions
-FROM $merged_gene_desc_table
-WHERE description = 'unknown';
-  
-# set up a temporary table to hold descriptions from the family database, for
-# those ones that are unknown.  (This table is local to our mysql session,
-# and is deleted automatically when it ends).
-CREATE TEMPORARY TABLE tmp_new_descriptions
-  SELECT gd.gene_id, CONCAT(f.description," [Source:ENSEMBL_PROTEIN_FAMILIES;Acc:",f.id,"]") as concat
-  FROM family f, family_members fm, $merged_gene_desc_table gd
-  WHERE gd.description ='unknown'
-    AND fm.db_name ='$db_name'
-    AND fm.db_id = gd.gene_id
-    AND fm.family = f.internal_id
-    AND f.description not in ('',  'unknown', 'UNKNOWN');
-
-# get rid of the unknowns:
-DELETE FROM $merged_gene_desc_table where description = 'unknown';
-
-# and insert them from the tmp (this should produce any duplicate
-# warnings. But that wouldn't matter anyway). 
-INSERT INTO $merged_gene_desc_table
-  SELECT *
-  FROM tmp_new_descriptions;
-
-# done; give new stats:
-SELECT COUNT(*) AS all_new_descriptions
-FROM $merged_gene_desc_table;
-EOF
-) | $mysql  $mysql_extra_flags "$@" $fam_db
--- a/scripts/load_compara
+++ b/scripts/load_compara
-#!/usr/local/bin/perl
-
-use strict;
-
-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-use Bio::EnsEMBL::FeaturePair;
-
-use Getopt::Long;
-
-my $host      = 'ecs1d';
-my $dbname1   = 'homo_sapiens_core_130',
-my $dbname2   = 'mus_musculus_core_1_3',
-my $dbuser    = 'ensadmin';
-my $pass      = 'ensembl';
-my $infile;
-
-$| = 1;
-
-&GetOptions( 'host:s'     => \$host,
-             'dbuser:s'   => \$dbuser,
-             'dbname1:s'  => \$dbname1,
-             'dbname2:s'  => \$dbname2,
-             'pass:s'     => \$pass,
-	     'infile:s'   => \$infile,
-             );
-
-print STDERR "Time in dumpgff " . time . "\n";
-
-my $humandb = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host             => $host,
-						 -user             => $dbuser,
-						 -dbname           => $dbname1,
-						 -pass             => $pass,
-						 -perlonlyfeatures => 0);
-
-my $mousedb = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host             => $host,
-						 -user             => $dbuser,
-						 -dbname           => $dbname2,
-						 -pass             => $pass,
-						 -perlonlyfeatures => 0);
-
-
-print STDERR "Time after db  " . time . "\n";
-
-open(IN,"<$infile") || die "Can't open [$infile]";
-
-my %contig;
-
-open(MUS,">mouse_human.sql");
-open(HUM,">human_mouse.sql");
-
-while (<IN>) {
-  eval {
-  chomp;
-
-  my @f = split(' ',$_);
-
-  my $h_contig  = $f[0];
-  my $h_start   = $f[1];
-  my $h_end     = $f[2];
-  my $hstrand   = $f[3];
-  my $m_contig  = $f[4];
-  my $m_start   = $f[5];
-  my $m_end     = $f[6];
-  my $m_ori     = $f[7];
-
-#  print STDERR "$h_contig:$h_start:$h_end:$m_contig:$m_start\n";
-
-  if (!defined($contig{$h_contig})) {
-    my $tmp = $humandb->get_Contig($h_contig);
-    $contig{$h_contig} = $tmp;
-  }
-
-  if (!defined($contig{$m_contig})) {
-    my $tmp = $mousedb->get_Contig($m_contig);
-    $contig{$m_contig} = $tmp;
-  }
-  
-  my $hcontigid = $contig{$h_contig}->internal_id;
-  my $mcontigid = $contig{$m_contig}->internal_id;
-
-  #my $h_rawori = get_orientation($contig{$h_contig},$humandb);
-  #my $m_rawori = get_orientation($contig{$m_contig},$mousedb);
-  
-  print HUM "\\N\t$hcontigid\t$h_start\t$h_end\t100\t$m_ori\t1\texonerate\t$m_start\t$m_end\t$m_contig\tNULL\t0\t0\t0\n";
-  print MUS "\\N\t$mcontigid\t$m_start\t$m_end\t100\t$m_ori\t1\texonerate\t$h_start\t$h_end\t$h_contig\tNULL\t0\t0\t0\n";
-};
-  if ($@) {
-    print STDERR "Couldn't parse $_\n[$@]\n";
-  }
-}
-
-sub get_orientation {
-  my ($contig,$db) = @_;
-
-  my $query = "select raw_ori from static_golden_path where raw_id = " . $contig->internal_id;
-
-  my $sth = $db->prepare($query);
-  my $res = $sth->execute;
-
-  my $ref = $sth->fetchrow_hashref;
-
-  return $ref->{raw_ori};
-}
-  
-close(IN);
-close(HUM);
-close(MUS);
-  
-  
--- a/scripts/universal-id-mapper.pl
+++ b/scripts/universal-id-mapper.pl
-#!/usr/bin/env perl
-# $Id$
-# 
-# quick script for id-mapping pretty much anything to anything. 
-#
-# The idea is that this script finds, in STDIN, *all* and *only*, the ID's
-# given in the file.map argument, and replaces all of them on STDOUT.
-#
-# To make this process safer (i.e., if things on STDIN look like ID's but
-# really should be left alone), make the regexp args (see Usage or -h)
-# more restrictive. This will be especially needed for id's that are just
-# numbers (the default ID-regexp, see $dflt_i, therefore ignores numbers;
-# this may be a bit too strict to justify the name of this script :-)
-#
-# To see what happens, use the verbose (-v) option, which writes all
-# unmatcheable mappable-ids (or at-least-they-looked-like,but-may-not-be-IDs)
-
-# to stderr. This is in fact recommended, since anything that is not
-# mapped could be a sign that the script considers things to be IDs that
-# actually aren't (in which case you'll have to tweak the regexps, see
-# above). So this is warned about (once). 
-#
-# To make the things faster, use the -o option (or its opposite, -n)
-#
-# Since the script uses STDIN and STDOUT, you can set up a pipe of
-# different mappers (operating on either the same or different id's). 
-# 
-# Also and/or alternatively, you can specify several mapping files as
-# arguments; all of them will be read into the mapping hashtable.
-#
-#
-
-use strict; 
-use Getopt::Std;
-
-my $opts='hi:d:vo:n:';
-use vars qw($opt_h $opt_i $opt_d $opt_v $opt_o $opt_n);
-
-my $dflt_i = "[A-Z]{3,6}[PGET]\\d{11}";
-my $dflt_d = "[-\\\\> \t:.;,/]+";
-
-my ($prog) = ( $0 =~ m|/([^/]*)$| );
-
-my $Usage=<<END_USAGE;
-
-Usage:
-
-  $prog [ options ]  *.map  < file-with-old-ids  > file-with-new-ids
-
-  Options:
-   -h       : this message
-   -i REGEXP: use REGEXP to regcognize IDs to be mapped (default: $dflt_i)
-   -d REGEXP: use REGEXP as word delimiter on STDIN (default: $dflt_d)
-   -v       : verbose mode (prints un-mapped IDs to stderr etc.)
-   -o REGEXP: only map lines matching REGEXP (amongst others for speed)
-   -n REGEXP: map all lines apart from those matching REGEXP
-
-The *.map files should contain pairs of whitespace delimited words. 
-
-END_USAGE
-#'; # pacify emacs
-
-if ( !getopts($opts) || $opt_h ||  @ARGV==0 ) {
-    die $Usage; 
-}
-
-die "combining options -o and -n doesn't make sense\n$Usage" 
-    if ($opt_o && $opt_n);
-
-my $word_delim = ($opt_d || $dflt_d);
-my $not_allowed = '[a-zA-Z0-9_]';
-die "word delim $word_delim may not contain any of $not_allowed (to avoid problems with '\\b' word boundaries"
- if $word_delim =~ /$not_allowed/;
-
-my $id_regexp =  ($opt_i || $dflt_i);
-
-# read all the map files:
-my %map;
-my $nmappings = 0;
-warn "Reading mappings\n" if $opt_v;
-foreach my $f (@ARGV) {
-    open (FILE,"<$f") || die "$f:$!";
-    while (<FILE>) {
-	chomp;
-	my ($old, $new) = ( /(\S+)\s+(\S+)/ );
-        die "$0: didn't find old:$old -> new:$new, file $f, line $."
-          unless $old && $new;
-        if (defined $map{$old}) {
-            warn "Already saw mapping $old -> $map{$old};\n"
-              ."\twill overwrite it with later one $old -> $new "
-              ." (file $f, line $.)\n";
-        } else {
-            $nmappings++;
-        }
-	$map{$old}=$new;
-    }
-    close (FILE) || die "$f:$!";
-}
-warn "Done reading mappings\n" if $opt_v;
-
-my $remapped=0;
-my $notmapped=0;
-LINE:
-while (<STDIN>) {
-
-    if ($opt_n && /$opt_n/ || ($opt_o && !/$opt_o/) ) {
-        print;
-        next LINE;
-    }
-    chomp;
-    if ( /$id_regexp/ ) { # ... but only if anything looks like an ID
-        my @words = split /$word_delim/;
-        foreach my $w ( @words ) {
-            if ( $w =~ /$id_regexp/) {
-                if (defined $map{$w}) {
-                    s/\b$w\b/$map{$w}/g; # works on $_
-                    $remapped++;
-                } else {
-                    if ($opt_v) {
-                        warn "Unmapped IDs:\n" unless $notmapped >0;
-                        warn $w, "\n" ;
-                    } else { 
-                        warn "Found unmapped ID's; run with -v to find them\n"
-                          unless $notmapped >0; # already warned
-                    }
-                    $notmapped++;
-                }
-            }
-        }
-    }
-    print; 
-    print "\n";
-}
-my $total=$remapped+$notmapped;
-warn "Mapped $remapped out of $total identifiers, given $nmappings mappings\n";
-exit ($remapped != $total);             # i.e, 0 if OK, 1 otherwise