Skip to content
Snippets Groups Projects
Commit 2bed7638 authored by Abel Ureta-Vidal's avatar Abel Ureta-Vidal
Browse files

Deprecated script

parent 722d1469
No related branches found
No related tags found
No related merge requests found
#!/usr/local/bin/perl
use strict;
use Bio::EnsEMBL::Pipeline::RunnableDB::CrossComparer;
use Bio::EnsEMBL::DBLoader;
use Getopt::Long;
$| = 1;
my $dbtype = 'rdb';
my $host = 'ensrv3';
my $port = '';
my $dbname = 'homo_sapiens_core_110';
my $dbuser = 'ensro';
my $dbpass = '';
my $module = 'Bio::EnsEMBL::DBSQL::DBAdaptor';
my $dbtype2 = 'rdb';
my $host2 = 'ensrv3';
my $port2 = '';
my $dbname2 = 'homo_sapiens_core_110';
my $dbuser2 = 'ensro';
my $dbpass2 = '';
my $module2 = 'Bio::EnsEMBL::DBSQL::DBAdaptor';
my $contig;
my $contig2;
&GetOptions (
'dbtype:s' => \$dbtype,
'host:s' => \$host,
'port:n' => \$port,
'dbname:s' => \$dbname,
'dbuser:s' => \$dbuser,
'dbpass:s' => \$dbpass,
'module:s' => \$module,
'dbtype2:s' => \$dbtype2,
'host2:s' => \$host2,
'port2:n' => \$port2,
'dbname2:s' => \$dbname2,
'dbuser2:s' => \$dbuser2,
'dbpass2:s' => \$dbpass2,
'module2:s' => \$module2,
'contig1:s' => \$contig,
'contig2:s' => \$contig2
);
$contig || die("Need to specify contigs to compare (got $contig and $contig2)");
$contig2 || die("Need to specify contigs to compare (got $contig and $contig2)");
my $locator = "$module/host=$host;port=$port;dbname=$dbname;user=$dbuser;pass=$dbpass";
print STDERR "Using $locator for db 1\n";
my $locator2 = "$module2/host=$host2;port=$port2;dbname=$dbname2;user=$dbuser2;pass=$dbpass2";
print STDERR "Using $locator for db 2\n";
my $querydb = Bio::EnsEMBL::DBLoader->new($locator);
my $targetdb = Bio::EnsEMBL::DBLoader->new($locator2);
my $input_id = "$locator-$contig~$locator2-$contig2";
my $crosscomparer = Bio::EnsEMBL::Pipeline::RunnableDB::CrossComparer->new(
-dbobj => $querydb,
-input_id => $input_id,
-min_score => 100
);
$crosscomparer->fetch_input;
$crosscomparer->run;
$crosscomparer->write_output;
#!/bin/sh -x
# -*- mode: sh; -*-
# $Id$
# Script to merge existing gene_descriptions with those from the families
# (where known) into a new table called $new_gene_description, which is a
# complete replacement for the old table. The script assumes that the family
# and ensembl-core database live in the same server (so you can do joins).
# The new gene_description table is created inside the family database!
usage="Usage: $0 core_database family_database DBNAME -h host -u user -ppass"
if [ $# -lt 4 ] ; then
echo $usage
exit 1
fi
# where mysql lives (or how it can be found, provided PATH is ok):
mysql=mysql; mysql_extra_flags='--batch'
# mysql=cat # for debugging
# Database to read existing descriptions from:
core_db=$1; shift
# family database:
fam_db=$1; shift;
# check the name contains 'fam', to avoid trouble:
if echo "$fam_db" | grep -i 'fam' > /dev/null 2>&1 ; then
: # OK
else
echo "arg 2: '$fam_db' does not match 'fam'" >&2
exit 2
fi
# inside the family database, which database to read additional descriptions
# from (e.g, ENSG, ENSMUSG etc.; it's the prefix of the IDs)
echo "warning: hard coded stuff ahead ... ! " >&2
db_name=$1; shift;
# db_name='ENSEMBLGENE';
# check if the db_name makes sense:
should_match='^ENS.*GENE$' # egrep pattern
if echo "$db_name" | egrep $should_match > /dev/null 2>&1 ; then
: # OK
else
echo "arg 2: database name to use: '$db_name' does not match $should_match" >&2
exit 2
fi
# Name of the table having the existing descriptions (only changes when the
# schema does, really, but also useful during testing/debugging)
core_gene_desc_table='gene_description'
# the table with new descriptions to be created (in the family database):
merged_gene_desc_table='gene_description'
# merged_gene_desc_table='merged_gene_description'
# now produce the SQL (with the $variables being replaced with their values)
# and pipe this as input into mysql:
(cat <<EOF
SELECT COUNT(*) AS all_old_descriptions
FROM $core_db.$core_gene_desc_table gd
WHERE gd.description IS NOT NULL
AND gd.description NOT IN ('', 'unknown', 'UNKNOWN');
# creating new local gene_description table with same layout+types as main one:
CREATE TABLE $merged_gene_desc_table
SELECT *
FROM $core_db.$core_gene_desc_table gd
WHERE gd.gene_id IS NULL;
ALTER TABLE $merged_gene_desc_table ADD PRIMARY KEY(gene_id);
# insert existing desc's (ie. the ones coming from swissprot/sptrembl)
INSERT INTO $merged_gene_desc_table
SELECT *
FROM $core_db.$core_gene_desc_table;
# delete anything that looks remotely unknown:
DELETE FROM $merged_gene_desc_table WHERE description is null;
DELETE FROM $merged_gene_desc_table WHERE description = '';
DELETE FROM $merged_gene_desc_table WHERE description = 'unknown';
DELETE FROM $merged_gene_desc_table WHERE description = 'UNKNOWN';
DELETE FROM $merged_gene_desc_table WHERE description REGEXP '^[ \t]*$';
DELETE FROM $merged_gene_desc_table WHERE description REGEXP '^.$';
# selecting all genes from gene, this time properly as 'unknown'
# This will fail on the knowns, as they should; only the 'unknowns' do get
# in. This is so that all the unknowns are recognizable by the string 'unknown'
#
INSERT INTO $merged_gene_desc_table
SELECT gene_id, 'unknown'
FROM $core_db.gene;
# give stats:
SELECT COUNT(*) AS unknown_old_descriptions
FROM $merged_gene_desc_table
WHERE description = 'unknown';
# set up a temporary table to hold descriptions from the family database, for
# those ones that are unknown. (This table is local to our mysql session,
# and is deleted automatically when it ends).
CREATE TEMPORARY TABLE tmp_new_descriptions
SELECT gd.gene_id, CONCAT(f.description," [Source:ENSEMBL_PROTEIN_FAMILIES;Acc:",f.id,"]") as concat
FROM family f, family_members fm, $merged_gene_desc_table gd
WHERE gd.description ='unknown'
AND fm.db_name ='$db_name'
AND fm.db_id = gd.gene_id
AND fm.family = f.internal_id
AND f.description not in ('', 'unknown', 'UNKNOWN');
# get rid of the unknowns:
DELETE FROM $merged_gene_desc_table where description = 'unknown';
# and insert them from the tmp (this should produce any duplicate
# warnings. But that wouldn't matter anyway).
INSERT INTO $merged_gene_desc_table
SELECT *
FROM tmp_new_descriptions;
# done; give new stats:
SELECT COUNT(*) AS all_new_descriptions
FROM $merged_gene_desc_table;
EOF
) | $mysql $mysql_extra_flags "$@" $fam_db
#!/usr/local/bin/perl
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::FeaturePair;
use Getopt::Long;
my $host = 'ecs1d';
my $dbname1 = 'homo_sapiens_core_130',
my $dbname2 = 'mus_musculus_core_1_3',
my $dbuser = 'ensadmin';
my $pass = 'ensembl';
my $infile;
$| = 1;
&GetOptions( 'host:s' => \$host,
'dbuser:s' => \$dbuser,
'dbname1:s' => \$dbname1,
'dbname2:s' => \$dbname2,
'pass:s' => \$pass,
'infile:s' => \$infile,
);
print STDERR "Time in dumpgff " . time . "\n";
my $humandb = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host,
-user => $dbuser,
-dbname => $dbname1,
-pass => $pass,
-perlonlyfeatures => 0);
my $mousedb = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host,
-user => $dbuser,
-dbname => $dbname2,
-pass => $pass,
-perlonlyfeatures => 0);
print STDERR "Time after db " . time . "\n";
open(IN,"<$infile") || die "Can't open [$infile]";
my %contig;
open(MUS,">mouse_human.sql");
open(HUM,">human_mouse.sql");
while (<IN>) {
eval {
chomp;
my @f = split(' ',$_);
my $h_contig = $f[0];
my $h_start = $f[1];
my $h_end = $f[2];
my $hstrand = $f[3];
my $m_contig = $f[4];
my $m_start = $f[5];
my $m_end = $f[6];
my $m_ori = $f[7];
# print STDERR "$h_contig:$h_start:$h_end:$m_contig:$m_start\n";
if (!defined($contig{$h_contig})) {
my $tmp = $humandb->get_Contig($h_contig);
$contig{$h_contig} = $tmp;
}
if (!defined($contig{$m_contig})) {
my $tmp = $mousedb->get_Contig($m_contig);
$contig{$m_contig} = $tmp;
}
my $hcontigid = $contig{$h_contig}->internal_id;
my $mcontigid = $contig{$m_contig}->internal_id;
#my $h_rawori = get_orientation($contig{$h_contig},$humandb);
#my $m_rawori = get_orientation($contig{$m_contig},$mousedb);
print HUM "\\N\t$hcontigid\t$h_start\t$h_end\t100\t$m_ori\t1\texonerate\t$m_start\t$m_end\t$m_contig\tNULL\t0\t0\t0\n";
print MUS "\\N\t$mcontigid\t$m_start\t$m_end\t100\t$m_ori\t1\texonerate\t$h_start\t$h_end\t$h_contig\tNULL\t0\t0\t0\n";
};
if ($@) {
print STDERR "Couldn't parse $_\n[$@]\n";
}
}
sub get_orientation {
my ($contig,$db) = @_;
my $query = "select raw_ori from static_golden_path where raw_id = " . $contig->internal_id;
my $sth = $db->prepare($query);
my $res = $sth->execute;
my $ref = $sth->fetchrow_hashref;
return $ref->{raw_ori};
}
close(IN);
close(HUM);
close(MUS);
#!/usr/bin/env perl
# $Id$
#
# quick script for id-mapping pretty much anything to anything.
#
# The idea is that this script finds, in STDIN, *all* and *only*, the ID's
# given in the file.map argument, and replaces all of them on STDOUT.
#
# To make this process safer (i.e., if things on STDIN look like ID's but
# really should be left alone), make the regexp args (see Usage or -h)
# more restrictive. This will be especially needed for id's that are just
# numbers (the default ID-regexp, see $dflt_i, therefore ignores numbers;
# this may be a bit too strict to justify the name of this script :-)
#
# To see what happens, use the verbose (-v) option, which writes all
# unmatcheable mappable-ids (or at-least-they-looked-like,but-may-not-be-IDs)
# to stderr. This is in fact recommended, since anything that is not
# mapped could be a sign that the script considers things to be IDs that
# actually aren't (in which case you'll have to tweak the regexps, see
# above). So this is warned about (once).
#
# To make the things faster, use the -o option (or its opposite, -n)
#
# Since the script uses STDIN and STDOUT, you can set up a pipe of
# different mappers (operating on either the same or different id's).
#
# Also and/or alternatively, you can specify several mapping files as
# arguments; all of them will be read into the mapping hashtable.
#
#
use strict;
use Getopt::Std;
my $opts='hi:d:vo:n:';
use vars qw($opt_h $opt_i $opt_d $opt_v $opt_o $opt_n);
my $dflt_i = "[A-Z]{3,6}[PGET]\\d{11}";
my $dflt_d = "[-\\\\> \t:.;,/]+";
my ($prog) = ( $0 =~ m|/([^/]*)$| );
my $Usage=<<END_USAGE;
Usage:
$prog [ options ] *.map < file-with-old-ids > file-with-new-ids
Options:
-h : this message
-i REGEXP: use REGEXP to regcognize IDs to be mapped (default: $dflt_i)
-d REGEXP: use REGEXP as word delimiter on STDIN (default: $dflt_d)
-v : verbose mode (prints un-mapped IDs to stderr etc.)
-o REGEXP: only map lines matching REGEXP (amongst others for speed)
-n REGEXP: map all lines apart from those matching REGEXP
The *.map files should contain pairs of whitespace delimited words.
END_USAGE
#'; # pacify emacs
if ( !getopts($opts) || $opt_h || @ARGV==0 ) {
die $Usage;
}
die "combining options -o and -n doesn't make sense\n$Usage"
if ($opt_o && $opt_n);
my $word_delim = ($opt_d || $dflt_d);
my $not_allowed = '[a-zA-Z0-9_]';
die "word delim $word_delim may not contain any of $not_allowed (to avoid problems with '\\b' word boundaries"
if $word_delim =~ /$not_allowed/;
my $id_regexp = ($opt_i || $dflt_i);
# read all the map files:
my %map;
my $nmappings = 0;
warn "Reading mappings\n" if $opt_v;
foreach my $f (@ARGV) {
open (FILE,"<$f") || die "$f:$!";
while (<FILE>) {
chomp;
my ($old, $new) = ( /(\S+)\s+(\S+)/ );
die "$0: didn't find old:$old -> new:$new, file $f, line $."
unless $old && $new;
if (defined $map{$old}) {
warn "Already saw mapping $old -> $map{$old};\n"
."\twill overwrite it with later one $old -> $new "
." (file $f, line $.)\n";
} else {
$nmappings++;
}
$map{$old}=$new;
}
close (FILE) || die "$f:$!";
}
warn "Done reading mappings\n" if $opt_v;
my $remapped=0;
my $notmapped=0;
LINE:
while (<STDIN>) {
if ($opt_n && /$opt_n/ || ($opt_o && !/$opt_o/) ) {
print;
next LINE;
}
chomp;
if ( /$id_regexp/ ) { # ... but only if anything looks like an ID
my @words = split /$word_delim/;
foreach my $w ( @words ) {
if ( $w =~ /$id_regexp/) {
if (defined $map{$w}) {
s/\b$w\b/$map{$w}/g; # works on $_
$remapped++;
} else {
if ($opt_v) {
warn "Unmapped IDs:\n" unless $notmapped >0;
warn $w, "\n" ;
} else {
warn "Found unmapped ID's; run with -v to find them\n"
unless $notmapped >0; # already warned
}
$notmapped++;
}
}
}
}
print;
print "\n";
}
my $total=$remapped+$notmapped;
warn "Mapped $remapped out of $total identifiers, given $nmappings mappings\n";
exit ($remapped != $total); # i.e, 0 if OK, 1 otherwise
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment