Commit 54f99236 authored by Patrick Meidl's avatar Patrick Meidl
Browse files

changes from branch-vega-31-dev

parent f59f6a04
...@@ -2,25 +2,58 @@ ...@@ -2,25 +2,58 @@
=head1 NAME =head1 NAME
glovar_snp_density.pl - glovar_snp_density.pl - Script to calculate glovar SNP density and stats (and
Script to calculate glovar SNP density and stats (and optionally prepare AV optionally prepare AV index dumps) for Vega.
index dumps) for Vega.
=head1 SYNOPSIS =head1 SYNOPSIS
./glovar_snp_density.pl glovar_snp_density.pl [options]
--species=Homo_sapiens
[--chr=6,13,14] General options:
[--dry_run|-n] --conffile, --conf=FILE read parameters from FILE
[--avdump|-a] (default: conf/Conversion.ini)
[--help|-h]
--dbname, --db_name=NAME use database NAME
--host, --dbhost, --db_host=HOST use database host HOST
--port, --dbport, --db_port=PORT use database port PORT
--user, --dbuser, --db_user=USER use database username USER
--pass, --dbpass, --db_pass=PASS use database password PASS
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
--logappend, --log_append append to logfile (default: truncate)
-v, --verbose verbose logging (default: false)
-i, --interactive=0|1 run script interactively (default: true)
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
Specific options:
--chromosomes, --chr=LIST only process LIST chromosomes
--avdump=0|1 create AV dump
--glovardbname=NAME use Glovar database NAME
--glovarhost=HOST use Glovar database host HOST
--glovarport=PORT use Glovar database port PORT
--glovaruser=USER use Glovar database username USER
--glovarpass=PASS use Glovar database password PASS
--oracle_home=PATH set $ORACLE_HOME env variable to PATH
--ld_library_path=PATH set $LD_LIBRARY_PATH env variable to
PATH
--glovar_snp_consequence_exp=NUM use NUM glovar SNP consequence
experiment
=head1 DESCRIPTION =head1 DESCRIPTION
This script calculates Glovar SNP densities and total numbers per chromosome This script calculates Glovar SNP densities and total numbers per chromosome
for use in mapview. Can be run for individual chromosomes if desired (default: for use in mapview. Can be run for individual chromosomes if desired (default:
all chromosomes). It optionally also dumps SNPs into a file for generating the all chromosomes). Since it uses a lot of memory, there is a wrapper script
AV search index. which runs this script one chromosome at a time (glovar_snp_wrapper.pl). It
optionally also dumps SNPs into a file for generating the AV search index.
The block size is determined so that you have 150 bins for the smallest
chromosome over 5 Mb in length. For chromosomes smaller than 5 Mb, an
additional smaller block size is used to yield 150 bins for the overall
smallest chromosome. This will result in reasonable resolution for small
chromosomes and high performance for big ones.
=head1 LICENCE =head1 LICENCE
...@@ -38,219 +71,224 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk ...@@ -38,219 +71,224 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk
=cut =cut
use strict; use strict;
use warnings;
no warnings 'uninitialized';
use FindBin qw($Bin);
use vars qw($SERVERROOT);
BEGIN { BEGIN {
$ENV{'ENSEMBL_SERVERROOT'} = "../../.."; $SERVERROOT = "$Bin/../../..";
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf"); unshift(@INC, "$SERVERROOT/ensembl-otter/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules"); unshift(@INC, "$SERVERROOT/ensembl/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules"); unshift(@INC, "$SERVERROOT/ensembl-external/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules"); unshift(@INC, "$SERVERROOT/ensembl-variation/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules"); unshift(@INC, "$SERVERROOT/bioperl-live");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-variation/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live");
} }
use SiteDefs;
use EnsWeb;
use EnsEMBL::DB::Core;
use Getopt::Long; use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
use Bio::EnsEMBL::DensityType; use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature; use Bio::EnsEMBL::DensityFeature;
use POSIX; use POSIX;
use Bio::EnsEMBL::Registry;
my $reg = "Bio::EnsEMBL::Registry"; $| = 1;
my ($species, $chr, $dry, $avdump, $help); my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
&GetOptions(
"species=s" => \$species, # parse options
"chr=s" => \$chr, $support->parse_common_options(@_);
"dry_run" => \$dry, $support->parse_extra_options(
"n" => \$dry, 'chromosomes|chr=s@',
"avdump" => \$avdump, 'avdump=s',
"a" => \$avdump, 'glovarhost=s',
"help" => \$help, 'glovarport=s',
"h" => \$help, 'glovaruser=s',
'glovarpass=s',
'glovardbname=s',
'oracle_home=s',
'ld_library_path=s',
'glovar_snp_consequence_exp=n',
);
$support->allowed_params($support->get_common_params,
'chromosomes',
'avdump',
'glovarhost',
'glovarport',
'glovaruser',
'glovarpass',
'glovardbname',
'oracle_home',
'ld_library_path',
'glovar_snp_consequence_exp',
); );
if($help || !$species){ if ($support->param('help') or $support->error) {
print qq(Usage: warn $support->error if $support->error;
./glovar_snp_density.pl pod2usage(1);
--species=Homo_sapiens
[--chr=6,13,14]
[--dry_run|-n]
[--avdump|-a]
[--help|-h]\n\n);
exit;
} }
$ENV{'ENSEMBL_SPECIES'} = $species; $support->comma_to_list('chromosomes');
## set db user/pass to allow write access
$EnsWeb::species_defs->set_write_access('ENSEMBL_DB',$species);
# connect to databases # ask user to confirm parameters to proceed
my $databases = &EnsEMBL::DB::Core::get_databases(qw(core glovar)); $support->confirm_params;
die "Problem connecting to databases: $databases->{'error'}\n" # get log filehandle and print heading and parameters to logfile
if $databases->{'error'} ; $support->init_log;
warn "Database error: $databases->{'non_fatal_error'}\n"
if $databases->{'non_fatal_error'};
# get the adaptors needed # connect to database and get adaptors
my $dfa = $databases->{'core'}->get_DensityFeatureAdaptor; my $dba = $support->get_database('ensembl');
my $dta = $databases->{'core'}->get_DensityTypeAdaptor; my $dba_glovar = $support->get_glovar_database;
my $aa = $databases->{'core'}->get_AnalysisAdaptor; my $dfa = $dba->get_DensityFeatureAdaptor;
my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor; my $dta = $dba->get_DensityTypeAdaptor;
my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor; my $aa = $dba->get_AnalysisAdaptor;
my $attrib_adaptor = $dba->get_AttributeAdaptor;
# which chromosomes do we run? # split chromosomes by size and determine block size
my @top_slices; my $chr_slices = $support->split_chromosomes_by_size(5000000);
if ($chr) {
# run chromosomes specified on commandline
foreach (split(",", $chr)) {
push @top_slices, $slice_adaptor->fetch_by_region("toplevel", $_);
}
} else {
# run all chromosomes for this species
@top_slices = @{$slice_adaptor->fetch_all("toplevel")};
}
# calculate block size (assuming 4000 blocks per genome)
my ( $block_size, $genome_size );
for my $slice ( @{$slice_adaptor->fetch_all("toplevel")} ) {
$genome_size += $slice->length;
}
$block_size = int( $genome_size / 4000 );
# analysis # create Analysis object
my $analysis = new Bio::EnsEMBL::Analysis ( my $analysis = Bio::EnsEMBL::Analysis->new(
-program => "glovar_snp_density.pl", -program => "glovar_snp_density.pl",
-database => "vega", -database => "vega",
-gff_source => "glovar_snp_density.pl", -gff_source => "glovar_snp_density.pl",
-gff_feature => "density", -gff_feature => "density",
-logic_name => "snpDensity"); -logic_name => "snpDensity",
$aa->store( $analysis ) unless $dry; );
$aa->store( $analysis ) unless ($support->param('dry_run'));
# density type
my $dt = Bio::EnsEMBL::DensityType->new(
-analysis => $analysis,
-block_size => $block_size,
-value_type => 'sum');
$dta->store($dt) unless $dry;
# loop over chromosomes
my @chr;
foreach my $sl (@top_slices) {
push @chr, $sl->seq_region_name;
}
print STDERR "\nAvailable chromosomes: @chr\n";
# settings for AV index dump # settings for AV index dump
use constant SNP_LINE => join("\t", use constant SNP_LINE => join("\t",
'Glovar SNP', '%s', '/%s/snpview?snp=%s&source=glovar', '%s', 'Glovar SNP', '%s', '/%s/snpview?snp=%s&source=glovar', '%s',
"Single nucleotide polymorphism (SNP) %s [Alleles: %s]. Alternative IDs: %s.\n" "Single nucleotide polymorphism (SNP) %s [Alleles: %s]. Alternative IDs: %s.\n"
); );
if ($avdump) { my $species = $support->species;
print STDERR "Preparing directories for AV index dump...\n"; my $fh;
my $dumpdir = "$ENV{'ENSEMBL_SERVERROOT'}/utils/indexing/input"; if ($support->param('avdump')) {
$support->log("Preparing directories for AV index dump...\n");
my $dumpdir = $support->serverroot."/utils/indexing/input";
unless (-e $dumpdir) { unless (-e $dumpdir) {
mkdir $dumpdir, 0777 or die "Could not creat directory $dumpdir: $!\n"; mkdir $dumpdir, 0777 or
$support->log_error("Could not creat directory $dumpdir: $!\n");
} }
unless (-e "$dumpdir/$species") { unless (-e "$dumpdir/$species") {
mkdir "$dumpdir/$species", 0777 or die mkdir "$dumpdir/$species", 0777 or
"Could not creat directory $dumpdir/$species: $!\n"; $support->log_error("Could not creat directory $dumpdir/$species: $!\n");
} }
open (AV, ">>$dumpdir/$species/SNP.txt") or die $fh = $support->filehandle('>>', "$dumpdir/$species/SNP.txt");
"Could not open $dumpdir/$species/SNP.txt for writing: $!\n"; $support->log("Done.\n");
print STDERR "Done.\n";
} }
my ($current_start, $current_end); # loop over block sizes
foreach my $slice (@top_slices) { my %av_done;
$current_start = 1; foreach my $block_size (keys %{ $chr_slices }) {
my $chr = $slice->seq_region_name; $support->log("Available chromosomes using block size of $block_size:\n ");
my ($total, $i); $support->log(join("\n ", map { $_->seq_region_name } @{ $chr_slices->{$block_size} })."\n");
my $bins = POSIX::ceil($slice->end / $block_size);
# create DensityType objects
print STDERR "\nSNP densities for chr $chr with block size $block_size\n"; my $dt = Bio::EnsEMBL::DensityType->new(
print STDERR "Start at " . `date`; -analysis => $analysis,
-block_size => $block_size,
# loop over blocks -value_type => 'sum',
while($current_start <= $slice->end) { );
$i++; $dta->store($dt) unless ($support->param('dry_run'));
$current_end = $current_start+$block_size-1;
if ($current_end > $slice->end) { # looping over chromosomes
$current_end = $slice->end; $support->log_stamped("Looping over chromosomes...\n");
} my ($current_start, $current_end);
my $sub_slice = $slice->sub_Slice( $current_start, $current_end ); foreach my $slice (@{ $chr_slices->{$block_size} }) {
my $count = 0; $current_start = 1;
my $chr = $slice->seq_region_name;
my ($total, $i);
my $bins = POSIX::ceil($slice->end/$block_size);
$support->log_stamped("Chromosome $chr with block size $block_size...\n", 1);
# loop over blocks
while ($current_start <= $slice->end) {
$i++;
$current_end = $current_start + $block_size - 1;
if ($current_end > $slice->end) {
$current_end = $slice->end;
}
my $sub_slice = $slice->sub_Slice($current_start, $current_end);
my $count = 0;
my $varfeats;
eval { $varfeats = $sub_slice->get_all_ExternalFeatures('GlovarSNP'); };
if ($@) {
$support->log_warning($@);
$current_start = $current_end + 1;
next;
}
my $varfeats; # only count varfeats that don't overlap slice start
eval { $varfeats = $sub_slice->get_all_ExternalFeatures('GlovarSNP'); }; # also, avoid duplicate counting
if ($@) { my %varfeats = map { $_->start > 0 ? ($_->variation_name => 1) : () } @{$varfeats};
warn $@; $count = scalar(keys %varfeats);
$current_start = $current_end + 1;
next; # AV index dump
} if ($support->param('avdump') && (! $av_done{$chr})) {
# only count varfeats that don't overlap slice start foreach my $varfeat (@{$varfeats}) {
# also, avoid duplicate counting next if ($varfeat->start < 1);
my %varfeats = map { "$_->variation_name => 1" if ($_->start >= 1) } @{$varfeats}; my $snpid = $varfeat->variation_name;
$count = scalar(keys %varfeats);
# dblinks
# AV index dump my @sources = @{ $varfeat->variation->get_all_synonym_sources };
if ($avdump) { my (@IDs, @desc);
foreach my $varfeat (@{$varfeats}) { foreach my $source (@sources) {
next if ($varfeat->start < 1); my @extIDs = @{ $varfeat->variation->get_all_synonyms($source) };
my $snpid = $varfeat->variation_name; push @IDs, @extIDs;
push @desc, "$source: @extIDs";
# dblinks }
my @sources = @{ $varfeat->variation->get_all_synonym_sources };
my (@IDs, @desc); print $fh sprintf SNP_LINE,
foreach my $source (@sources) { $snpid,
my @extIDs = @{ $varfeat->variation->get_all_synonyms($source) }; $species,
push @IDs, @extIDs; $snpid,
push @desc, "$source: @extIDs"; join(" ", @IDs),
$snpid,
$varfeat->allele_string,
join(", ", @desc)
;
} }
print AV sprintf SNP_LINE,
$snpid,
$species,
$snpid,
join(" ", @IDs),
$snpid,
$varfeat->allele_string,
join(", ", @desc)
;
} }
# density
my $df = Bio::EnsEMBL::DensityFeature->new(
-seq_region => $slice,
-start => $current_start,
-end => $current_end,
-density_type => $dt,
-density_value => $count,
);
$current_start = $current_end + 1;
$dfa->store($df) unless ($support->param('dry_run'));
$total += $count;
# logging
$support->log_verbose("Chr: $chr | Bin: $i/$bins | Count: $count | ".$support->date_and_mem."\n", 2);
} }
# density # set flag to do AV dump only once for each chromosome
my $df = Bio::EnsEMBL::DensityFeature->new $av_done{$chr} = 1;
(-seq_region => $slice,
-start => $current_start, # stats
-end => $current_end, $support->log("Total for chr $chr: $total\n", 1);
-density_type => $dt, my $stat = Bio::EnsEMBL::Attribute->new(
-density_value => $count); -NAME => 'SNPs',
$current_start = $current_end + 1; -CODE => 'SNPCount',
$dfa->store($df) unless $dry; -VALUE => $total,
$total += $count; -DESCRIPTION => 'Total Number of SNPs',
);
# logging $attrib_adaptor->store_on_Slice($slice, [$stat]) unless ($support->param('dry_run'));
print STDERR "Chr: $chr | Bin: $i/$bins | Count: $count | ";
print STDERR "Mem: " . `ps -p $$ -o vsz |tail -1`; $support->log_stamped("Done.\n", 1);
} }
$support->log_stamped("Done.\n");
# stats
print STDERR "Total for chr $chr: $total\n";
my $stat = Bio::EnsEMBL::Attribute->new
(-NAME => 'SNPs',
-CODE => 'SNPCount',
-VALUE => $total,
-DESCRIPTION => 'Total Number of SNPs');
$attrib_adaptor->store_on_Slice($slice, [$stat]) unless $dry;
} }
close AV if $avdump;
print STDERR "\nAll done at " . `date` . "\n"; # finish logfile
$support->finish_log;
#!/usr/local/bin/perl -w #!/usr/local/bin/perl
# =head1 NAME
# Calculate the GC content for top level seq_regions
# small regions 500bp to be able to display on contigview
# big regions genomesize / 4000 for 4000 features on the genome
use strict; vega_percent_gc_calc.pl - calculate GC content
BEGIN {
$ENV{'ENSEMBL_SERVERROOT'} = "../../..";
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live");
}
#use SiteDefs; =head1 SYNOPSIS
use EnsWeb;
use EnsEMBL::DB::Core;
use Getopt::Long;
use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature;
use POSIX;
use Data::Dumper; vega_percent_gc_calc.pl [options]
my ($species, $dry, $help); General options:
&GetOptions( --conffile, --conf=FILE read parameters from FILE
"species=s" => \$species, (default: conf/Conversion.ini)
"dry_run" => \$dry,
"n" => \$dry,
"help" => \$help,
"h" => \$help,
);
if($help || !$species){ --dbname, --db_name=NAME use database NAME
print qq(Usage: --host, --dbhost, --db_host=HOST use database host HOST
./vega_gene_density.pl --port, --dbport, --db_port=PORT use database port PORT
--species=Homo_sapiens --user, --dbuser, --db_user=USER use database username USER
[--dry_run|-n] --pass, --dbpass, --db_pass=PASS use database password PASS
[--help|-h]\n\n); --logfile, --log=FILE log to FILE (default: *STDOUT)
exit; --logpath=PATH write logfile to PATH (default: .)
} --logappend, --log_append append to logfile (default: truncate)
-v, --verbose verbose logging (default: false)
-i, --interactive=0|1 run script interactively (default: true)
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
$ENV{'ENSEMBL_SPECIES'} = $species; =head1 DESCRIPTION
#get the adaptors needed This script calculates GC content per chromosome.
my $slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species,"vega","Slice") or die "can't load slice adaptor - is the species name correct?";
my $dfa = Bio::EnsEMBL::Registry->get_adaptor($species,"vega","DensityFeature") or die;
my $dta = Bio::EnsEMBL::Registry->get_adaptor($species,"vega","DensityType") or die;
my $aa = Bio::EnsEMBL::Registry->get_adaptor($species,"vega","Analysis") or die;
## set db user/pass to allow write access The block size is determined so that you have 150 bins for the smallest
$EnsWeb::species_defs->set_write_access('ENSEMBL_DB',$species); chromosome over 5 Mb in length. For chromosomes smaller than 5 Mb, an
additional smaller block size is used to yield 150 bins for the overall
smallest chromosome. This will result in reasonable resolution for small
chromosomes and high performance for big ones.
my $top_slices = $slice_adaptor->fetch_all( "toplevel" ); =head1 LICENCE