Commit 3ec37fc5 authored by Patrick Meidl's avatar Patrick Meidl
Browse files

changes from branch-vega-23-dev

parent f6782625
......@@ -3,7 +3,8 @@
=head1 NAME
glovar_snp_density.pl -
script to calculate glovar SNP density and stats for Vega
Script to calculate glovar SNP density and stats (and optioanlly prepare AV
index dumps) for Vega.
=head1 SYNOPSIS
......@@ -11,13 +12,15 @@ script to calculate glovar SNP density and stats for Vega
--species=Homo_sapiens
[--chr=6,13,14]
[--dry-run|-n]
[--avdump|-a]
[--help|-h]
=head1 DESCRIPTION
This script calculates Glovar SNP densities and total numbers per chromosome
for use in mapview. Can be run for individual chromosomes if desired (default:
all chromosomes).
all chromosomes). It optionally also dumps SNPs into a file for generating the
AV search index.
=head1 LICENCE
......@@ -55,12 +58,14 @@ use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature;
use POSIX;
my ($species, $chr, $dry, $help);
my ($species, $chr, $dry, $avdump, $help);
&GetOptions(
"species=s" => \$species,
"chr=s" => \$chr,
"dry-run" => \$dry,
"n" => \$dry,
"avdump" => \$avdump,
"a" => \$avdump,
"help" => \$help,
"h" => \$help,
);
......@@ -71,18 +76,19 @@ if($help || !$species){
--species=Homo_sapiens
[--chr=6,13,14]
[--dry-run|-n]
[--avdump|-a]
[--help|-h]\n\n);
exit;
}
$ENV{'ENSEMBL_SPECIES'} = $species;
## set db user/pass to allow write access
# set db user/pass to allow write access
my $db_ref = $EnsWeb::species_defs->databases;
$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER;
$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS;
## connect to databases
# connect to databases
my $databases = &EnsEMBL::DB::Core::get_databases(qw(core glovar));
die "Problem connecting to databases: $databases->{'error'}\n"
......@@ -90,33 +96,33 @@ die "Problem connecting to databases: $databases->{'error'}\n"
warn "Database error: $databases->{'non_fatal_error'}\n"
if $databases->{'non_fatal_error'};
## get the adaptors needed
# get the adaptors needed
my $dfa = $databases->{'core'}->get_DensityFeatureAdaptor;
my $dta = $databases->{'core'}->get_DensityTypeAdaptor;
my $aa = $databases->{'core'}->get_AnalysisAdaptor;
my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor;
my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor;
## which chromosomes do we run?
# which chromosomes do we run?
my @top_slices;
if ($chr) {
## run chromosomes specified on commandline
# run chromosomes specified on commandline
foreach (split(",", $chr)) {
push @top_slices, $slice_adaptor->fetch_by_region("toplevel", $_);
}
} else {
## run all chromosomes for this species
# run all chromosomes for this species
@top_slices = @{$slice_adaptor->fetch_all("toplevel")};
}
## calculate block size (assuming 4000 blocks per genome)
# calculate block size (assuming 4000 blocks per genome)
my ( $block_size, $genome_size );
for my $slice ( @{$slice_adaptor->fetch_all("toplevel")} ) {
$genome_size += $slice->length;
}
$block_size = int( $genome_size / 4000 );
## analysis
# analysis
my $analysis = new Bio::EnsEMBL::Analysis (
-program => "glovar_snp_density.pl",
-database => "vega",
......@@ -125,20 +131,40 @@ my $analysis = new Bio::EnsEMBL::Analysis (
-logic_name => "snpDensity");
$aa->store( $analysis ) unless $dry;
## density type
# density type
my $dt = Bio::EnsEMBL::DensityType->new(
-analysis => $analysis,
-block_size => $block_size,
-value_type => 'sum');
$dta->store($dt) unless $dry;
## loop over chromosomes
# loop over chromosomes
my @chr;
foreach my $sl (@top_slices) {
push @chr, $sl->seq_region_name;
}
print STDERR "\nAvailable chromosomes: @chr\n";
# settings for AV index dump
use constant SNP_LINE => join("\t",
'Glovar SNP', '%s', '/%s/snpview?snp=%s&source=glovar', '%s',
"Single nucleotide polymorphism (SNP) %s [Alleles: %s]. Alternative IDs: %s.\n"
);
if ($avdump) {
print STDERR "Preparing directories for AV index dump...\n";
my $dumpdir = "$ENV{'ENSEMBL_SERVERROOT'}/utils/indexing/input";
unless (-e $dumpdir) {
mkdir $dumpdir, 0777 or die "Could not creat directory $dumpdir: $!\n";
}
unless (-e "$dumpdir/$species") {
mkdir "$dumpdir/$species", 0777 or die
"Could not creat directory $dumpdir/$species: $!\n";
}
open (AV, ">>$dumpdir/$species/SNP.txt") or die
"Could not open $dumpdir/$species/SNP.txt for writing: $!\n";
print STDERR "Done.\n";
}
my ($current_start, $current_end);
foreach my $slice (@top_slices) {
$current_start = 1;
......@@ -149,7 +175,7 @@ foreach my $slice (@top_slices) {
print STDERR "\nSNP densities for chr $chr with block size $block_size\n";
print STDERR "Start at " . `date`;
## loop over blocks
# loop over blocks
while($current_start <= $slice->end) {
$i++;
$current_end = $current_start+$block_size-1;
......@@ -166,12 +192,34 @@ foreach my $slice (@top_slices) {
$current_start = $current_end + 1;
next;
}
## only count snps that don't overlap slice start
## also, avoid duplicate counting
my %snps = map { "$_->name => 1" if ($_->start >= 1) } @{$snps};
# only count snps that don't overlap slice start
# also, avoid duplicate counting
my %snps = map { "$_->display_id => 1" if ($_->start >= 1) } @{$snps};
$count = scalar(keys %snps);
## density
# AV index dump
if ($avdump) {
foreach my $snpo (@{$snps}) {
next if ($snpo->start < 1);
my $snpid = $snpo->display_id;
my (@IDs, @desc);
foreach my $link ($snpo->each_DBLink) {
push @IDs, $link->primary_id;
push @desc, $link->database . ": " . $link->primary_id;
}
print AV sprintf SNP_LINE,
$snpid,
$species,
$snpid,
join(" ", @IDs),
$snpid,
$snpo->alleles,
join(", ", @desc)
;
}
}
# density
my $df = Bio::EnsEMBL::DensityFeature->new
(-seq_region => $slice,
-start => $current_start,
......@@ -182,12 +230,12 @@ foreach my $slice (@top_slices) {
$dfa->store($df) unless $dry;
$total += $count;
## logging
# logging
print STDERR "Chr: $chr | Bin: $i/$bins | Count: $count | ";
print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
}
## stats
# stats
print STDERR "Total for chr $chr: $total\n";
my $stat = Bio::EnsEMBL::Attribute->new
(-NAME => 'SNPs',
......@@ -196,6 +244,7 @@ foreach my $slice (@top_slices) {
-DESCRIPTION => 'Total Number of SNPs');
$attrib_adaptor->store_on_Slice($slice, [$stat]) unless $dry;
}
close AV if $avdump;
print STDERR "\nAll done at " . `date` . "\n";
......@@ -10,12 +10,13 @@ Wrapper for glovar_snp_density.pl
./glovar_snp_density.pl
--species=Homo_sapiens
[--dry-run|-n]
[--avdump|-a]
=head1 DESCRIPTION
Wrapper for glovar_snp_density.pl to run it chromosome by chromosome. This is
an attempt to avoid high memory footprints caused by a memory leak somerwhere
in the API ...
in the API. See glovar_snp_density.pl for more detailed documentation.
=head1 LICENCE
......@@ -49,17 +50,20 @@ use SiteDefs;
use EnsWeb;
use Getopt::Long;
my ($species, $dry);
my ($species, $dry, $avdump);
&GetOptions(
"species=s" => \$species,
"dry-run" => \$dry,
"n" => \$dry,
"avdump" => \$avdump,
"a" => \$avdump,
);
unless ($species) {
print qq(Usage:
./glovar_snp_density.pl
--species=Homo_sapiens
[--avdump|-a]
[--dry-run|-n]\n\n);
exit;
}
......@@ -69,8 +73,9 @@ $ENV{'ENSEMBL_SPECIES'} = $species;
## run glovar_snp_density.pl for each chromsome in this species
my $command = "./glovar_snp_density.pl --species=$species";
$command .= " -n" if ($dry);
$command .= " -a" if ($avdump);
foreach my $chr (@{$EnsWeb::species_defs->ENSEMBL_CHROMOSOMES}) {
warn "$command --chr=$chr\n";
system("$command --chr=$chr");
system("$command --chr=$chr") == 0 or die "$command --chr=$chr failed: $!\n";
}
......@@ -102,17 +102,33 @@ my $slice_adaptor = $db->get_SliceAdaptor;
my $top_slices = $slice_adaptor->fetch_all('toplevel');
## determine blocksize, assuming you want 150 blocks for the smallest
## chromosome
my ($block_count, $genome_size, $block_size);
my (@chr, $block_size, $min_chr);
## chromosome over 5Mb in size. Use an extra, smaller bin size for chromosomes smaller than 5Mb
my $big_chr = [];
my $small_chr = [];
my (@big_chr_names, $big_block_size, $min_big_chr);
my (@small_chr_names, $small_block_size, $min_small_chr);
for my $slice ( @$top_slices ) {
if (! $min_chr or ($min_chr > $slice->length)) {
$min_chr = $slice->length;
if ($slice->length < 5000000) {
if (! $min_small_chr or ($min_small_chr > $slice->length)) {
$min_small_chr = $slice->length;
}
push @chr, $slice->seq_region_name;
push @small_chr_names, $slice->seq_region_name;
push @{$small_chr->[0]}, $slice;
}
if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) {
$min_big_chr = $slice->length;
}
push @big_chr_names, $slice->seq_region_name;
push @{$big_chr->[0]}, $slice;
}
$block_size = int( $min_chr / 150 );
print STDERR "\nAvailable chromosomes: @chr\n";
$big_block_size = int( $min_big_chr / 150 );
push @{$big_chr}, $big_block_size;
$small_block_size = int( $min_small_chr / 150 );
push @{$small_chr}, $small_block_size;
print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n";
print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n";
## gene types
my %gene_types = (
......@@ -138,7 +154,9 @@ print STDERR "\n";
## create analysis and density type objects
my %dtcache;
foreach my $type (keys %gene_types) {
foreach my $block_size ($big_block_size,$small_block_size) {
eval {
foreach my $type (keys %gene_types) {
my $analysis = new Bio::EnsEMBL::Analysis (
-program => "vega_gene_density.pl",
-database => "ensembl",
......@@ -147,17 +165,22 @@ foreach my $type (keys %gene_types) {
-logic_name => $gene_types{$type});
$aa->store($analysis) unless $dry;
$analysis = $aa->fetch_by_logic_name($gene_types{$type});
my $dt = Bio::EnsEMBL::DensityType->new(-analysis => $analysis,
-block_size => $block_size,
-value_type => 'sum');
$dta->store($dt) unless $dry;
$dtcache{$gene_types{$type}} = $dt;
$dtcache{$block_size}{$gene_types{$type}} = $dt;
}
}
}
## loop over chromosomes
## loop over chromosomes, doing big ones then small ones
my ( $current_start, $current_end );
foreach my $slice (@$top_slices){
foreach my $object ($big_chr, $small_chr) {
eval {
my $block_size = $object->[1];
foreach my $slice (@{$object->[0]}){
$current_start = 1;
my $chr = $slice->seq_region_name;
my (%total, $i, %gene_names);
......@@ -201,7 +224,7 @@ foreach my $slice (@$top_slices){
(-seq_region => $slice,
-start => $current_start,
-end => $current_end,
-density_type => $dtcache{$type},
-density_type => $dtcache{$block_size}{$type},
-density_value => $num{$type} ||0
);
}
......@@ -271,12 +294,6 @@ foreach my $slice (@$top_slices){
+ $total{'Unprocessed_pseudogene' || 0},
-DESCRIPTION => 'Total Number of Pseudogenes');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '23:Unclassified pseudogenes',
-CODE => 'UnclassPsCount',
-VALUE => $total{'Pseudogene'} || 0,
-DESCRIPTION => 'Number of Unclassified pseudogenes');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '20:Processed pseudogenes',
-CODE => 'ProcPsCount',
......@@ -301,12 +318,22 @@ foreach my $slice (@$top_slices){
-VALUE => $total{'Novel_CDS_in_progress'} || 0,
-DESCRIPTION => 'Number of novel CDS in progress');
#only store unclassified pseudogenes if there are no processed and unprocessed pseudos, ie if
#total pseudos eq pseudos
unless ($total{'Unprocessed_pseudogene'} == 0 && $total{'Processed_pseudogene'} == 0) {
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '23:Unclassified pseudogenes',
-CODE => 'UnclassPsCount',
-VALUE => $total{'Pseudogene'} || 0,
-DESCRIPTION => 'Number of Unclassified pseudogenes');
}
$attrib_adaptor->store_on_Slice($slice, \@attribs) unless $dry;
print STDERR "Total for chr $chr:\n";
print STDERR map { "\t$_ => $total{$_}\n" } sort keys %total;
}
}
}
print STDERR "\nAll done at " . `date` . "\n";
#!/usr/bin/perl -w
#
# Calculate the GC content for top level seq_regions
# small regions 500bp to be able to display on contigview
# big regions genomesize / 4000 for 4000 features on the genome
use strict;
BEGIN {
$ENV{'ENSEMBL_SERVERROOT'} = "../../..";
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules");
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live");
}
use SiteDefs;
use EnsWeb;
use EnsEMBL::DB::Core;
use Getopt::Long;
use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature;
use POSIX;
use Data::Dumper;
my ($species, $dry, $help);
&GetOptions(
"species=s" => \$species,
"dry-run" => \$dry,
"n" => \$dry,
"help" => \$help,
"h" => \$help,
);
if($help || !$species){
print qq(Usage:
./vega_gene_density.pl
--species=Homo_sapiens
[--dry-run|-n]
[--help|-h]\n\n);
exit;
}
$ENV{'ENSEMBL_SPECIES'} = $species;
## set db user/pass to allow write access
my $db_ref = $EnsWeb::species_defs->databases;
$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER;
$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS;
## connect to databases
my $databases = &EnsEMBL::DB::Core::get_databases(qw(core));
my $db = $databases->{'core'};
die "Problem connecting to databases: $databases->{'error'}\n"
if $databases->{'error'} ;
warn "Database error: $databases->{'non_fatal_error'}\n"
if $databases->{'non_fatal_error'};
#
# Get the adaptors needed;
#
my $slice_adaptor = $db->get_SliceAdaptor();
my $dfa = $db->get_DensityFeatureAdaptor();
my $dta = $db->get_DensityTypeAdaptor();
my $aa = $db->get_AnalysisAdaptor();
my $top_slices = $slice_adaptor->fetch_all( "toplevel" );
my $big_chr = [];
my $small_chr = [];
my (@big_chr_names, $big_block_size, $min_big_chr);
my (@small_chr_names, $small_block_size, $min_small_chr);
for my $slice ( @$top_slices ) {
if ($slice->length < 5000000) {
if (! $min_small_chr or ($min_small_chr > $slice->length)) {
$min_small_chr = $slice->length;
}
push @small_chr_names, $slice->seq_region_name;
push @{$small_chr->[0]}, $slice;
}
if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) {
$min_big_chr = $slice->length;
}
push @big_chr_names, $slice->seq_region_name;
push @{$big_chr->[0]}, $slice;
}
$big_block_size = int( $min_big_chr / 150 );
$small_block_size = int( $min_small_chr / 150 );
push @{$big_chr}, $big_block_size;
push @{$small_chr}, $small_block_size;
print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n";
print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n";
#
# Create new analysis object for density calculation.
#
my $analysis = new Bio::EnsEMBL::Analysis (-program => "percent_gc_calc.pl",
-database => "ensembl",
-gff_source => "percent_gc_calc.pl",
-gff_feature => "density",
-logic_name => "PercentGC");
$aa->store($analysis) unless $dry;
#
# Create new density types
#
if ($small_block_size) {
my $small_density_type = Bio::EnsEMBL::DensityType->new
(-analysis => $analysis,
-block_size => $small_block_size,
-value_type => 'ratio');
$dta->store($small_density_type) unless $dry;
push @{$small_chr}, $small_density_type;
}
if ($big_block_size) {
my $big_density_type = Bio::EnsEMBL::DensityType->new
(-analysis => $analysis,
-block_size => $big_block_size,
-value_type => 'ratio');
$dta->store($big_density_type) unless $dry;
push @{$big_chr}, $big_density_type;
}
my ( $current_start, $current_end );
$Data::Dumper::Maxdepth = 2;
my $types = [];
foreach my $object ( $big_chr, $small_chr) {
eval {
my $block_size = $object->[1];
foreach my $slice (@{$object->[0]}){
$current_start = 1;
warn "Chromosome ",$slice->seq_region_name;
while($current_start <= $slice->end()) {
$current_end = $current_start+$block_size-1;
if( $current_end > $slice->end() ) {
$current_end = $slice->end();
}
my $sub_slice = $slice->sub_Slice( $current_start, $current_end );
warn "start = $current_start, end = $current_end\n";
my $gc = $sub_slice->get_base_count()->{'%gc'};
my $df = Bio::EnsEMBL::DensityFeature->new
(-seq_region => $slice,
-start => $current_start,
-end => $current_end,
-density_type => $object->[2],
-density_value => $gc);
$dfa->store($df) unless $dry;
$current_start = $current_end+1;
}
}
};
}
#
# helper to draw an ascii representation of the density features
#
sub print_features {
my $features = shift;
return if(!@$features);
my $sum = 0;
my $length = 0;
# my $type = $features->[0]->{'density_type'}->value_type();
print("\n");
my $max=0;
foreach my $f (@$features) {
if($f->density_value() > $max){
$max=$f->density_value();
}
}
foreach my $f (@$features) {
my $i=1;
for(; $i< ($f->density_value()/$max)*40; $i++){
print "*";
}
for(my $j=$i;$j<40;$j++){
print " ";
}
print " ".$f->density_value()."\t".$f->start()."\n";
}
}
#!/usr/bin/perl -w
#
# Calculate the repeat coverage for given database.
# condition: 1k blocks to show contigview displays
# 4000 blocks for a whole genome
#
# checks wether database contains repeats before doing anything
use strict;
BEGIN {
$ENV{'ENSEMBL_SERVERROOT'} = "../../..";
unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf");