Commit 3ec37fc5 authored by Patrick Meidl's avatar Patrick Meidl
Browse files

changes from branch-vega-23-dev

parent f6782625
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
=head1 NAME =head1 NAME
glovar_snp_density.pl - glovar_snp_density.pl -
script to calculate glovar SNP density and stats for Vega Script to calculate glovar SNP density and stats (and optioanlly prepare AV
index dumps) for Vega.
=head1 SYNOPSIS =head1 SYNOPSIS
...@@ -11,13 +12,15 @@ script to calculate glovar SNP density and stats for Vega ...@@ -11,13 +12,15 @@ script to calculate glovar SNP density and stats for Vega
--species=Homo_sapiens --species=Homo_sapiens
[--chr=6,13,14] [--chr=6,13,14]
[--dry-run|-n] [--dry-run|-n]
[--avdump|-a]
[--help|-h] [--help|-h]
=head1 DESCRIPTION =head1 DESCRIPTION
This script calculates Glovar SNP densities and total numbers per chromosome This script calculates Glovar SNP densities and total numbers per chromosome
for use in mapview. Can be run for individual chromosomes if desired (default: for use in mapview. Can be run for individual chromosomes if desired (default:
all chromosomes). all chromosomes). It optionally also dumps SNPs into a file for generating the
AV search index.
=head1 LICENCE =head1 LICENCE
...@@ -55,12 +58,14 @@ use Bio::EnsEMBL::DensityType; ...@@ -55,12 +58,14 @@ use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature; use Bio::EnsEMBL::DensityFeature;
use POSIX; use POSIX;
my ($species, $chr, $dry, $help); my ($species, $chr, $dry, $avdump, $help);
&GetOptions( &GetOptions(
"species=s" => \$species, "species=s" => \$species,
"chr=s" => \$chr, "chr=s" => \$chr,
"dry-run" => \$dry, "dry-run" => \$dry,
"n" => \$dry, "n" => \$dry,
"avdump" => \$avdump,
"a" => \$avdump,
"help" => \$help, "help" => \$help,
"h" => \$help, "h" => \$help,
); );
...@@ -71,18 +76,19 @@ if($help || !$species){ ...@@ -71,18 +76,19 @@ if($help || !$species){
--species=Homo_sapiens --species=Homo_sapiens
[--chr=6,13,14] [--chr=6,13,14]
[--dry-run|-n] [--dry-run|-n]
[--avdump|-a]
[--help|-h]\n\n); [--help|-h]\n\n);
exit; exit;
} }
$ENV{'ENSEMBL_SPECIES'} = $species; $ENV{'ENSEMBL_SPECIES'} = $species;
## set db user/pass to allow write access # set db user/pass to allow write access
my $db_ref = $EnsWeb::species_defs->databases; my $db_ref = $EnsWeb::species_defs->databases;
$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER; $db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER;
$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS; $db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS;
## connect to databases # connect to databases
my $databases = &EnsEMBL::DB::Core::get_databases(qw(core glovar)); my $databases = &EnsEMBL::DB::Core::get_databases(qw(core glovar));
die "Problem connecting to databases: $databases->{'error'}\n" die "Problem connecting to databases: $databases->{'error'}\n"
...@@ -90,33 +96,33 @@ die "Problem connecting to databases: $databases->{'error'}\n" ...@@ -90,33 +96,33 @@ die "Problem connecting to databases: $databases->{'error'}\n"
warn "Database error: $databases->{'non_fatal_error'}\n" warn "Database error: $databases->{'non_fatal_error'}\n"
if $databases->{'non_fatal_error'}; if $databases->{'non_fatal_error'};
## get the adaptors needed # get the adaptors needed
my $dfa = $databases->{'core'}->get_DensityFeatureAdaptor; my $dfa = $databases->{'core'}->get_DensityFeatureAdaptor;
my $dta = $databases->{'core'}->get_DensityTypeAdaptor; my $dta = $databases->{'core'}->get_DensityTypeAdaptor;
my $aa = $databases->{'core'}->get_AnalysisAdaptor; my $aa = $databases->{'core'}->get_AnalysisAdaptor;
my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor; my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor;
my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor; my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor;
## which chromosomes do we run? # which chromosomes do we run?
my @top_slices; my @top_slices;
if ($chr) { if ($chr) {
## run chromosomes specified on commandline # run chromosomes specified on commandline
foreach (split(",", $chr)) { foreach (split(",", $chr)) {
push @top_slices, $slice_adaptor->fetch_by_region("toplevel", $_); push @top_slices, $slice_adaptor->fetch_by_region("toplevel", $_);
} }
} else { } else {
## run all chromosomes for this species # run all chromosomes for this species
@top_slices = @{$slice_adaptor->fetch_all("toplevel")}; @top_slices = @{$slice_adaptor->fetch_all("toplevel")};
} }
## calculate block size (assuming 4000 blocks per genome) # calculate block size (assuming 4000 blocks per genome)
my ( $block_size, $genome_size ); my ( $block_size, $genome_size );
for my $slice ( @{$slice_adaptor->fetch_all("toplevel")} ) { for my $slice ( @{$slice_adaptor->fetch_all("toplevel")} ) {
$genome_size += $slice->length; $genome_size += $slice->length;
} }
$block_size = int( $genome_size / 4000 ); $block_size = int( $genome_size / 4000 );
## analysis # analysis
my $analysis = new Bio::EnsEMBL::Analysis ( my $analysis = new Bio::EnsEMBL::Analysis (
-program => "glovar_snp_density.pl", -program => "glovar_snp_density.pl",
-database => "vega", -database => "vega",
...@@ -125,20 +131,40 @@ my $analysis = new Bio::EnsEMBL::Analysis ( ...@@ -125,20 +131,40 @@ my $analysis = new Bio::EnsEMBL::Analysis (
-logic_name => "snpDensity"); -logic_name => "snpDensity");
$aa->store( $analysis ) unless $dry; $aa->store( $analysis ) unless $dry;
## density type # density type
my $dt = Bio::EnsEMBL::DensityType->new( my $dt = Bio::EnsEMBL::DensityType->new(
-analysis => $analysis, -analysis => $analysis,
-block_size => $block_size, -block_size => $block_size,
-value_type => 'sum'); -value_type => 'sum');
$dta->store($dt) unless $dry; $dta->store($dt) unless $dry;
## loop over chromosomes # loop over chromosomes
my @chr; my @chr;
foreach my $sl (@top_slices) { foreach my $sl (@top_slices) {
push @chr, $sl->seq_region_name; push @chr, $sl->seq_region_name;
} }
print STDERR "\nAvailable chromosomes: @chr\n"; print STDERR "\nAvailable chromosomes: @chr\n";
# settings for AV index dump
use constant SNP_LINE => join("\t",
'Glovar SNP', '%s', '/%s/snpview?snp=%s&source=glovar', '%s',
"Single nucleotide polymorphism (SNP) %s [Alleles: %s]. Alternative IDs: %s.\n"
);
if ($avdump) {
print STDERR "Preparing directories for AV index dump...\n";
my $dumpdir = "$ENV{'ENSEMBL_SERVERROOT'}/utils/indexing/input";
unless (-e $dumpdir) {
mkdir $dumpdir, 0777 or die "Could not creat directory $dumpdir: $!\n";
}
unless (-e "$dumpdir/$species") {
mkdir "$dumpdir/$species", 0777 or die
"Could not creat directory $dumpdir/$species: $!\n";
}
open (AV, ">>$dumpdir/$species/SNP.txt") or die
"Could not open $dumpdir/$species/SNP.txt for writing: $!\n";
print STDERR "Done.\n";
}
my ($current_start, $current_end); my ($current_start, $current_end);
foreach my $slice (@top_slices) { foreach my $slice (@top_slices) {
$current_start = 1; $current_start = 1;
...@@ -149,7 +175,7 @@ foreach my $slice (@top_slices) { ...@@ -149,7 +175,7 @@ foreach my $slice (@top_slices) {
print STDERR "\nSNP densities for chr $chr with block size $block_size\n"; print STDERR "\nSNP densities for chr $chr with block size $block_size\n";
print STDERR "Start at " . `date`; print STDERR "Start at " . `date`;
## loop over blocks # loop over blocks
while($current_start <= $slice->end) { while($current_start <= $slice->end) {
$i++; $i++;
$current_end = $current_start+$block_size-1; $current_end = $current_start+$block_size-1;
...@@ -166,12 +192,34 @@ foreach my $slice (@top_slices) { ...@@ -166,12 +192,34 @@ foreach my $slice (@top_slices) {
$current_start = $current_end + 1; $current_start = $current_end + 1;
next; next;
} }
## only count snps that don't overlap slice start # only count snps that don't overlap slice start
## also, avoid duplicate counting # also, avoid duplicate counting
my %snps = map { "$_->name => 1" if ($_->start >= 1) } @{$snps}; my %snps = map { "$_->display_id => 1" if ($_->start >= 1) } @{$snps};
$count = scalar(keys %snps); $count = scalar(keys %snps);
## density # AV index dump
if ($avdump) {
foreach my $snpo (@{$snps}) {
next if ($snpo->start < 1);
my $snpid = $snpo->display_id;
my (@IDs, @desc);
foreach my $link ($snpo->each_DBLink) {
push @IDs, $link->primary_id;
push @desc, $link->database . ": " . $link->primary_id;
}
print AV sprintf SNP_LINE,
$snpid,
$species,
$snpid,
join(" ", @IDs),
$snpid,
$snpo->alleles,
join(", ", @desc)
;
}
}
# density
my $df = Bio::EnsEMBL::DensityFeature->new my $df = Bio::EnsEMBL::DensityFeature->new
(-seq_region => $slice, (-seq_region => $slice,
-start => $current_start, -start => $current_start,
...@@ -182,12 +230,12 @@ foreach my $slice (@top_slices) { ...@@ -182,12 +230,12 @@ foreach my $slice (@top_slices) {
$dfa->store($df) unless $dry; $dfa->store($df) unless $dry;
$total += $count; $total += $count;
## logging # logging
print STDERR "Chr: $chr | Bin: $i/$bins | Count: $count | "; print STDERR "Chr: $chr | Bin: $i/$bins | Count: $count | ";
print STDERR "Mem: " . `ps $$ -o vsz |tail -1`; print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
} }
## stats # stats
print STDERR "Total for chr $chr: $total\n"; print STDERR "Total for chr $chr: $total\n";
my $stat = Bio::EnsEMBL::Attribute->new my $stat = Bio::EnsEMBL::Attribute->new
(-NAME => 'SNPs', (-NAME => 'SNPs',
...@@ -196,6 +244,7 @@ foreach my $slice (@top_slices) { ...@@ -196,6 +244,7 @@ foreach my $slice (@top_slices) {
-DESCRIPTION => 'Total Number of SNPs'); -DESCRIPTION => 'Total Number of SNPs');
$attrib_adaptor->store_on_Slice($slice, [$stat]) unless $dry; $attrib_adaptor->store_on_Slice($slice, [$stat]) unless $dry;
} }
close AV if $avdump;
print STDERR "\nAll done at " . `date` . "\n"; print STDERR "\nAll done at " . `date` . "\n";
...@@ -10,12 +10,13 @@ Wrapper for glovar_snp_density.pl ...@@ -10,12 +10,13 @@ Wrapper for glovar_snp_density.pl
./glovar_snp_density.pl ./glovar_snp_density.pl
--species=Homo_sapiens --species=Homo_sapiens
[--dry-run|-n] [--dry-run|-n]
[--avdump|-a]
=head1 DESCRIPTION =head1 DESCRIPTION
Wrapper for glovar_snp_density.pl to run it chromosome by chromosome. This is Wrapper for glovar_snp_density.pl to run it chromosome by chromosome. This is
an attempt to avoid high memory footprints caused by a memory leak somerwhere an attempt to avoid high memory footprints caused by a memory leak somerwhere
in the API ... in the API. See glovar_snp_density.pl for more detailed documentation.
=head1 LICENCE =head1 LICENCE
...@@ -49,17 +50,20 @@ use SiteDefs; ...@@ -49,17 +50,20 @@ use SiteDefs;
use EnsWeb; use EnsWeb;
use Getopt::Long; use Getopt::Long;
my ($species, $dry); my ($species, $dry, $avdump);
&GetOptions( &GetOptions(
"species=s" => \$species, "species=s" => \$species,
"dry-run" => \$dry, "dry-run" => \$dry,
"n" => \$dry, "n" => \$dry,
"avdump" => \$avdump,
"a" => \$avdump,
); );
unless ($species) { unless ($species) {
print qq(Usage: print qq(Usage:
./glovar_snp_density.pl ./glovar_snp_density.pl
--species=Homo_sapiens --species=Homo_sapiens
[--avdump|-a]
[--dry-run|-n]\n\n); [--dry-run|-n]\n\n);
exit; exit;
} }
...@@ -69,8 +73,9 @@ $ENV{'ENSEMBL_SPECIES'} = $species; ...@@ -69,8 +73,9 @@ $ENV{'ENSEMBL_SPECIES'} = $species;
## run glovar_snp_density.pl for each chromsome in this species ## run glovar_snp_density.pl for each chromsome in this species
my $command = "./glovar_snp_density.pl --species=$species"; my $command = "./glovar_snp_density.pl --species=$species";
$command .= " -n" if ($dry); $command .= " -n" if ($dry);
$command .= " -a" if ($avdump);
foreach my $chr (@{$EnsWeb::species_defs->ENSEMBL_CHROMOSOMES}) { foreach my $chr (@{$EnsWeb::species_defs->ENSEMBL_CHROMOSOMES}) {
warn "$command --chr=$chr\n"; warn "$command --chr=$chr\n";
system("$command --chr=$chr"); system("$command --chr=$chr") == 0 or die "$command --chr=$chr failed: $!\n";
} }
...@@ -102,17 +102,33 @@ my $slice_adaptor = $db->get_SliceAdaptor; ...@@ -102,17 +102,33 @@ my $slice_adaptor = $db->get_SliceAdaptor;
my $top_slices = $slice_adaptor->fetch_all('toplevel'); my $top_slices = $slice_adaptor->fetch_all('toplevel');
## determine blocksize, assuming you want 150 blocks for the smallest ## determine blocksize, assuming you want 150 blocks for the smallest
## chromosome ## chromosome over 5Mb in size. Use an extra, smaller bin size for chromosomes smaller than 5Mb
my ($block_count, $genome_size, $block_size); my $big_chr = [];
my (@chr, $block_size, $min_chr); my $small_chr = [];
my (@big_chr_names, $big_block_size, $min_big_chr);
my (@small_chr_names, $small_block_size, $min_small_chr);
for my $slice ( @$top_slices ) { for my $slice ( @$top_slices ) {
if (! $min_chr or ($min_chr > $slice->length)) { if ($slice->length < 5000000) {
$min_chr = $slice->length; if (! $min_small_chr or ($min_small_chr > $slice->length)) {
$min_small_chr = $slice->length;
}
push @small_chr_names, $slice->seq_region_name;
push @{$small_chr->[0]}, $slice;
} }
push @chr, $slice->seq_region_name; if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) {
$min_big_chr = $slice->length;
}
push @big_chr_names, $slice->seq_region_name;
push @{$big_chr->[0]}, $slice;
} }
$block_size = int( $min_chr / 150 );
print STDERR "\nAvailable chromosomes: @chr\n"; $big_block_size = int( $min_big_chr / 150 );
push @{$big_chr}, $big_block_size;
$small_block_size = int( $min_small_chr / 150 );
push @{$small_chr}, $small_block_size;
print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n";
print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n";
## gene types ## gene types
my %gene_types = ( my %gene_types = (
...@@ -137,176 +153,187 @@ print STDERR join(" ", sort keys %gene_types); ...@@ -137,176 +153,187 @@ print STDERR join(" ", sort keys %gene_types);
print STDERR "\n"; print STDERR "\n";
## create analysis and density type objects ## create analysis and density type objects
my %dtcache; my %dtcache;
foreach my $type (keys %gene_types) { foreach my $block_size ($big_block_size,$small_block_size) {
my $analysis = new Bio::EnsEMBL::Analysis ( eval {
-program => "vega_gene_density.pl", foreach my $type (keys %gene_types) {
-database => "ensembl", my $analysis = new Bio::EnsEMBL::Analysis (
-gff_source => "vega_gene_density.pl", -program => "vega_gene_density.pl",
-gff_feature => "density", -database => "ensembl",
-logic_name => $gene_types{$type}); -gff_source => "vega_gene_density.pl",
$aa->store($analysis) unless $dry; -gff_feature => "density",
$analysis = $aa->fetch_by_logic_name($gene_types{$type}); -logic_name => $gene_types{$type});
$aa->store($analysis) unless $dry;
my $dt = Bio::EnsEMBL::DensityType->new(-analysis => $analysis, $analysis = $aa->fetch_by_logic_name($gene_types{$type});
-block_size => $block_size, my $dt = Bio::EnsEMBL::DensityType->new(-analysis => $analysis,
-value_type => 'sum'); -block_size => $block_size,
$dta->store($dt) unless $dry; -value_type => 'sum');
$dtcache{$gene_types{$type}} = $dt; $dta->store($dt) unless $dry;
$dtcache{$block_size}{$gene_types{$type}} = $dt;
}
}
} }
## loop over chromosomes
## loop over chromosomes, doing big ones then small ones
my ( $current_start, $current_end ); my ( $current_start, $current_end );
foreach my $slice (@$top_slices){ foreach my $object ($big_chr, $small_chr) {
$current_start = 1; eval {
my $chr = $slice->seq_region_name; my $block_size = $object->[1];
my (%total, $i, %gene_names); foreach my $slice (@{$object->[0]}){
my $bins = POSIX::ceil($slice->end / $block_size); $current_start = 1;
my $chr = $slice->seq_region_name;
print STDERR "\nGene densities for chr $chr with block size $block_size\n"; my (%total, $i, %gene_names);
print STDERR "Start at " . `date`; my $bins = POSIX::ceil($slice->end / $block_size);
## loop over blocks print STDERR "\nGene densities for chr $chr with block size $block_size\n";
my @density_features; print STDERR "Start at " . `date`;
while($current_start <= $slice->end) {
$i++; ## loop over blocks
$current_end = $current_start+$block_size-1; my @density_features;
if( $current_end > $slice->end ) { while($current_start <= $slice->end) {
$current_end = $slice->end; $i++;
} $current_end = $current_start+$block_size-1;
my $sub_slice = $slice->sub_Slice( $current_start, $current_end ); if( $current_end > $slice->end ) {
my %num = (); $current_end = $slice->end;
}
## count genes by type my $sub_slice = $slice->sub_Slice( $current_start, $current_end );
my $genes; my %num = ();
eval { $genes = $sub_slice->get_all_Genes; };
if ($@) { ## count genes by type
warn $@; my $genes;
$current_start = $current_end + 1; eval { $genes = $sub_slice->get_all_Genes; };
next; if ($@) {
} warn $@;
foreach my $gene (@{$genes}) { $current_start = $current_end + 1;
## only count genes that don't overlap the subslice start next;
## (since these were already counted in the last bin) }
if ($gene->start >= 1) { foreach my $gene (@{$genes}) {
$total{$gene->type}++; ## only count genes that don't overlap the subslice start
} ## (since these were already counted in the last bin)
$num{$gene_types{$gene->type}}++; if ($gene->start >= 1) {
} $total{$gene->type}++;
}
## create DensityFeature objects for each type $num{$gene_types{$gene->type}}++;
foreach my $type (keys %density_types) { }
push @density_features, Bio::EnsEMBL::DensityFeature->new ## create DensityFeature objects for each type
(-seq_region => $slice, foreach my $type (keys %density_types) {
-start => $current_start,
-end => $current_end, push @density_features, Bio::EnsEMBL::DensityFeature->new
-density_type => $dtcache{$type}, (-seq_region => $slice,
-density_value => $num{$type} ||0 -start => $current_start,
); -end => $current_end,
} -density_type => $dtcache{$block_size}{$type},
$current_start = $current_end + 1; -density_value => $num{$type} ||0
);
## logging }
print STDERR "Chr: $chr | Bin: $i/$bins | Counts: "; $current_start = $current_end + 1;
print STDERR join(",", map { $num{$gene_types{$_}} || 0 }
sort keys %gene_types); ## logging
print STDERR " | "; print STDERR "Chr: $chr | Bin: $i/$bins | Counts: ";
print STDERR "Mem: " . `ps $$ -o vsz |tail -1`; print STDERR join(",", map { $num{$gene_types{$_}} || 0 }
sort keys %gene_types);
print STDERR " | ";
print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
}
## store DensityFeatures for the chromosome
$dfa->store(@density_features) unless $dry;
## stats
my @attribs;
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '12:Known genes',
-CODE => 'KnownGeneCount',