Commit 3ec37fc5 authored by Patrick Meidl
Browse files

changes from branch-vega-23-dev

parent f6782625
......@@ -3,7 +3,8 @@
=head1 NAME
glovar_snp_density.pl -
script to calculate glovar SNP density and stats for Vega
Script to calculate glovar SNP density and stats (and optionally prepare AV
index dumps) for Vega.
=head1 SYNOPSIS
......@@ -11,13 +12,15 @@ script to calculate glovar SNP density and stats for Vega
--species=Homo_sapiens
[--chr=6,13,14]
[--dry-run|-n]
[--avdump|-a]
[--help|-h]
=head1 DESCRIPTION
This script calculates Glovar SNP densities and total numbers per chromosome
for use in mapview. Can be run for individual chromosomes if desired (default:
all chromosomes).
all chromosomes). It optionally also dumps SNPs into a file for generating the
AV search index.
=head1 LICENCE
......@@ -55,12 +58,14 @@ use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature;
use POSIX;
my ($species, $chr, $dry, $help);
my ($species, $chr, $dry, $avdump, $help);
&GetOptions(
"species=s" => \$species,
"chr=s" => \$chr,
"dry-run" => \$dry,
"n" => \$dry,
"avdump" => \$avdump,
"a" => \$avdump,
"help" => \$help,
"h" => \$help,
);
......@@ -71,18 +76,19 @@ if($help || !$species){
--species=Homo_sapiens
[--chr=6,13,14]
[--dry-run|-n]
[--avdump|-a]
[--help|-h]\n\n);
exit;
}
$ENV{'ENSEMBL_SPECIES'} = $species;
## set db user/pass to allow write access
# set db user/pass to allow write access
my $db_ref = $EnsWeb::species_defs->databases;
$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER;
$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS;
## connect to databases
# connect to databases
my $databases = &EnsEMBL::DB::Core::get_databases(qw(core glovar));
die "Problem connecting to databases: $databases->{'error'}\n"
......@@ -90,33 +96,33 @@ die "Problem connecting to databases: $databases->{'error'}\n"
warn "Database error: $databases->{'non_fatal_error'}\n"
if $databases->{'non_fatal_error'};
## get the adaptors needed
# get the adaptors needed
my $dfa = $databases->{'core'}->get_DensityFeatureAdaptor;
my $dta = $databases->{'core'}->get_DensityTypeAdaptor;
my $aa = $databases->{'core'}->get_AnalysisAdaptor;
my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor;
my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor;
## which chromosomes do we run?
# which chromosomes do we run?
my @top_slices;
if ($chr) {
## run chromosomes specified on commandline
# run chromosomes specified on commandline
foreach (split(",", $chr)) {
push @top_slices, $slice_adaptor->fetch_by_region("toplevel", $_);
}
} else {
## run all chromosomes for this species
# run all chromosomes for this species
@top_slices = @{$slice_adaptor->fetch_all("toplevel")};
}
## calculate block size (assuming 4000 blocks per genome)
# calculate block size (assuming 4000 blocks per genome)
my ( $block_size, $genome_size );
for my $slice ( @{$slice_adaptor->fetch_all("toplevel")} ) {
$genome_size += $slice->length;
}
$block_size = int( $genome_size / 4000 );
## analysis
# analysis
my $analysis = new Bio::EnsEMBL::Analysis (
-program => "glovar_snp_density.pl",
-database => "vega",
......@@ -125,20 +131,40 @@ my $analysis = new Bio::EnsEMBL::Analysis (
-logic_name => "snpDensity");
$aa->store( $analysis ) unless $dry;
## density type
# density type
my $dt = Bio::EnsEMBL::DensityType->new(
-analysis => $analysis,
-block_size => $block_size,
-value_type => 'sum');
$dta->store($dt) unless $dry;
## loop over chromosomes
# loop over chromosomes
my @chr;
foreach my $sl (@top_slices) {
push @chr, $sl->seq_region_name;
}
print STDERR "\nAvailable chromosomes: @chr\n";
# Settings for the AV index dump.
# SNP_LINE is a sprintf template for one tab-separated record: five fields
# carrying seven %s placeholders in total; the last field ends the record
# with a newline.
use constant SNP_LINE => join("\t",
    'Glovar SNP', '%s', '/%s/snpview?snp=%s&source=glovar', '%s',
    "Single nucleotide polymorphism (SNP) %s [Alleles: %s]. Alternative IDs: %s.\n"
);
if ($avdump) {
    print STDERR "Preparing directories for AV index dump...\n";
    my $dumpdir = "$ENV{'ENSEMBL_SERVERROOT'}/utils/indexing/input";
    # create the dump directory and its per-species subdirectory if missing
    foreach my $dir ($dumpdir, "$dumpdir/$species") {
        next if -e $dir;
        mkdir $dir, 0777 or die "Could not create directory $dir: $!\n";
    }
    # opened in append mode, so records from successive runs accumulate in
    # the same file; three-arg open keeps the path from being parsed as a mode
    open(AV, '>>', "$dumpdir/$species/SNP.txt") or die
        "Could not open $dumpdir/$species/SNP.txt for writing: $!\n";
    print STDERR "Done.\n";
}
my ($current_start, $current_end);
foreach my $slice (@top_slices) {
$current_start = 1;
......@@ -149,7 +175,7 @@ foreach my $slice (@top_slices) {
print STDERR "\nSNP densities for chr $chr with block size $block_size\n";
print STDERR "Start at " . `date`;
## loop over blocks
# loop over blocks
while($current_start <= $slice->end) {
$i++;
$current_end = $current_start+$block_size-1;
......@@ -166,12 +192,34 @@ foreach my $slice (@top_slices) {
$current_start = $current_end + 1;
next;
}
## only count snps that don't overlap slice start
## also, avoid duplicate counting
my %snps = map { "$_->name => 1" if ($_->start >= 1) } @{$snps};
# only count snps that don't overlap slice start
# also, avoid duplicate counting (the hash is keyed on the SNP display id)
# Note: a method call is not interpolated inside a double-quoted string, so
# the id has to be obtained with a real call; the start filter is done with
# grep so that map always returns a complete key/value pair.
my %snps = map  { ( $_->display_id => 1 ) }
           grep { $_->start >= 1 } @{$snps};
$count = scalar(keys %snps);
## density
# AV index dump
# When --avdump is set, write one SNP_LINE record to the AV filehandle for
# every SNP in this block.
if ($avdump) {
# iterates the raw SNP list (not the de-duplicated %snps hash used for the
# count), so a SNP that appears more than once in the list is written each time
foreach my $snpo (@{$snps}) {
# same filter as for the count: skip SNPs starting before the block start
next if ($snpo->start < 1);
my $snpid = $snpo->display_id;
# @IDs collects the primary ids of all DB links, @desc the same ids
# prefixed with their database name ("database: primary_id")
my (@IDs, @desc);
foreach my $link ($snpo->each_DBLink) {
push @IDs, $link->primary_id;
push @desc, $link->database . ": " . $link->primary_id;
}
# the seven arguments fill the seven %s placeholders of SNP_LINE in order
# NOTE(review): assumes $snpo->alleles yields exactly one value in list
# context; otherwise the placeholders would be misaligned -- confirm
print AV sprintf SNP_LINE,
$snpid,
$species,
$snpid,
join(" ", @IDs),
$snpid,
$snpo->alleles,
join(", ", @desc)
;
}
}
# density
my $df = Bio::EnsEMBL::DensityFeature->new
(-seq_region => $slice,
-start => $current_start,
......@@ -182,12 +230,12 @@ foreach my $slice (@top_slices) {
$dfa->store($df) unless $dry;
$total += $count;
## logging
# logging
print STDERR "Chr: $chr | Bin: $i/$bins | Count: $count | ";
print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
}
## stats
# stats
print STDERR "Total for chr $chr: $total\n";
my $stat = Bio::EnsEMBL::Attribute->new
(-NAME => 'SNPs',
......@@ -196,6 +244,7 @@ foreach my $slice (@top_slices) {
-DESCRIPTION => 'Total Number of SNPs');
$attrib_adaptor->store_on_Slice($slice, [$stat]) unless $dry;
}
close AV if $avdump;
print STDERR "\nAll done at " . `date` . "\n";
......@@ -10,12 +10,13 @@ Wrapper for glovar_snp_density.pl
./glovar_snp_density.pl
--species=Homo_sapiens
[--dry-run|-n]
[--avdump|-a]
=head1 DESCRIPTION
Wrapper for glovar_snp_density.pl to run it chromosome by chromosome. This is
an attempt to avoid high memory footprints caused by a memory leak somewhere
in the API ...
in the API. See glovar_snp_density.pl for more detailed documentation.
=head1 LICENCE
......@@ -49,17 +50,20 @@ use SiteDefs;
use EnsWeb;
use Getopt::Long;
my ($species, $dry);
my ($species, $dry, $avdump);
&GetOptions(
"species=s" => \$species,
"dry-run" => \$dry,
"n" => \$dry,
"avdump" => \$avdump,
"a" => \$avdump,
);
unless ($species) {
print qq(Usage:
./glovar_snp_density.pl
--species=Homo_sapiens
[--avdump|-a]
[--dry-run|-n]\n\n);
exit;
}
......@@ -69,8 +73,9 @@ $ENV{'ENSEMBL_SPECIES'} = $species;
## run glovar_snp_density.pl for each chromosome in this species
my $command = "./glovar_snp_density.pl --species=$species";
$command .= " -n" if ($dry);
$command .= " -a" if ($avdump);
foreach my $chr (@{$EnsWeb::species_defs->ENSEMBL_CHROMOSOMES}) {
warn "$command --chr=$chr\n";
system("$command --chr=$chr");
# Run the per-chromosome command and stop at the first failure.
# $! is only meaningful when the command could not be started ($? == -1);
# otherwise the child's exit status is in the high byte of $?.
system("$command --chr=$chr") == 0
    or die "$command --chr=$chr failed: "
        . ($? == -1 ? "could not execute: $!" : "exit status " . ($? >> 8))
        . "\n";
}
......@@ -102,17 +102,33 @@ my $slice_adaptor = $db->get_SliceAdaptor;
my $top_slices = $slice_adaptor->fetch_all('toplevel');
## determine blocksize, assuming you want 150 blocks for the smallest
## chromosome
my ($block_count, $genome_size, $block_size);
my (@chr, $block_size, $min_chr);
## chromosome over 5Mb in size. Use an extra, smaller bin size for chromosomes smaller than 5Mb
my $big_chr = [];
my $small_chr = [];
my (@big_chr_names, $big_block_size, $min_big_chr);
my (@small_chr_names, $small_block_size, $min_small_chr);
for my $slice ( @$top_slices ) {
if (! $min_chr or ($min_chr > $slice->length)) {
$min_chr = $slice->length;
if ($slice->length < 5000000) {
if (! $min_small_chr or ($min_small_chr > $slice->length)) {
$min_small_chr = $slice->length;
}
push @small_chr_names, $slice->seq_region_name;
push @{$small_chr->[0]}, $slice;
}
push @chr, $slice->seq_region_name;
# Track the length of the smallest chromosome longer than 5Mb.
# The length test has to guard both alternatives: with
# "!$min or ($min > len) && len > 5Mb" the && binds tighter than "or", so an
# unset minimum would be initialised from a chromosome of any size.
if ($slice->length > 5000000
    and (! $min_big_chr or $min_big_chr > $slice->length)) {
    $min_big_chr = $slice->length;
}
push @big_chr_names, $slice->seq_region_name;
push @{$big_chr->[0]}, $slice;
}
$block_size = int( $min_chr / 150 );
print STDERR "\nAvailable chromosomes: @chr\n";
$big_block_size = int( $min_big_chr / 150 );
push @{$big_chr}, $big_block_size;
$small_block_size = int( $min_small_chr / 150 );
push @{$small_chr}, $small_block_size;
print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n";
print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n";
## gene types
my %gene_types = (
......@@ -137,176 +153,187 @@ print STDERR join(" ", sort keys %gene_types);
print STDERR "\n";
## create analysis and density type objects
my %dtcache;
foreach my $type (keys %gene_types) {
my $analysis = new Bio::EnsEMBL::Analysis (
-program => "vega_gene_density.pl",
-database => "ensembl",
-gff_source => "vega_gene_density.pl",
-gff_feature => "density",
-logic_name => $gene_types{$type});
$aa->store($analysis) unless $dry;
$analysis = $aa->fetch_by_logic_name($gene_types{$type});
my $dt = Bio::EnsEMBL::DensityType->new(-analysis => $analysis,
-block_size => $block_size,
-value_type => 'sum');
$dta->store($dt) unless $dry;
$dtcache{$gene_types{$type}} = $dt;
my %dtcache;
# For each of the two block sizes, create (and, unless --dry-run, store) one
# analysis and one density type per gene type. The density types are cached
# by block size and logic name for use when the density features are built.
foreach my $block_size ($big_block_size, $small_block_size) {
    # best effort: a failure for one block size does not stop the other,
    # but it is reported instead of being silently discarded
    eval {
        foreach my $type (keys %gene_types) {
            my $analysis = Bio::EnsEMBL::Analysis->new(
                -program     => "vega_gene_density.pl",
                -database    => "ensembl",
                -gff_source  => "vega_gene_density.pl",
                -gff_feature => "density",
                -logic_name  => $gene_types{$type});
            $aa->store($analysis) unless $dry;
            # re-fetch the analysis by logic name before attaching it to
            # the density type
            $analysis = $aa->fetch_by_logic_name($gene_types{$type});
            my $dt = Bio::EnsEMBL::DensityType->new(-analysis   => $analysis,
                                                    -block_size => $block_size,
                                                    -value_type => 'sum');
            $dta->store($dt) unless $dry;
            $dtcache{$block_size}{$gene_types{$type}} = $dt;
        }
    };
    warn "Could not set up density types for block size $block_size: $@" if $@;
}
## loop over chromosomes
## loop over chromosomes, doing big ones then small ones
my ( $current_start, $current_end );
foreach my $slice (@$top_slices){
$current_start = 1;
my $chr = $slice->seq_region_name;
my (%total, $i, %gene_names);
my $bins = POSIX::ceil($slice->end / $block_size);
print STDERR "\nGene densities for chr $chr with block size $block_size\n";
print STDERR "Start at " . `date`;
## loop over blocks
my @density_features;
while($current_start <= $slice->end) {
$i++;
$current_end = $current_start+$block_size-1;
if( $current_end > $slice->end ) {
$current_end = $slice->end;
}
my $sub_slice = $slice->sub_Slice( $current_start, $current_end );
my %num = ();
## count genes by type
my $genes;
eval { $genes = $sub_slice->get_all_Genes; };
if ($@) {
warn $@;
$current_start = $current_end + 1;
next;
}
foreach my $gene (@{$genes}) {
## only count genes that don't overlap the subslice start
## (since these were already counted in the last bin)
if ($gene->start >= 1) {
$total{$gene->type}++;
}
$num{$gene_types{$gene->type}}++;
}
## create DensityFeature objects for each type
foreach my $type (keys %density_types) {
push @density_features, Bio::EnsEMBL::DensityFeature->new
(-seq_region => $slice,
-start => $current_start,
-end => $current_end,
-density_type => $dtcache{$type},
-density_value => $num{$type} ||0
);
}
$current_start = $current_end + 1;
## logging
print STDERR "Chr: $chr | Bin: $i/$bins | Counts: ";
print STDERR join(",", map { $num{$gene_types{$_}} || 0 }
sort keys %gene_types);
print STDERR " | ";
print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
foreach my $object ($big_chr, $small_chr) {
eval {
my $block_size = $object->[1];
foreach my $slice (@{$object->[0]}){
$current_start = 1;
my $chr = $slice->seq_region_name;
my (%total, $i, %gene_names);
my $bins = POSIX::ceil($slice->end / $block_size);
print STDERR "\nGene densities for chr $chr with block size $block_size\n";
print STDERR "Start at " . `date`;
## loop over blocks
my @density_features;
while($current_start <= $slice->end) {
$i++;
$current_end = $current_start+$block_size-1;
if( $current_end > $slice->end ) {
$current_end = $slice->end;
}
my $sub_slice = $slice->sub_Slice( $current_start, $current_end );
my %num = ();
## count genes by type
my $genes;
eval { $genes = $sub_slice->get_all_Genes; };
if ($@) {
warn $@;
$current_start = $current_end + 1;
next;
}
foreach my $gene (@{$genes}) {
## only count genes that don't overlap the subslice start
## (since these were already counted in the last bin)
if ($gene->start >= 1) {
$total{$gene->type}++;
}
$num{$gene_types{$gene->type}}++;
}
## create DensityFeature objects for each type
foreach my $type (keys %density_types) {
push @density_features, Bio::EnsEMBL::DensityFeature->new
(-seq_region => $slice,
-start => $current_start,
-end => $current_end,
-density_type => $dtcache{$block_size}{$type},
-density_value => $num{$type} ||0
);
}
$current_start = $current_end + 1;
## logging
print STDERR "Chr: $chr | Bin: $i/$bins | Counts: ";
print STDERR join(",", map { $num{$gene_types{$_}} || 0 }
sort keys %gene_types);
print STDERR " | ";
print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
}
## store DensityFeatures for the chromosome
$dfa->store(@density_features) unless $dry;
## stats
my @attribs;
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '12:Known genes',
-CODE => 'KnownGeneCount',
-VALUE => $total{'Known'} || 0,
-DESCRIPTION => 'Total Number of Known genes');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '14:Novel CDS',
-CODE => 'NovelCDSCount',
-VALUE => $total{'Novel_CDS'} || 0,
-DESCRIPTION => 'Total Number of Novel CDSs');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '16:Novel transcripts',
-CODE => 'NovelTransCount',
-VALUE => $total{'Novel_Transcript'} || 0,
-DESCRIPTION => 'Total Number of Novel transcripts');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '24:Putative transcripts',
-CODE => 'PutTransCount',
-VALUE => $total{'Putative'} || 0,
-DESCRIPTION => 'Total Number of Putative transcripts');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '30:Predicted transcripts',
-CODE => 'PredTransCount',
-VALUE => $total{'Predicted_Gene'} || 0,
-DESCRIPTION => 'Total Number of Predicted transcripts');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '26:Ig segments',
-CODE => 'IgSegCount',
-VALUE => $total{'Ig_Segment'} || 0,
-DESCRIPTION => 'Total Number of Ig Segments');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '28:Ig pseudogene Segments',
-CODE => 'IgPsSegCount',
-VALUE => $total{'Ig_Pseudogene_Segment'} || 0,
-DESCRIPTION => 'Total Number of Ig Pseudogene Segments');
# Total pseudogenes = unclassified + processed + unprocessed. Each count
# defaults to 0 when no gene of that type was seen on this chromosome (the
# "|| 0" belongs to each term, not inside the hash subscript).
push @attribs, Bio::EnsEMBL::Attribute->new
    (-NAME => '18:Total pseudogenes',
     -CODE => 'TotPsCount',
     -VALUE => ($total{'Pseudogene'} || 0)
             + ($total{'Processed_pseudogene'} || 0)
             + ($total{'Unprocessed_pseudogene'} || 0),
     -DESCRIPTION => 'Total Number of Pseudogenes');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '20:Processed pseudogenes',
-CODE => 'ProcPsCount',
-VALUE => $total{'Processed_pseudogene'} || 0,
-DESCRIPTION => 'Number of Processed pseudogenes');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '22:Unprocessed pseudogenes',
-CODE => 'UnprocPsCount',
-VALUE => $total{'Unprocessed_pseudogene'} || 0,
-DESCRIPTION => 'Number of Unprocessed pseudogenes');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '13:Known genes (in progress)',
-CODE => 'KnwnprogCount',
-VALUE => $total{'Known_in_progress'} || 0,
-DESCRIPTION => 'Number of Known Genes in progress');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '15:Novel CDS (in progress)',
-CODE => 'NovCDSprogCount',
-VALUE => $total{'Novel_CDS_in_progress'} || 0,
-DESCRIPTION => 'Number of novel CDS in progress');
#only store unclassified pseudogenes if there are no processed and unprocessed pseudos, ie if
#total pseudos eq pseudos
# NOTE(review): the condition below does the opposite of what the comment
# above says -- the attribute is added unless BOTH the processed and the
# unprocessed counts are zero, i.e. whenever at least one of them is
# non-zero. Confirm which of the two is intended.
# Counts that were never incremented are undef here and compare equal to 0.
unless ($total{'Unprocessed_pseudogene'} == 0 && $total{'Processed_pseudogene'} == 0) {
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '23:Unclassified pseudogenes',
-CODE => 'UnclassPsCount',
-VALUE => $total{'Pseudogene'} || 0,
-DESCRIPTION => 'Number of Unclassified pseudogenes');
}
$attrib_adaptor->store_on_Slice($slice, \@attribs) unless $dry;
print STDERR "Total for chr $chr:\n";
print STDERR map { "\t$_ => $total{$_}\n" } sort keys %total;
}
}
## store DensityFeatures for the chromosome
$dfa->store(@density_features) unless $dry;
## stats
my @attribs;
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '12:Known genes',
-CODE => 'KnownGeneCount',
-VALUE => $total{'Known'} || 0,
-DESCRIPTION => 'Total Number of Known genes');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '14:Novel CDS',
-CODE => 'NovelCDSCount',
-VALUE => $total{'Novel_CDS'} || 0,
-DESCRIPTION => 'Total Number of Novel CDSs');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '16:Novel transcripts',
-CODE => 'NovelTransCount',
-VALUE => $total{'Novel_Transcript'} || 0,
-DESCRIPTION => 'Total Number of Novel transcripts');
push @attribs, Bio::EnsEMBL::Attribute->new
(-NAME => '24:Putative transcripts',
-CODE => 'PutTransCount',
-VALUE => $total{'Putative'} || 0,
-DESCRIPTION => 'Total Number of Putative transcripts');