diff --git a/misc-scripts/density_feature/glovar_snp_density.pl b/misc-scripts/density_feature/glovar_snp_density.pl index c208d5a02ee973d89c686d180cdfa1cc2e14e386..6b64d9ed472f13192de78df6d8990e304c4c332f 100755 --- a/misc-scripts/density_feature/glovar_snp_density.pl +++ b/misc-scripts/density_feature/glovar_snp_density.pl @@ -3,7 +3,8 @@ =head1 NAME glovar_snp_density.pl - -script to calculate glovar SNP density and stats for Vega +Script to calculate glovar SNP density and stats (and optioanlly prepare AV +index dumps) for Vega. =head1 SYNOPSIS @@ -11,13 +12,15 @@ script to calculate glovar SNP density and stats for Vega --species=Homo_sapiens [--chr=6,13,14] [--dry-run|-n] + [--avdump|-a] [--help|-h] =head1 DESCRIPTION This script calculates Glovar SNP densities and total numbers per chromosome for use in mapview. Can be run for individual chromosomes if desired (default: -all chromosomes). +all chromosomes). It optionally also dumps SNPs into a file for generating the +AV search index. =head1 LICENCE @@ -55,12 +58,14 @@ use Bio::EnsEMBL::DensityType; use Bio::EnsEMBL::DensityFeature; use POSIX; -my ($species, $chr, $dry, $help); +my ($species, $chr, $dry, $avdump, $help); &GetOptions( "species=s" => \$species, "chr=s" => \$chr, "dry-run" => \$dry, "n" => \$dry, + "avdump" => \$avdump, + "a" => \$avdump, "help" => \$help, "h" => \$help, ); @@ -71,18 +76,19 @@ if($help || !$species){ --species=Homo_sapiens [--chr=6,13,14] [--dry-run|-n] + [--avdump|-a] [--help|-h]\n\n); exit; } $ENV{'ENSEMBL_SPECIES'} = $species; -## set db user/pass to allow write access +# set db user/pass to allow write access my $db_ref = $EnsWeb::species_defs->databases; $db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER; $db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS; -## connect to databases +# connect to databases my $databases = &EnsEMBL::DB::Core::get_databases(qw(core glovar)); die "Problem connecting to databases: $databases->{'error'}\n" @@ -90,33 +96,33 @@ die "Problem connecting to databases: $databases->{'error'}\n" warn "Database error: $databases->{'non_fatal_error'}\n" if $databases->{'non_fatal_error'}; -## get the adaptors needed +# get the adaptors needed my $dfa = $databases->{'core'}->get_DensityFeatureAdaptor; my $dta = $databases->{'core'}->get_DensityTypeAdaptor; my $aa = $databases->{'core'}->get_AnalysisAdaptor; my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor; my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor; -## which chromosomes do we run? +# which chromosomes do we run? my @top_slices; if ($chr) { - ## run chromosomes specified on commandline + # run chromosomes specified on commandline foreach (split(",", $chr)) { push @top_slices, $slice_adaptor->fetch_by_region("toplevel", $_); } } else { - ## run all chromosomes for this species + # run all chromosomes for this species @top_slices = @{$slice_adaptor->fetch_all("toplevel")}; } -## calculate block size (assuming 4000 blocks per genome) +# calculate block size (assuming 4000 blocks per genome) my ( $block_size, $genome_size ); for my $slice ( @{$slice_adaptor->fetch_all("toplevel")} ) { $genome_size += $slice->length; } $block_size = int( $genome_size / 4000 ); -## analysis +# analysis my $analysis = new Bio::EnsEMBL::Analysis ( -program => "glovar_snp_density.pl", -database => "vega", @@ -125,20 +131,40 @@ my $analysis = new Bio::EnsEMBL::Analysis ( -logic_name => "snpDensity"); $aa->store( $analysis ) unless $dry; -## density type +# density type my $dt = Bio::EnsEMBL::DensityType->new( -analysis => $analysis, -block_size => $block_size, -value_type => 'sum'); $dta->store($dt) unless $dry; -## loop over chromosomes +# loop over chromosomes my @chr; foreach my $sl (@top_slices) { push @chr, $sl->seq_region_name; } print STDERR "\nAvailable chromosomes: @chr\n"; +# settings for AV index dump +use constant SNP_LINE => join("\t", + 'Glovar SNP', '%s', '/%s/snpview?snp=%s&source=glovar', '%s', + "Single nucleotide polymorphism (SNP) %s [Alleles: %s]. Alternative IDs: %s.\n" +); +if ($avdump) { + print STDERR "Preparing directories for AV index dump...\n"; + my $dumpdir = "$ENV{'ENSEMBL_SERVERROOT'}/utils/indexing/input"; + unless (-e $dumpdir) { + mkdir $dumpdir, 0777 or die "Could not creat directory $dumpdir: $!\n"; + } + unless (-e "$dumpdir/$species") { + mkdir "$dumpdir/$species", 0777 or die + "Could not creat directory $dumpdir/$species: $!\n"; + } + open (AV, ">>$dumpdir/$species/SNP.txt") or die + "Could not open $dumpdir/$species/SNP.txt for writing: $!\n"; + print STDERR "Done.\n"; +} + my ($current_start, $current_end); foreach my $slice (@top_slices) { $current_start = 1; @@ -149,7 +175,7 @@ foreach my $slice (@top_slices) { print STDERR "\nSNP densities for chr $chr with block size $block_size\n"; print STDERR "Start at " . `date`; - ## loop over blocks + # loop over blocks while($current_start <= $slice->end) { $i++; $current_end = $current_start+$block_size-1; @@ -166,12 +192,34 @@ foreach my $slice (@top_slices) { $current_start = $current_end + 1; next; } - ## only count snps that don't overlap slice start - ## also, avoid duplicate counting - my %snps = map { "$_->name => 1" if ($_->start >= 1) } @{$snps}; + # only count snps that don't overlap slice start + # also, avoid duplicate counting + my %snps = map { "$_->display_id => 1" if ($_->start >= 1) } @{$snps}; $count = scalar(keys %snps); - ## density + # AV index dump + if ($avdump) { + foreach my $snpo (@{$snps}) { + next if ($snpo->start < 1); + my $snpid = $snpo->display_id; + my (@IDs, @desc); + foreach my $link ($snpo->each_DBLink) { + push @IDs, $link->primary_id; + push @desc, $link->database . ": " . $link->primary_id; + } + print AV sprintf SNP_LINE, + $snpid, + $species, + $snpid, + join(" ", @IDs), + $snpid, + $snpo->alleles, + join(", ", @desc) + ; + } + } + + # density my $df = Bio::EnsEMBL::DensityFeature->new (-seq_region => $slice, -start => $current_start, @@ -182,12 +230,12 @@ foreach my $slice (@top_slices) { $dfa->store($df) unless $dry; $total += $count; - ## logging + # logging print STDERR "Chr: $chr | Bin: $i/$bins | Count: $count | "; print STDERR "Mem: " . `ps $$ -o vsz |tail -1`; } - ## stats + # stats print STDERR "Total for chr $chr: $total\n"; my $stat = Bio::EnsEMBL::Attribute->new (-NAME => 'SNPs', @@ -196,6 +244,7 @@ foreach my $slice (@top_slices) { -DESCRIPTION => 'Total Number of SNPs'); $attrib_adaptor->store_on_Slice($slice, [$stat]) unless $dry; } +close AV if $avdump; print STDERR "\nAll done at " . `date` . "\n"; diff --git a/misc-scripts/density_feature/glovar_snp_wrapper.pl b/misc-scripts/density_feature/glovar_snp_wrapper.pl index a3748b3447e95f3da4e6bbe0f564f8247251b1a8..9008a4170095c0f2732192890dc28f7297d3151a 100755 --- a/misc-scripts/density_feature/glovar_snp_wrapper.pl +++ b/misc-scripts/density_feature/glovar_snp_wrapper.pl @@ -10,12 +10,13 @@ Wrapper for glovar_snp_density.pl ./glovar_snp_density.pl --species=Homo_sapiens [--dry-run|-n] + [--avdump|-a] =head1 DESCRIPTION Wrapper for glovar_snp_density.pl to run it chromosome by chromosome. This is an attempt to avoid high memory footprints caused by a memory leak somerwhere -in the API ... +in the API. See glovar_snp_density.pl for more detailed documentation. =head1 LICENCE @@ -49,17 +50,20 @@ use SiteDefs; use EnsWeb; use Getopt::Long; -my ($species, $dry); +my ($species, $dry, $avdump); &GetOptions( "species=s" => \$species, "dry-run" => \$dry, "n" => \$dry, + "avdump" => \$avdump, + "a" => \$avdump, ); unless ($species) { print qq(Usage: ./glovar_snp_density.pl --species=Homo_sapiens + [--avdump|-a] [--dry-run|-n]\n\n); exit; } @@ -69,8 +73,9 @@ $ENV{'ENSEMBL_SPECIES'} = $species; ## run glovar_snp_density.pl for each chromsome in this species my $command = "./glovar_snp_density.pl --species=$species"; $command .= " -n" if ($dry); +$command .= " -a" if ($avdump); foreach my $chr (@{$EnsWeb::species_defs->ENSEMBL_CHROMOSOMES}) { warn "$command --chr=$chr\n"; - system("$command --chr=$chr"); + system("$command --chr=$chr") == 0 or die "$command --chr=$chr failed: $!\n"; } diff --git a/misc-scripts/density_feature/vega_gene_density.pl b/misc-scripts/density_feature/vega_gene_density.pl index 24a3d04eb23bc3b8833e73244254ec18bebf2d7c..31bbb578ac38dcca5f35892bcd15739570afcf5a 100755 --- a/misc-scripts/density_feature/vega_gene_density.pl +++ b/misc-scripts/density_feature/vega_gene_density.pl @@ -102,17 +102,33 @@ my $slice_adaptor = $db->get_SliceAdaptor; my $top_slices = $slice_adaptor->fetch_all('toplevel'); ## determine blocksize, assuming you want 150 blocks for the smallest -## chromosome -my ($block_count, $genome_size, $block_size); -my (@chr, $block_size, $min_chr); +## chromosome over 5Mb in size. Use an extra, smaller bin size for chromosomes smaller than 5Mb +my $big_chr = []; +my $small_chr = []; +my (@big_chr_names, $big_block_size, $min_big_chr); +my (@small_chr_names, $small_block_size, $min_small_chr); for my $slice ( @$top_slices ) { - if (! $min_chr or ($min_chr > $slice->length)) { - $min_chr = $slice->length; + if ($slice->length < 5000000) { + if (! $min_small_chr or ($min_small_chr > $slice->length)) { + $min_small_chr = $slice->length; + } + push @small_chr_names, $slice->seq_region_name; + push @{$small_chr->[0]}, $slice; } - push @chr, $slice->seq_region_name; + if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) { + $min_big_chr = $slice->length; + } + push @big_chr_names, $slice->seq_region_name; + push @{$big_chr->[0]}, $slice; } -$block_size = int( $min_chr / 150 ); -print STDERR "\nAvailable chromosomes: @chr\n"; + +$big_block_size = int( $min_big_chr / 150 ); +push @{$big_chr}, $big_block_size; +$small_block_size = int( $min_small_chr / 150 ); +push @{$small_chr}, $small_block_size; + +print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n"; +print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n"; ## gene types my %gene_types = ( @@ -137,176 +153,187 @@ print STDERR join(" ", sort keys %gene_types); print STDERR "\n"; ## create analysis and density type objects -my %dtcache; -foreach my $type (keys %gene_types) { - my $analysis = new Bio::EnsEMBL::Analysis ( - -program => "vega_gene_density.pl", - -database => "ensembl", - -gff_source => "vega_gene_density.pl", - -gff_feature => "density", - -logic_name => $gene_types{$type}); - $aa->store($analysis) unless $dry; - $analysis = $aa->fetch_by_logic_name($gene_types{$type}); - - my $dt = Bio::EnsEMBL::DensityType->new(-analysis => $analysis, - -block_size => $block_size, - -value_type => 'sum'); - $dta->store($dt) unless $dry; - $dtcache{$gene_types{$type}} = $dt; +my %dtcache; +foreach my $block_size ($big_block_size,$small_block_size) { + eval { + foreach my $type (keys %gene_types) { + my $analysis = new Bio::EnsEMBL::Analysis ( + -program => "vega_gene_density.pl", + -database => "ensembl", + -gff_source => "vega_gene_density.pl", + -gff_feature => "density", + -logic_name => $gene_types{$type}); + $aa->store($analysis) unless $dry; + $analysis = $aa->fetch_by_logic_name($gene_types{$type}); + my $dt = Bio::EnsEMBL::DensityType->new(-analysis => $analysis, + -block_size => $block_size, + -value_type => 'sum'); + $dta->store($dt) unless $dry; + $dtcache{$block_size}{$gene_types{$type}} = $dt; + } + } } -## loop over chromosomes + +## loop over chromosomes, doing big ones then small ones my ( $current_start, $current_end ); -foreach my $slice (@$top_slices){ - $current_start = 1; - my $chr = $slice->seq_region_name; - my (%total, $i, %gene_names); - my $bins = POSIX::ceil($slice->end / $block_size); - - print STDERR "\nGene densities for chr $chr with block size $block_size\n"; - print STDERR "Start at " . `date`; - - ## loop over blocks - my @density_features; - while($current_start <= $slice->end) { - $i++; - $current_end = $current_start+$block_size-1; - if( $current_end > $slice->end ) { - $current_end = $slice->end; - } - my $sub_slice = $slice->sub_Slice( $current_start, $current_end ); - my %num = (); - - ## count genes by type - my $genes; - eval { $genes = $sub_slice->get_all_Genes; }; - if ($@) { - warn $@; - $current_start = $current_end + 1; - next; - } - foreach my $gene (@{$genes}) { - ## only count genes that don't overlap the subslice start - ## (since these were already counted in the last bin) - if ($gene->start >= 1) { - $total{$gene->type}++; - } - $num{$gene_types{$gene->type}}++; - } - - ## create DensityFeature objects for each type - foreach my $type (keys %density_types) { - - push @density_features, Bio::EnsEMBL::DensityFeature->new - (-seq_region => $slice, - -start => $current_start, - -end => $current_end, - -density_type => $dtcache{$type}, - -density_value => $num{$type} ||0 - ); - } - $current_start = $current_end + 1; - - ## logging - print STDERR "Chr: $chr | Bin: $i/$bins | Counts: "; - print STDERR join(",", map { $num{$gene_types{$_}} || 0 } - sort keys %gene_types); - print STDERR " | "; - print STDERR "Mem: " . `ps $$ -o vsz |tail -1`; +foreach my $object ($big_chr, $small_chr) { + eval { + my $block_size = $object->[1]; + foreach my $slice (@{$object->[0]}){ + $current_start = 1; + my $chr = $slice->seq_region_name; + my (%total, $i, %gene_names); + my $bins = POSIX::ceil($slice->end / $block_size); + + print STDERR "\nGene densities for chr $chr with block size $block_size\n"; + print STDERR "Start at " . `date`; + + ## loop over blocks + my @density_features; + while($current_start <= $slice->end) { + $i++; + $current_end = $current_start+$block_size-1; + if( $current_end > $slice->end ) { + $current_end = $slice->end; + } + my $sub_slice = $slice->sub_Slice( $current_start, $current_end ); + my %num = (); + + ## count genes by type + my $genes; + eval { $genes = $sub_slice->get_all_Genes; }; + if ($@) { + warn $@; + $current_start = $current_end + 1; + next; + } + foreach my $gene (@{$genes}) { + ## only count genes that don't overlap the subslice start + ## (since these were already counted in the last bin) + if ($gene->start >= 1) { + $total{$gene->type}++; + } + $num{$gene_types{$gene->type}}++; + } + + ## create DensityFeature objects for each type + foreach my $type (keys %density_types) { + + push @density_features, Bio::EnsEMBL::DensityFeature->new + (-seq_region => $slice, + -start => $current_start, + -end => $current_end, + -density_type => $dtcache{$block_size}{$type}, + -density_value => $num{$type} ||0 + ); + } + $current_start = $current_end + 1; + + ## logging + print STDERR "Chr: $chr | Bin: $i/$bins | Counts: "; + print STDERR join(",", map { $num{$gene_types{$_}} || 0 } + sort keys %gene_types); + print STDERR " | "; + print STDERR "Mem: " . `ps $$ -o vsz |tail -1`; + } + + + ## store DensityFeatures for the chromosome + $dfa->store(@density_features) unless $dry; + + ## stats + my @attribs; + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '12:Known genes', + -CODE => 'KnownGeneCount', + -VALUE => $total{'Known'} || 0, + -DESCRIPTION => 'Total Number of Known genes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '14:Novel CDS', + -CODE => 'NovelCDSCount', + -VALUE => $total{'Novel_CDS'} || 0, + -DESCRIPTION => 'Total Number of Novel CDSs'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '16:Novel transcripts', + -CODE => 'NovelTransCount', + -VALUE => $total{'Novel_Transcript'} || 0, + -DESCRIPTION => 'Total Number of Novel transcripts'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '24:Putative transcripts', + -CODE => 'PutTransCount', + -VALUE => $total{'Putative'} || 0, + -DESCRIPTION => 'Total Number of Putative transcripts'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '30:Predicted transcripts', + -CODE => 'PredTransCount', + -VALUE => $total{'Predicted_Gene'} || 0, + -DESCRIPTION => 'Total Number of Predicted transcripts'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '26:Ig segments', + -CODE => 'IgSegCount', + -VALUE => $total{'Ig_Segment'} || 0, + -DESCRIPTION => 'Total Number of Ig Segments'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '28:Ig pseudogene Segments', + -CODE => 'IgPsSegCount', + -VALUE => $total{'Ig_Pseudogene_Segment'} || 0, + -DESCRIPTION => 'Total Number of Ig Pseudogene Segments'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '18:Total pseudogenes', + -CODE => 'TotPsCount', + -VALUE => $total{'Pseudogene'} + + $total{'Processed_pseudogene'} + + $total{'Unprocessed_pseudogene' || 0}, + -DESCRIPTION => 'Total Number of Pseudogenes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '20:Processed pseudogenes', + -CODE => 'ProcPsCount', + -VALUE => $total{'Processed_pseudogene'} || 0, + -DESCRIPTION => 'Number of Processed pseudogenes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '22:Unprocessed pseudogenes', + -CODE => 'UnprocPsCount', + -VALUE => $total{'Unprocessed_pseudogene'} || 0, + -DESCRIPTION => 'Number of Unprocessed pseudogenes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '13:Known genes (in progress)', + -CODE => 'KnwnprogCount', + -VALUE => $total{'Known_in_progress'} || 0, + -DESCRIPTION => 'Number of Known Genes in progress'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '15:Novel CDS (in progress)', + -CODE => 'NovCDSprogCount', + -VALUE => $total{'Novel_CDS_in_progress'} || 0, + -DESCRIPTION => 'Number of novel CDS in progress'); + + #only store unclassified pseudogenes if there are no processed and unprocessed pseudos, ie if + #total pseudos eq pseudos + unless ($total{'Unprocessed_pseudogene'} == 0 && $total{'Processed_pseudogene'} == 0) { + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => '23:Unclassified pseudogenes', + -CODE => 'UnclassPsCount', + -VALUE => $total{'Pseudogene'} || 0, + -DESCRIPTION => 'Number of Unclassified pseudogenes'); + } + + $attrib_adaptor->store_on_Slice($slice, \@attribs) unless $dry; + + print STDERR "Total for chr $chr:\n"; + print STDERR map { "\t$_ => $total{$_}\n" } sort keys %total; + } } - - - ## store DensityFeatures for the chromosome - $dfa->store(@density_features) unless $dry; - - ## stats - my @attribs; - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '12:Known genes', - -CODE => 'KnownGeneCount', - -VALUE => $total{'Known'} || 0, - -DESCRIPTION => 'Total Number of Known genes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '14:Novel CDS', - -CODE => 'NovelCDSCount', - -VALUE => $total{'Novel_CDS'} || 0, - -DESCRIPTION => 'Total Number of Novel CDSs'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '16:Novel transcripts', - -CODE => 'NovelTransCount', - -VALUE => $total{'Novel_Transcript'} || 0, - -DESCRIPTION => 'Total Number of Novel transcripts'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '24:Putative transcripts', - -CODE => 'PutTransCount', - -VALUE => $total{'Putative'} || 0, - -DESCRIPTION => 'Total Number of Putative transcripts'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '30:Predicted transcripts', - -CODE => 'PredTransCount', - -VALUE => $total{'Predicted_Gene'} || 0, - -DESCRIPTION => 'Total Number of Predicted transcripts'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '26:Ig segments', - -CODE => 'IgSegCount', - -VALUE => $total{'Ig_Segment'} || 0, - -DESCRIPTION => 'Total Number of Ig Segments'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '28:Ig pseudogene Segments', - -CODE => 'IgPsSegCount', - -VALUE => $total{'Ig_Pseudogene_Segment'} || 0, - -DESCRIPTION => 'Total Number of Ig Pseudogene Segments'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '18:Total pseudogenes', - -CODE => 'TotPsCount', - -VALUE => $total{'Pseudogene'} - + $total{'Processed_pseudogene'} - + $total{'Unprocessed_pseudogene' || 0}, - -DESCRIPTION => 'Total Number of Pseudogenes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '23:Unclassified pseudogenes', - -CODE => 'UnclassPsCount', - -VALUE => $total{'Pseudogene'} || 0, - -DESCRIPTION => 'Number of Unclassified pseudogenes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '20:Processed pseudogenes', - -CODE => 'ProcPsCount', - -VALUE => $total{'Processed_pseudogene'} || 0, - -DESCRIPTION => 'Number of Processed pseudogenes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '22:Unprocessed pseudogenes', - -CODE => 'UnprocPsCount', - -VALUE => $total{'Unprocessed_pseudogene'} || 0, - -DESCRIPTION => 'Number of Unprocessed pseudogenes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '13:Known genes (in progress)', - -CODE => 'KnwnprogCount', - -VALUE => $total{'Known_in_progress'} || 0, - -DESCRIPTION => 'Number of Known Genes in progress'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => '15:Novel CDS (in progress)', - -CODE => 'NovCDSprogCount', - -VALUE => $total{'Novel_CDS_in_progress'} || 0, - -DESCRIPTION => 'Number of novel CDS in progress'); - - $attrib_adaptor->store_on_Slice($slice, \@attribs) unless $dry; - - print STDERR "Total for chr $chr:\n"; - print STDERR map { "\t$_ => $total{$_}\n" } sort keys %total; - } - print STDERR "\nAll done at " . `date` . "\n"; diff --git a/misc-scripts/density_feature/vega_percent_gc_calc.pl b/misc-scripts/density_feature/vega_percent_gc_calc.pl new file mode 100644 index 0000000000000000000000000000000000000000..3ac5dd2e44773245a15a94bdd67795c575da3a25 --- /dev/null +++ b/misc-scripts/density_feature/vega_percent_gc_calc.pl @@ -0,0 +1,196 @@ +#!/usr/bin/perl -w + +# +# Calculate the GC content for top level seq_regions +# small regions 500bp to be able to display on contigview +# big regions genomesize / 4000 for 4000 features on the genome + +use strict; +BEGIN { + $ENV{'ENSEMBL_SERVERROOT'} = "../../.."; + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live"); +} + +use SiteDefs; +use EnsWeb; +use EnsEMBL::DB::Core; +use Getopt::Long; +use Bio::EnsEMBL::DensityType; +use Bio::EnsEMBL::DensityFeature; +use POSIX; + +use Data::Dumper; + +my ($species, $dry, $help); +&GetOptions( + "species=s" => \$species, + "dry-run" => \$dry, + "n" => \$dry, + "help" => \$help, + "h" => \$help, +); + +if($help || !$species){ + print qq(Usage: + ./vega_gene_density.pl + --species=Homo_sapiens + [--dry-run|-n] + [--help|-h]\n\n); + exit; +} + +$ENV{'ENSEMBL_SPECIES'} = $species; + +## set db user/pass to allow write access +my $db_ref = $EnsWeb::species_defs->databases; +$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER; +$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS; + +## connect to databases +my $databases = &EnsEMBL::DB::Core::get_databases(qw(core)); +my $db = $databases->{'core'}; + +die "Problem connecting to databases: $databases->{'error'}\n" + if $databases->{'error'} ; +warn "Database error: $databases->{'non_fatal_error'}\n" + if $databases->{'non_fatal_error'}; + +# +# Get the adaptors needed; +# +my $slice_adaptor = $db->get_SliceAdaptor(); +my $dfa = $db->get_DensityFeatureAdaptor(); +my $dta = $db->get_DensityTypeAdaptor(); +my $aa = $db->get_AnalysisAdaptor(); + +my $top_slices = $slice_adaptor->fetch_all( "toplevel" ); + +my $big_chr = []; +my $small_chr = []; + +my (@big_chr_names, $big_block_size, $min_big_chr); +my (@small_chr_names, $small_block_size, $min_small_chr); +for my $slice ( @$top_slices ) { + if ($slice->length < 5000000) { + if (! $min_small_chr or ($min_small_chr > $slice->length)) { + $min_small_chr = $slice->length; + } + push @small_chr_names, $slice->seq_region_name; + push @{$small_chr->[0]}, $slice; + } + if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) { + $min_big_chr = $slice->length; + } + push @big_chr_names, $slice->seq_region_name; + push @{$big_chr->[0]}, $slice; +} + +$big_block_size = int( $min_big_chr / 150 ); +$small_block_size = int( $min_small_chr / 150 ); +push @{$big_chr}, $big_block_size; +push @{$small_chr}, $small_block_size; + + +print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n"; +print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n"; + +# +# Create new analysis object for density calculation. +# +my $analysis = new Bio::EnsEMBL::Analysis (-program => "percent_gc_calc.pl", + -database => "ensembl", + -gff_source => "percent_gc_calc.pl", + -gff_feature => "density", + -logic_name => "PercentGC"); +$aa->store($analysis) unless $dry; + +# +# Create new density types +# +if ($small_block_size) { + my $small_density_type = Bio::EnsEMBL::DensityType->new + (-analysis => $analysis, + -block_size => $small_block_size, + -value_type => 'ratio'); + $dta->store($small_density_type) unless $dry; + push @{$small_chr}, $small_density_type; +} + +if ($big_block_size) { + my $big_density_type = Bio::EnsEMBL::DensityType->new + (-analysis => $analysis, + -block_size => $big_block_size, + -value_type => 'ratio'); + $dta->store($big_density_type) unless $dry; + push @{$big_chr}, $big_density_type; +} + +my ( $current_start, $current_end ); + +$Data::Dumper::Maxdepth = 2; + +my $types = []; +foreach my $object ( $big_chr, $small_chr) { + eval { + my $block_size = $object->[1]; + foreach my $slice (@{$object->[0]}){ + $current_start = 1; + warn "Chromosome ",$slice->seq_region_name; + while($current_start <= $slice->end()) { + $current_end = $current_start+$block_size-1; + if( $current_end > $slice->end() ) { + $current_end = $slice->end(); + } + my $sub_slice = $slice->sub_Slice( $current_start, $current_end ); + warn "start = $current_start, end = $current_end\n"; + my $gc = $sub_slice->get_base_count()->{'%gc'}; + my $df = Bio::EnsEMBL::DensityFeature->new + (-seq_region => $slice, + -start => $current_start, + -end => $current_end, + -density_type => $object->[2], + -density_value => $gc); + $dfa->store($df) unless $dry; + $current_start = $current_end+1; + } + } + }; +} + +# +# helper to draw an ascii representation of the density features +# +sub print_features { + my $features = shift; + + return if(!@$features); + + my $sum = 0; + my $length = 0; + # my $type = $features->[0]->{'density_type'}->value_type(); + + print("\n"); + my $max=0; + foreach my $f (@$features) { + if($f->density_value() > $max){ + $max=$f->density_value(); + } + } + foreach my $f (@$features) { + my $i=1; + for(; $i< ($f->density_value()/$max)*40; $i++){ + print "*"; + } + for(my $j=$i;$j<40;$j++){ + print " "; + } + print " ".$f->density_value()."\t".$f->start()."\n"; + } +} diff --git a/misc-scripts/density_feature/vega_repeat_coverage_calc.pl b/misc-scripts/density_feature/vega_repeat_coverage_calc.pl new file mode 100644 index 0000000000000000000000000000000000000000..a11f6b35fa1b0b8d8e830c71c38e784659dc123d --- /dev/null +++ b/misc-scripts/density_feature/vega_repeat_coverage_calc.pl @@ -0,0 +1,201 @@ +#!/usr/bin/perl -w +# +# Calculate the repeat coverage for given database. +# condition: 1k blocks to show contigview displays +# 4000 blocks for a whole genome +# +# checks wether database contains repeats before doing anything +use strict; +BEGIN { + $ENV{'ENSEMBL_SERVERROOT'} = "../../.."; + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live"); +} + +use SiteDefs; +use EnsWeb; +use EnsEMBL::DB::Core; +use Getopt::Long; +use Bio::EnsEMBL::DensityType; +use Bio::EnsEMBL::DensityFeature; +use Bio::EnsEMBL::Mapper::RangeRegistry; +use POSIX; + +use Data::Dumper; + +my ($species, $dry, $help); +&GetOptions( + "species=s" => \$species, + "dry-run" => \$dry, + "n" => \$dry, + "help" => \$help, + "h" => \$help, +); + +if($help || !$species){ + print qq(Usage: + ./vega_gene_density.pl + --species=Homo_sapiens + [--dry-run|-n] + [--help|-h]\n\n); + exit; +} + +$ENV{'ENSEMBL_SPECIES'} = $species; + +## set db user/pass to allow write access +my $db_ref = $EnsWeb::species_defs->databases; +$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER; +$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS; + +## connect to databases +my $databases = &EnsEMBL::DB::Core::get_databases(qw(core)); +my $db = $databases->{'core'}; + +die "Problem connecting to databases: $databases->{'error'}\n" + if $databases->{'error'} ; +warn "Database error: $databases->{'non_fatal_error'}\n" + if $databases->{'non_fatal_error'}; + +# +# Get the adaptors needed; +# +my $slice_adaptor = $db->get_SliceAdaptor(); +my $dfa = $db->get_DensityFeatureAdaptor(); +my $dta = $db->get_DensityTypeAdaptor(); +my $aa = $db->get_AnalysisAdaptor(); + +my $top_slices = $slice_adaptor->fetch_all( "toplevel" ); +my $big_chr = []; +my $small_chr = []; +my (@big_chr_names, $big_block_size, $min_big_chr); +my (@small_chr_names, $small_block_size, $min_small_chr); +for my $slice ( @$top_slices ) { + if ($slice->length < 5000000) { + if (! $min_small_chr or ($min_small_chr > $slice->length)) { + $min_small_chr = $slice->length; + } + push @small_chr_names, $slice->seq_region_name; + push @{$small_chr->[0]}, $slice; + } + if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) { + $min_big_chr = $slice->length; + } + push @big_chr_names, $slice->seq_region_name; + push @{$big_chr->[0]}, $slice; +} + + +$big_block_size = int( $min_big_chr / 150 ); +$small_block_size = int( $min_small_chr / 150 ); +push @{$big_chr}, $big_block_size; +push @{$small_chr}, $small_block_size; + +print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n"; +print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n"; + +# +# Create new analysis object for density calculation. +# +my $analysis = new Bio::EnsEMBL::Analysis (-program => "repeat_coverage_calc.pl", + -database => "ensembl", + -gff_source => "repeat_coverage_calc.pl", + -gff_feature => "density", + -logic_name => "PercentageRepeat"); +$aa->store($analysis) unless $dry; + +if ($small_block_size) { + my $small_density_type = Bio::EnsEMBL::DensityType->new + (-analysis => $analysis, + -block_size => $small_block_size, + -value_type => 'ratio'); + $dta->store($small_density_type) unless $dry; + push @{$small_chr}, $small_density_type; +} + +if ($big_block_size) { + my $big_density_type = Bio::EnsEMBL::DensityType->new + (-analysis => $analysis, + -block_size => $big_block_size, + -value_type => 'ratio'); + $dta->store($big_density_type) unless $dry; + push @{$big_chr}, $big_density_type; +} + +my ( $current_start, $current_end ); + +foreach my $object ($big_chr, $small_chr) { + eval { + my $block_size = $object->[1]; + foreach my $slice ( @{$object->[0]} ) { + $current_start = 1; + warn "Chromosome ", $slice->seq_region_name; + while($current_start <= $slice->end()) { + $current_end = $current_start+$block_size-1; + if( $current_end > $slice->end() ) { + $current_end = $slice->end(); + } + my $this_block_size = $current_end - $current_start + 1; + my $sub_slice = $slice->sub_Slice( $current_start, $current_end ); + warn "start = $current_start, end = $current_end\n"; + my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + foreach my $repeat (@{$sub_slice->get_all_RepeatFeatures()}){ + $rr->check_and_register("1",$repeat->start,$repeat->end); + } + my $count = 0; + my $non_repeats = $rr->check_and_register("1",1,$this_block_size); + if( defined $non_repeats ) { + foreach my $non_repeat ( @$non_repeats ) { + $count += ($non_repeat->[1]-$non_repeat->[0])+1; + } + } + my $percentage_repeat = (($this_block_size-$count)/$this_block_size)*100; + my $df = Bio::EnsEMBL::DensityFeature->new + (-seq_region => $slice, + -start => $current_start, + -end => $current_end, + -density_type => $object->[2], + -density_value => $percentage_repeat); + $dfa->store($df) unless $dry; + $current_start = $current_end + 1; + } + } + }; +} + +# +# helper to draw an ascii representation of the density features +# +sub print_features { + my $features = shift; + + return if(!@$features); + + my $sum = 0; + my $length = 0; +# my $type = $features->[0]->{'density_type'}->value_type(); + + print("\n"); + my $max=0; + foreach my $f (@$features) { + if($f->density_value() > $max){ + $max=$f->density_value(); + } + } + foreach my $f (@$features) { + my $i=1; + for(; $i< ($f->density_value()/$max)*40; $i++){ + print "*"; + } + for(my $j=$i;$j<40;$j++){ + print " "; + } + print " ".$f->density_value()."\t".$f->start()."\n"; + } +} diff --git a/misc-scripts/repeats/vega_repeat_libraries.pl b/misc-scripts/repeats/vega_repeat_libraries.pl index c536dd5bae0cec60d428fbb2fb5ad721bbc3fbf9..9ff458b02fcfc1111d0df98033b2e64eb79fe148 100644 --- a/misc-scripts/repeats/vega_repeat_libraries.pl +++ b/misc-scripts/repeats/vega_repeat_libraries.pl @@ -47,7 +47,7 @@ my $dbh = DBI->connect( $dsn, $user, $pass ); my @dbnames = map {$_->[0] } @{ $dbh->selectall_arrayref( "show databases" ) }; -my @dbs = grep {$_ =~ /$dbpattern/} @dbnames; +my @dbs = grep {$_ =~ /^$dbpattern$/} @dbnames; foreach my $db (@dbs) { open RFILE, $repeatfile or die("Could not open repeat file $repeatfile");