changes from branch-vega-23-dev

3ec37fc5 · Patrick Meidl · f6782625 · 3ec37fc5 · 3ec37fc5 · 3ec37fc5
Commit 3ec37fc5 authored 20 years ago by Patrick Meidl
--- a/misc-scripts/density_feature/glovar_snp_density.pl
+++ b/misc-scripts/density_feature/glovar_snp_density.pl
@@ -3,7 +3,8 @@
 =head1 NAME

 glovar_snp_density.pl -
-script to calculate glovar SNP density and stats for Vega
+Script to calculate glovar SNP density and stats (and optioanlly prepare AV
+index dumps) for Vega.

 =head1 SYNOPSIS

@@ -11,13 +12,15 @@ script to calculate glovar SNP density and stats for Vega
        --species=Homo_sapiens
        [--chr=6,13,14]
        [--dry-run|-n]
+        [--avdump|-a]
        [--help|-h]

 =head1 DESCRIPTION

 This script calculates Glovar SNP densities and total numbers per chromosome
 for use in mapview. Can be run for individual chromosomes if desired (default:
-all chromosomes).
+all chromosomes). It optionally also dumps SNPs into a file for generating the
+AV search index.

 =head1 LICENCE

@@ -55,12 +58,14 @@ use Bio::EnsEMBL::DensityType;
 use Bio::EnsEMBL::DensityFeature;
 use POSIX;

-my ($species, $chr, $dry, $help);
+my ($species, $chr, $dry, $avdump, $help);
 &GetOptions(
    "species=s" => \$species,
    "chr=s"     => \$chr,
    "dry-run"   => \$dry,
    "n"         => \$dry,
+    "avdump"    => \$avdump,
+    "a"         => \$avdump,
    "help"      => \$help,
    "h"         => \$help,
 );
@@ -71,18 +76,19 @@ if($help || !$species){
        --species=Homo_sapiens
        [--chr=6,13,14]
        [--dry-run|-n]
+        [--avdump|-a]
        [--help|-h]\n\n);
    exit;
 }

 $ENV{'ENSEMBL_SPECIES'} = $species;

-## set db user/pass to allow write access
+# set db user/pass to allow write access
 my $db_ref = $EnsWeb::species_defs->databases;
 $db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER;
 $db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS;

-## connect to databases
+# connect to databases
 my $databases = &EnsEMBL::DB::Core::get_databases(qw(core glovar));

 die "Problem connecting to databases: $databases->{'error'}\n"
@@ -90,33 +96,33 @@ die "Problem connecting to databases: $databases->{'error'}\n"
 warn "Database error: $databases->{'non_fatal_error'}\n"
    if $databases->{'non_fatal_error'};

-## get the adaptors needed
+# get the adaptors needed
 my $dfa = $databases->{'core'}->get_DensityFeatureAdaptor;
 my $dta = $databases->{'core'}->get_DensityTypeAdaptor;
 my $aa  = $databases->{'core'}->get_AnalysisAdaptor;
 my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor;
 my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor;

-## which chromosomes do we run?
+# which chromosomes do we run?
 my @top_slices;
 if ($chr) {
-    ## run chromosomes specified on commandline
+    # run chromosomes specified on commandline
    foreach (split(",", $chr)) {
        push @top_slices, $slice_adaptor->fetch_by_region("toplevel", $_);
    }
 } else {
-    ## run all chromosomes for this species
+    # run all chromosomes for this species
    @top_slices = @{$slice_adaptor->fetch_all("toplevel")};
 }

-## calculate block size (assuming 4000 blocks per genome)
+# calculate block size (assuming 4000 blocks per genome)
 my ( $block_size, $genome_size );
 for my $slice ( @{$slice_adaptor->fetch_all("toplevel")} ) {
    $genome_size += $slice->length;
 }
 $block_size = int( $genome_size / 4000 );

-## analysis
+# analysis
 my $analysis = new Bio::EnsEMBL::Analysis (
        -program     => "glovar_snp_density.pl",
        -database    => "vega",
@@ -125,20 +131,40 @@ my $analysis = new Bio::EnsEMBL::Analysis (
        -logic_name  => "snpDensity");
 $aa->store( $analysis ) unless $dry;

-## density type
+# density type
 my $dt = Bio::EnsEMBL::DensityType->new(
        -analysis   => $analysis,
        -block_size => $block_size,
        -value_type => 'sum');
 $dta->store($dt) unless $dry;

-## loop over chromosomes
+# loop over chromosomes
 my @chr;
 foreach my $sl (@top_slices) {
    push @chr, $sl->seq_region_name;
 }
 print STDERR "\nAvailable chromosomes: @chr\n";

+# settings for AV index dump
+use constant SNP_LINE => join("\t",
+    'Glovar SNP', '%s', '/%s/snpview?snp=%s&source=glovar', '%s',
+    "Single nucleotide polymorphism (SNP) %s [Alleles: %s]. Alternative IDs: %s.\n"
+);
+if ($avdump) {
+    print STDERR "Preparing directories for AV index dump...\n";
+    my $dumpdir = "$ENV{'ENSEMBL_SERVERROOT'}/utils/indexing/input";
+    unless (-e $dumpdir) {
+        mkdir $dumpdir, 0777 or die "Could not creat directory $dumpdir: $!\n";
+    }
+    unless (-e "$dumpdir/$species") {
+        mkdir "$dumpdir/$species", 0777 or die
+            "Could not creat directory $dumpdir/$species: $!\n";
+    }
+    open (AV, ">>$dumpdir/$species/SNP.txt") or die
+        "Could not open $dumpdir/$species/SNP.txt for writing: $!\n";
+    print STDERR "Done.\n";
+}
+
 my ($current_start, $current_end);
 foreach my $slice (@top_slices) {
    $current_start = 1;
@@ -149,7 +175,7 @@ foreach my $slice (@top_slices) {
    print STDERR "\nSNP densities for chr $chr with block size $block_size\n";
    print STDERR "Start at " . `date`;

-    ## loop over blocks
+    # loop over blocks
    while($current_start <= $slice->end) {
        $i++;
        $current_end = $current_start+$block_size-1;
@@ -166,12 +192,34 @@ foreach my $slice (@top_slices) {
            $current_start = $current_end + 1;
            next;
        }
-        ## only count snps that don't overlap slice start
-        ## also, avoid duplicate counting
-        my %snps = map { "$_->name => 1" if ($_->start >= 1) } @{$snps};
+        # only count snps that don't overlap slice start
+        # also, avoid duplicate counting
+        my %snps = map { "$_->display_id => 1" if ($_->start >= 1) } @{$snps};
        $count = scalar(keys %snps);

-        ## density
+        # AV index dump
+        if ($avdump) {
+            foreach my $snpo (@{$snps}) {
+                next if ($snpo->start < 1);
+                my $snpid = $snpo->display_id;
+                my (@IDs, @desc);
+                foreach my $link ($snpo->each_DBLink) {
+                    push @IDs, $link->primary_id;
+                    push @desc, $link->database . ": " . $link->primary_id;
+                }
+                print AV sprintf SNP_LINE,
+                    $snpid,
+                    $species,
+                    $snpid,
+                    join(" ", @IDs),
+                    $snpid,
+                    $snpo->alleles,
+                    join(", ", @desc)
+                ; 
+            }
+        }
+
+        # density
        my $df = Bio::EnsEMBL::DensityFeature->new
            (-seq_region    => $slice,
             -start         => $current_start,
@@ -182,12 +230,12 @@ foreach my $slice (@top_slices) {
        $dfa->store($df) unless $dry;
        $total += $count;

-        ## logging
+        # logging
        print STDERR "Chr: $chr | Bin: $i/$bins | Count: $count | ";
        print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
    }

-    ## stats
+    # stats
    print STDERR "Total for chr $chr: $total\n";
    my $stat = Bio::EnsEMBL::Attribute->new
 	(-NAME => 'SNPs',
@@ -196,6 +244,7 @@ foreach my $slice (@top_slices) {
 	 -DESCRIPTION => 'Total Number of SNPs');
    $attrib_adaptor->store_on_Slice($slice, [$stat]) unless $dry;
 }
+close AV if $avdump;

 print STDERR "\nAll done at " . `date` . "\n";

--- a/misc-scripts/density_feature/glovar_snp_wrapper.pl
+++ b/misc-scripts/density_feature/glovar_snp_wrapper.pl
@@ -10,12 +10,13 @@ Wrapper for glovar_snp_density.pl
    ./glovar_snp_density.pl
        --species=Homo_sapiens
        [--dry-run|-n]
+        [--avdump|-a]

 =head1 DESCRIPTION

 Wrapper for glovar_snp_density.pl to run it chromosome by chromosome. This is
 an attempt to avoid high memory footprints caused by a memory leak somerwhere
-in the API ...
+in the API. See glovar_snp_density.pl for more detailed documentation.

 =head1 LICENCE

@@ -49,17 +50,20 @@ use SiteDefs;
 use EnsWeb;
 use Getopt::Long;

-my ($species, $dry);
+my ($species, $dry, $avdump);
 &GetOptions(
    "species=s" => \$species,
    "dry-run"   => \$dry,
    "n"         => \$dry,
+    "avdump"    => \$avdump,
+    "a"         => \$avdump,
 );

 unless ($species) {
    print qq(Usage:
    ./glovar_snp_density.pl
        --species=Homo_sapiens
+        [--avdump|-a]
        [--dry-run|-n]\n\n);
    exit;
 }
@@ -69,8 +73,9 @@ $ENV{'ENSEMBL_SPECIES'} = $species;
 ## run glovar_snp_density.pl for each chromsome in this species
 my $command = "./glovar_snp_density.pl --species=$species";
 $command .= " -n" if ($dry);
+$command .= " -a" if ($avdump);
 foreach my $chr (@{$EnsWeb::species_defs->ENSEMBL_CHROMOSOMES}) {
    warn "$command --chr=$chr\n";
-    system("$command --chr=$chr");
+    system("$command --chr=$chr") == 0 or die "$command --chr=$chr failed: $!\n";
 }

--- a/misc-scripts/density_feature/vega_gene_density.pl
+++ b/misc-scripts/density_feature/vega_gene_density.pl
@@ -102,17 +102,33 @@ my $slice_adaptor = $db->get_SliceAdaptor;
 my $top_slices = $slice_adaptor->fetch_all('toplevel');

 ## determine blocksize, assuming you want 150 blocks for the smallest
-## chromosome
-my ($block_count, $genome_size, $block_size);
-my (@chr, $block_size, $min_chr);
+## chromosome over 5Mb in size. Use an extra, smaller bin size for chromosomes smaller than 5Mb
+my $big_chr = [];
+my $small_chr = [];
+my (@big_chr_names, $big_block_size, $min_big_chr);
+my (@small_chr_names, $small_block_size, $min_small_chr);
 for my $slice ( @$top_slices ) {
-    if (! $min_chr or ($min_chr > $slice->length)) {
-        $min_chr = $slice->length;
+    if ($slice->length < 5000000) {
+	if (! $min_small_chr or ($min_small_chr > $slice->length)) {
+	    $min_small_chr = $slice->length;
+	}
+	push @small_chr_names, $slice->seq_region_name;
+	push @{$small_chr->[0]}, $slice;
    }
-    push @chr, $slice->seq_region_name;
+    if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) {
+	$min_big_chr = $slice->length;
+    }
+    push @big_chr_names, $slice->seq_region_name;
+    push @{$big_chr->[0]}, $slice;
 }
-$block_size = int( $min_chr / 150 );
-print STDERR "\nAvailable chromosomes: @chr\n";
+
+$big_block_size = int( $min_big_chr / 150 );
+push @{$big_chr}, $big_block_size;
+$small_block_size = int( $min_small_chr / 150 );
+push @{$small_chr}, $small_block_size;
+
+print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n";
+print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n";

 ## gene types
 my %gene_types = (
@@ -137,176 +153,187 @@ print STDERR join(" ", sort keys %gene_types);
 print STDERR "\n";

 ## create analysis and density type objects
-my %dtcache;
-foreach my $type (keys %gene_types) {
-    my $analysis = new Bio::EnsEMBL::Analysis (
-            -program     => "vega_gene_density.pl",
-            -database    => "ensembl",
-            -gff_source  => "vega_gene_density.pl",
-            -gff_feature => "density",
-            -logic_name  => $gene_types{$type});
-    $aa->store($analysis) unless $dry;
-    $analysis = $aa->fetch_by_logic_name($gene_types{$type});
-
-    my $dt = Bio::EnsEMBL::DensityType->new(-analysis   => $analysis,
-            -block_size => $block_size,
-            -value_type => 'sum');
-    $dta->store($dt) unless $dry;
-    $dtcache{$gene_types{$type}} = $dt;
+my %dtcache;    
+foreach my $block_size ($big_block_size,$small_block_size) {
+    eval {
+	foreach my $type (keys %gene_types) {
+	    my $analysis = new Bio::EnsEMBL::Analysis (
+						       -program     => "vega_gene_density.pl",
+						       -database    => "ensembl",
+						       -gff_source  => "vega_gene_density.pl",
+						       -gff_feature => "density",
+						       -logic_name  => $gene_types{$type});
+	    $aa->store($analysis) unless $dry;
+	    $analysis = $aa->fetch_by_logic_name($gene_types{$type});
+	    my $dt = Bio::EnsEMBL::DensityType->new(-analysis   => $analysis,
+						    -block_size => $block_size,
+						    -value_type => 'sum');
+	    $dta->store($dt) unless $dry;
+	    $dtcache{$block_size}{$gene_types{$type}} = $dt;
+	}    
+    }
 }

-## loop over chromosomes
+
+## loop over chromosomes, doing big ones then small ones
 my ( $current_start, $current_end );
-foreach my $slice (@$top_slices){
-    $current_start = 1;
-    my $chr = $slice->seq_region_name;
-    my (%total, $i, %gene_names);
-    my $bins = POSIX::ceil($slice->end / $block_size);
-    
-    print STDERR "\nGene densities for chr $chr with block size $block_size\n";
-    print STDERR "Start at " . `date`;
-
-    ## loop over blocks
-    my @density_features;
-    while($current_start <= $slice->end) {
-        $i++;
-        $current_end = $current_start+$block_size-1;
-        if( $current_end > $slice->end ) {
-            $current_end = $slice->end;
-        }
-        my $sub_slice = $slice->sub_Slice( $current_start, $current_end );
-        my %num = ();
-
-        ## count genes by type
-        my $genes;
-        eval { $genes = $sub_slice->get_all_Genes; };
-        if ($@) {
-            warn $@;
-            $current_start = $current_end + 1;
-            next;
-        }
-        foreach my $gene (@{$genes}) {
-            ## only count genes that don't overlap the subslice start
-            ## (since these were already counted in the last bin)
-            if ($gene->start >= 1) {
-	      $total{$gene->type}++;
-            }
-	    $num{$gene_types{$gene->type}}++;
-        }
-
-        ## create DensityFeature objects for each type
-        foreach my $type (keys %density_types) {
-  
-            push @density_features, Bio::EnsEMBL::DensityFeature->new
-                (-seq_region    => $slice,
-                 -start         => $current_start,
-                 -end           => $current_end,
-                 -density_type  => $dtcache{$type},
-                 -density_value => $num{$type} ||0
-            );
-        }
-        $current_start = $current_end + 1;
-
-        ## logging
-        print STDERR "Chr: $chr | Bin: $i/$bins | Counts: ";
-        print STDERR join(",", map { $num{$gene_types{$_}} || 0 }
-                            sort keys %gene_types);
-        print STDERR " | ";
-        print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
+foreach my $object ($big_chr, $small_chr) {
+    eval {
+	my $block_size = $object->[1];
+	foreach my $slice (@{$object->[0]}){
+	    $current_start = 1;
+	    my $chr = $slice->seq_region_name;
+	    my (%total, $i, %gene_names);
+	    my $bins = POSIX::ceil($slice->end / $block_size);
+	    
+	    print STDERR "\nGene densities for chr $chr with block size $block_size\n";
+	    print STDERR "Start at " . `date`;
+	    
+	    ## loop over blocks
+	    my @density_features;
+	    while($current_start <= $slice->end) {
+		$i++;
+		$current_end = $current_start+$block_size-1;
+		if( $current_end > $slice->end ) {
+		    $current_end = $slice->end;
+		}
+		my $sub_slice = $slice->sub_Slice( $current_start, $current_end );
+		my %num = ();
+		
+		## count genes by type
+		my $genes;
+		eval { $genes = $sub_slice->get_all_Genes; };
+		if ($@) {
+		    warn $@;
+		    $current_start = $current_end + 1;
+		    next;
+		}
+		foreach my $gene (@{$genes}) {
+		    ## only count genes that don't overlap the subslice start
+		    ## (since these were already counted in the last bin)
+		    if ($gene->start >= 1) {
+			$total{$gene->type}++;
+		    }
+		    $num{$gene_types{$gene->type}}++;
+		}
+		
+		## create DensityFeature objects for each type
+		foreach my $type (keys %density_types) {
+		    
+		    push @density_features, Bio::EnsEMBL::DensityFeature->new
+			(-seq_region    => $slice,
+			 -start         => $current_start,
+			 -end           => $current_end,
+			 -density_type  => $dtcache{$block_size}{$type},
+			 -density_value => $num{$type} ||0
+			);
+		}
+		$current_start = $current_end + 1;
+		
+		## logging
+		print STDERR "Chr: $chr | Bin: $i/$bins | Counts: ";
+		print STDERR join(",", map { $num{$gene_types{$_}} || 0 }
+				  sort keys %gene_types);
+		print STDERR " | ";
+		print STDERR "Mem: " . `ps $$ -o vsz |tail -1`;
+	    }
+	    
+	    
+	    ## store DensityFeatures for the chromosome
+	    $dfa->store(@density_features) unless $dry;
+	    
+	    ## stats
+	    my @attribs;
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '12:Known genes',
+		 -CODE => 'KnownGeneCount',
+		 -VALUE => $total{'Known'} || 0,
+		 -DESCRIPTION => 'Total Number of Known genes');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '14:Novel CDS',
+		 -CODE => 'NovelCDSCount',
+		 -VALUE => $total{'Novel_CDS'} || 0,
+		 -DESCRIPTION => 'Total Number of Novel CDSs');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '16:Novel transcripts',
+		 -CODE => 'NovelTransCount',
+		 -VALUE => $total{'Novel_Transcript'} || 0,
+		 -DESCRIPTION => 'Total Number of Novel transcripts');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '24:Putative transcripts',
+		 -CODE => 'PutTransCount',
+		 -VALUE => $total{'Putative'} || 0,
+		 -DESCRIPTION => 'Total Number of Putative transcripts');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '30:Predicted transcripts',
+		 -CODE => 'PredTransCount',
+		 -VALUE => $total{'Predicted_Gene'} || 0,
+		 -DESCRIPTION => 'Total Number of Predicted transcripts');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '26:Ig segments',
+		 -CODE => 'IgSegCount',
+		 -VALUE => $total{'Ig_Segment'} || 0,
+		 -DESCRIPTION => 'Total Number of Ig Segments');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '28:Ig pseudogene Segments',
+		 -CODE => 'IgPsSegCount',
+		 -VALUE => $total{'Ig_Pseudogene_Segment'} || 0,
+		 -DESCRIPTION => 'Total Number of Ig Pseudogene Segments');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '18:Total pseudogenes',
+		 -CODE => 'TotPsCount',
+		 -VALUE => $total{'Pseudogene'}
+		 + $total{'Processed_pseudogene'}
+		 + $total{'Unprocessed_pseudogene' || 0},
+		 -DESCRIPTION => 'Total Number of Pseudogenes');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '20:Processed pseudogenes',
+		 -CODE => 'ProcPsCount',
+		 -VALUE => $total{'Processed_pseudogene'} || 0,
+		 -DESCRIPTION => 'Number of Processed pseudogenes');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '22:Unprocessed pseudogenes',
+		 -CODE => 'UnprocPsCount',
+		 -VALUE => $total{'Unprocessed_pseudogene'} || 0,
+		 -DESCRIPTION => 'Number of Unprocessed pseudogenes');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '13:Known genes (in progress)',
+		 -CODE => 'KnwnprogCount',
+		 -VALUE => $total{'Known_in_progress'} || 0,
+		 -DESCRIPTION => 'Number of Known Genes in progress');
+	    
+	    push @attribs, Bio::EnsEMBL::Attribute->new
+		(-NAME => '15:Novel CDS (in progress)',
+		 -CODE => 'NovCDSprogCount',
+		 -VALUE => $total{'Novel_CDS_in_progress'} || 0,
+		 -DESCRIPTION => 'Number of novel CDS in progress');
+	    
+	    #only store unclassified pseudogenes if there are no processed and unprocessed pseudos, ie if
+	    #total pseudos eq pseudos
+	    unless ($total{'Unprocessed_pseudogene'} == 0 && $total{'Processed_pseudogene'} == 0) {  
+		push @attribs, Bio::EnsEMBL::Attribute->new
+		    (-NAME => '23:Unclassified pseudogenes',
+		     -CODE => 'UnclassPsCount',
+		     -VALUE => $total{'Pseudogene'} || 0,
+		     -DESCRIPTION => 'Number of Unclassified pseudogenes');
+	    }
+	    
+	    $attrib_adaptor->store_on_Slice($slice, \@attribs) unless $dry;
+	    
+	    print STDERR "Total for chr $chr:\n";
+	    print STDERR map { "\t$_ => $total{$_}\n" } sort keys %total;
+	}
    }
-
-
-    ## store DensityFeatures for the chromosome
-    $dfa->store(@density_features) unless $dry;
-
-    ## stats
-    my @attribs;
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '12:Known genes',
-     -CODE => 'KnownGeneCount',
-     -VALUE => $total{'Known'} || 0,
-     -DESCRIPTION => 'Total Number of Known genes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '14:Novel CDS',
-     -CODE => 'NovelCDSCount',
-     -VALUE => $total{'Novel_CDS'} || 0,
-     -DESCRIPTION => 'Total Number of Novel CDSs');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '16:Novel transcripts',
-     -CODE => 'NovelTransCount',
-     -VALUE => $total{'Novel_Transcript'} || 0,
-     -DESCRIPTION => 'Total Number of Novel transcripts');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '24:Putative transcripts',
-     -CODE => 'PutTransCount',
-     -VALUE => $total{'Putative'} || 0,
-     -DESCRIPTION => 'Total Number of Putative transcripts');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '30:Predicted transcripts',
-     -CODE => 'PredTransCount',
-     -VALUE => $total{'Predicted_Gene'} || 0,
-     -DESCRIPTION => 'Total Number of Predicted transcripts');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '26:Ig segments',
-     -CODE => 'IgSegCount',
-     -VALUE => $total{'Ig_Segment'} || 0,
-     -DESCRIPTION => 'Total Number of Ig Segments');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '28:Ig pseudogene Segments',
-     -CODE => 'IgPsSegCount',
-     -VALUE => $total{'Ig_Pseudogene_Segment'} || 0,
-     -DESCRIPTION => 'Total Number of Ig Pseudogene Segments');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '18:Total pseudogenes',
-     -CODE => 'TotPsCount',
-     -VALUE => $total{'Pseudogene'}
-               + $total{'Processed_pseudogene'}
-               + $total{'Unprocessed_pseudogene' || 0},
-     -DESCRIPTION => 'Total Number of Pseudogenes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '23:Unclassified pseudogenes',
-     -CODE => 'UnclassPsCount',
-     -VALUE => $total{'Pseudogene'} || 0,
-     -DESCRIPTION => 'Number of Unclassified pseudogenes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '20:Processed pseudogenes',
-     -CODE => 'ProcPsCount',
-     -VALUE => $total{'Processed_pseudogene'} || 0,
-     -DESCRIPTION => 'Number of Processed pseudogenes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '22:Unprocessed pseudogenes',
-     -CODE => 'UnprocPsCount',
-     -VALUE => $total{'Unprocessed_pseudogene'} || 0,
-     -DESCRIPTION => 'Number of Unprocessed pseudogenes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '13:Known genes (in progress)',
-     -CODE => 'KnwnprogCount',
-     -VALUE => $total{'Known_in_progress'} || 0,
-     -DESCRIPTION => 'Number of Known Genes in progress');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => '15:Novel CDS (in progress)',
-     -CODE => 'NovCDSprogCount',
-     -VALUE => $total{'Novel_CDS_in_progress'} || 0,
-     -DESCRIPTION => 'Number of novel CDS in progress');
-
-    $attrib_adaptor->store_on_Slice($slice, \@attribs) unless $dry;
-
-    print STDERR "Total for chr $chr:\n";
-    print STDERR map { "\t$_ => $total{$_}\n" } sort keys %total;
-
 }
-
 print STDERR "\nAll done at " . `date` . "\n";

--- a/misc-scripts/density_feature/vega_percent_gc_calc.pl
+++ b/misc-scripts/density_feature/vega_percent_gc_calc.pl
+#!/usr/bin/perl -w
+
+#
+# Calculate the GC content for top level seq_regions
+#   small regions 500bp to be able to display on contigview
+#   big regions genomesize / 4000 for 4000 features on the genome
+
+use strict;
+BEGIN {
+    $ENV{'ENSEMBL_SERVERROOT'} = "../../..";
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live");
+}
+
+use SiteDefs;
+use EnsWeb;
+use EnsEMBL::DB::Core;
+use Getopt::Long;
+use Bio::EnsEMBL::DensityType;
+use Bio::EnsEMBL::DensityFeature;
+use POSIX;
+
+use Data::Dumper;
+
+my ($species, $dry, $help);
+&GetOptions(
+    "species=s" => \$species,
+    "dry-run"   => \$dry,
+    "n"         => \$dry,
+    "help"      => \$help,
+    "h"         => \$help,
+);
+
+if($help || !$species){
+    print qq(Usage:
+    ./vega_gene_density.pl
+        --species=Homo_sapiens
+        [--dry-run|-n]
+        [--help|-h]\n\n);
+    exit;
+}
+
+$ENV{'ENSEMBL_SPECIES'} = $species;
+
+## set db user/pass to allow write access
+my $db_ref = $EnsWeb::species_defs->databases;
+$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER;
+$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS;
+
+## connect to databases
+my $databases = &EnsEMBL::DB::Core::get_databases(qw(core));
+my $db = $databases->{'core'};
+
+die "Problem connecting to databases: $databases->{'error'}\n"
+    if  $databases->{'error'} ;
+warn "Database error: $databases->{'non_fatal_error'}\n"
+    if $databases->{'non_fatal_error'};
+
+#
+# Get the adaptors needed;
+#
+my $slice_adaptor = $db->get_SliceAdaptor();
+my $dfa = $db->get_DensityFeatureAdaptor();
+my $dta = $db->get_DensityTypeAdaptor();
+my $aa  = $db->get_AnalysisAdaptor();
+
+my $top_slices = $slice_adaptor->fetch_all( "toplevel" );
+
+my $big_chr = [];
+my $small_chr = [];
+
+my (@big_chr_names, $big_block_size, $min_big_chr);
+my (@small_chr_names, $small_block_size, $min_small_chr);
+for my $slice ( @$top_slices ) {
+    if ($slice->length < 5000000) {
+	if (! $min_small_chr or ($min_small_chr > $slice->length)) {
+	    $min_small_chr = $slice->length;
+	}
+	push @small_chr_names, $slice->seq_region_name;
+	push @{$small_chr->[0]}, $slice;
+    }
+    if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) {
+	$min_big_chr = $slice->length;
+    }
+    push @big_chr_names, $slice->seq_region_name;
+    push @{$big_chr->[0]}, $slice;
+}
+
+$big_block_size = int( $min_big_chr / 150 );
+$small_block_size = int( $min_small_chr / 150 );
+push @{$big_chr}, $big_block_size;
+push @{$small_chr}, $small_block_size;
+
+
+print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n";
+print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n";
+
+#
+# Create new analysis object for density calculation.
+#
+my $analysis = new Bio::EnsEMBL::Analysis (-program     => "percent_gc_calc.pl",
+					   -database    => "ensembl",
+					   -gff_source  => "percent_gc_calc.pl",
+					   -gff_feature => "density",
+					   -logic_name  => "PercentGC");
+$aa->store($analysis) unless $dry;
+
+#
+# Create new density types
+#
+if ($small_block_size) {
+    my $small_density_type = Bio::EnsEMBL::DensityType->new
+	(-analysis   => $analysis,
+	 -block_size => $small_block_size,
+	 -value_type => 'ratio');
+    $dta->store($small_density_type) unless $dry;
+    push @{$small_chr}, $small_density_type;
+}
+
+if ($big_block_size) {
+    my $big_density_type = Bio::EnsEMBL::DensityType->new
+	(-analysis   => $analysis,
+	 -block_size => $big_block_size,
+	 -value_type => 'ratio');
+    $dta->store($big_density_type) unless $dry;
+    push @{$big_chr}, $big_density_type;
+}
+
+my ( $current_start, $current_end );
+
+$Data::Dumper::Maxdepth = 2;
+
+my $types = [];
+foreach my $object ( $big_chr, $small_chr) {
+    eval {
+	my $block_size = $object->[1];
+	foreach my $slice (@{$object->[0]}){
+	    $current_start = 1;
+	    warn "Chromosome ",$slice->seq_region_name;
+	    while($current_start <= $slice->end()) {
+		$current_end = $current_start+$block_size-1;
+		if( $current_end > $slice->end() ) {
+		    $current_end = $slice->end();
+		}
+		my $sub_slice = $slice->sub_Slice( $current_start, $current_end );
+		warn "start = $current_start, end = $current_end\n";
+		my $gc = $sub_slice->get_base_count()->{'%gc'};
+		my $df =  Bio::EnsEMBL::DensityFeature->new
+		    (-seq_region    => $slice,
+		     -start         => $current_start,
+		     -end           => $current_end,
+		     -density_type  => $object->[2],
+		     -density_value => $gc);
+		$dfa->store($df) unless $dry;
+		$current_start = $current_end+1;
+	    }
+	}
+    };
+}
+
+#
+# helper to draw an ascii representation of the density features
+#
+sub print_features {
+    my $features = shift;
+  
+    return if(!@$features);
+    
+    my $sum = 0;
+    my $length = 0;
+    #  my $type = $features->[0]->{'density_type'}->value_type();
+    
+    print("\n");
+    my $max=0;
+    foreach my $f (@$features) {
+	if($f->density_value() > $max){
+	    $max=$f->density_value();
+	}
+    }
+    foreach my $f (@$features) {
+	my $i=1;
+	for(; $i< ($f->density_value()/$max)*40; $i++){
+	    print "*";
+	}
+	for(my $j=$i;$j<40;$j++){
+	    print " ";
+	}
+	print "  ".$f->density_value()."\t".$f->start()."\n";
+    }
+}
--- a/misc-scripts/density_feature/vega_repeat_coverage_calc.pl
+++ b/misc-scripts/density_feature/vega_repeat_coverage_calc.pl
+#!/usr/bin/perl -w
+#
+# Calculate the repeat coverage for given database.
+# condition: 1k blocks to show contigview displays
+#  4000 blocks for a whole genome
+#
+# checks wether database contains repeats before doing anything
+use strict;
+BEGIN {
+    $ENV{'ENSEMBL_SERVERROOT'} = "../../..";
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live");
+}
+
+use SiteDefs;
+use EnsWeb;
+use EnsEMBL::DB::Core;
+use Getopt::Long;
+use Bio::EnsEMBL::DensityType;
+use Bio::EnsEMBL::DensityFeature;
+use Bio::EnsEMBL::Mapper::RangeRegistry;
+use POSIX;
+
+use Data::Dumper;
+
+my ($species, $dry, $help);
+&GetOptions(
+    "species=s" => \$species,
+    "dry-run"   => \$dry,
+    "n"         => \$dry,
+    "help"      => \$help,
+    "h"         => \$help,
+);
+
+if($help || !$species){
+    print qq(Usage:
+    ./vega_gene_density.pl
+        --species=Homo_sapiens
+        [--dry-run|-n]
+        [--help|-h]\n\n);
+    exit;
+}
+
+$ENV{'ENSEMBL_SPECIES'} = $species;
+
+## set db user/pass to allow write access
+my $db_ref = $EnsWeb::species_defs->databases;
+$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER;
+$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS;
+
+## connect to databases
+my $databases = &EnsEMBL::DB::Core::get_databases(qw(core));
+my $db = $databases->{'core'};
+
+die "Problem connecting to databases: $databases->{'error'}\n"
+    if  $databases->{'error'} ;
+warn "Database error: $databases->{'non_fatal_error'}\n"
+    if $databases->{'non_fatal_error'};
+
+#
+# Get the adaptors needed;
+#
+my $slice_adaptor = $db->get_SliceAdaptor();
+my $dfa = $db->get_DensityFeatureAdaptor();
+my $dta = $db->get_DensityTypeAdaptor();
+my $aa  = $db->get_AnalysisAdaptor();
+
+my $top_slices = $slice_adaptor->fetch_all( "toplevel" );
+my $big_chr = [];
+my $small_chr = [];
+my (@big_chr_names, $big_block_size, $min_big_chr);
+my (@small_chr_names, $small_block_size, $min_small_chr);
+for my $slice ( @$top_slices ) {
+    if ($slice->length < 5000000) {
+	if (! $min_small_chr or ($min_small_chr > $slice->length)) {
+	    $min_small_chr = $slice->length;
+	}
+	push @small_chr_names, $slice->seq_region_name;
+	push @{$small_chr->[0]}, $slice;
+    }
+    if (! $min_big_chr or ($min_big_chr > $slice->length) && $slice->length > 5000000) {
+	$min_big_chr = $slice->length;
+    }
+    push @big_chr_names, $slice->seq_region_name;
+    push @{$big_chr->[0]}, $slice;
+}
+
+
+$big_block_size = int( $min_big_chr / 150 );
+$small_block_size = int( $min_small_chr / 150 );
+push @{$big_chr}, $big_block_size;
+push @{$small_chr}, $small_block_size;
+
+print STDERR "\nAvailable chromosomes using block size of $big_block_size: @big_chr_names\n";
+print STDERR "\nAvailable chromosomes using block size of $small_block_size: @small_chr_names\n";
+
+#
+# Create new analysis object for density calculation.
+#
+my $analysis = new Bio::EnsEMBL::Analysis (-program     => "repeat_coverage_calc.pl",
+					   -database    => "ensembl",
+					   -gff_source  => "repeat_coverage_calc.pl",
+					   -gff_feature => "density",
+					   -logic_name  => "PercentageRepeat");
+$aa->store($analysis) unless $dry;
+
+if ($small_block_size) {
+    my $small_density_type = Bio::EnsEMBL::DensityType->new
+	(-analysis   => $analysis,
+	 -block_size => $small_block_size,
+	 -value_type => 'ratio');
+    $dta->store($small_density_type) unless $dry;
+    push @{$small_chr}, $small_density_type;
+}
+
+if ($big_block_size) {
+    my $big_density_type = Bio::EnsEMBL::DensityType->new
+	(-analysis   => $analysis,
+	 -block_size => $big_block_size,
+	 -value_type => 'ratio');
+    $dta->store($big_density_type) unless $dry;
+    push @{$big_chr}, $big_density_type;
+}
+
+my ( $current_start, $current_end );
+
+foreach my $object ($big_chr, $small_chr) {
+    eval {
+	my $block_size = $object->[1];
+	foreach my $slice ( @{$object->[0]} ) {
+	    $current_start = 1;
+	    warn "Chromosome ", $slice->seq_region_name;
+	    while($current_start <= $slice->end()) {
+		$current_end = $current_start+$block_size-1;
+		if( $current_end > $slice->end() ) {
+		    $current_end = $slice->end();
+		}
+		my $this_block_size = $current_end - $current_start + 1;
+		my $sub_slice = $slice->sub_Slice( $current_start, $current_end );
+		warn "start = $current_start, end = $current_end\n";
+		my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new();
+		foreach my $repeat (@{$sub_slice->get_all_RepeatFeatures()}){
+		    $rr->check_and_register("1",$repeat->start,$repeat->end);
+		}
+		my $count = 0;
+		my $non_repeats = $rr->check_and_register("1",1,$this_block_size);
+		if( defined $non_repeats ) {
+		    foreach my $non_repeat ( @$non_repeats ) {
+			$count += ($non_repeat->[1]-$non_repeat->[0])+1;
+		    }
+		}
+		my $percentage_repeat = (($this_block_size-$count)/$this_block_size)*100;
+		my $df = Bio::EnsEMBL::DensityFeature->new
+		    (-seq_region    => $slice,
+		     -start         => $current_start,
+		     -end           => $current_end,
+		     -density_type  => $object->[2],
+		     -density_value => $percentage_repeat);
+		$dfa->store($df) unless $dry;
+		$current_start = $current_end + 1;
+	    }
+	}
+    };
+}
+
+#
+# helper to draw an ascii representation of the density features
+#
+sub print_features {
+  my $features = shift;
+
+  return if(!@$features);
+
+  my $sum = 0;
+  my $length = 0;
+#  my $type = $features->[0]->{'density_type'}->value_type();
+
+  print("\n");
+  my $max=0;
+  foreach my $f (@$features) {
+    if($f->density_value() > $max){
+      $max=$f->density_value();
+    }
+  }
+  foreach my $f (@$features) {
+    my $i=1;
+    for(; $i< ($f->density_value()/$max)*40; $i++){
+      print "*";
+    }
+    for(my $j=$i;$j<40;$j++){
+      print " ";
+    }
+    print "  ".$f->density_value()."\t".$f->start()."\n";
+  }
+}
--- a/misc-scripts/repeats/vega_repeat_libraries.pl
+++ b/misc-scripts/repeats/vega_repeat_libraries.pl
@@ -47,7 +47,7 @@ my $dbh = DBI->connect( $dsn, $user, $pass );

 my @dbnames = map {$_->[0] } @{ $dbh->selectall_arrayref( "show databases" ) };

-my @dbs = grep {$_ =~ /$dbpattern/} @dbnames;
+my @dbs = grep {$_ =~ /^$dbpattern$/} @dbnames;

 foreach my $db (@dbs) {
  open RFILE, $repeatfile or die("Could not open repeat file $repeatfile");