added glovar SNP stats

d528572a · Patrick Meidl · 783bd998 · d528572a
Commit d528572a authored 20 years ago by Patrick Meidl
--- a/misc-scripts/density_feature/vega_seq_region_stats.pl
+++ b/misc-scripts/density_feature/vega_seq_region_stats.pl
-use strict;
+#!/usr/local/bin/perl

-use lib '../../modules/','../../../bioperl-live';
+=head1 NAME

-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-use Bio::EnsEMBL::Lite::DBAdaptor;
-use Getopt::Long;
+vega_seq_region_stats.pl -
+script to calculate gene and snp numbers for mapview stats

-my ( $host, $user, $pass, $port, $dbname  );
+=head1 SYNOPSIS


-GetOptions( "host=s", \$host,
-	    "user=s", \$user,
-	    "pass=s", \$pass,
-	    "port=i", \$port,
-	    "dbname=s", \$dbname
-	  );
-my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host,
-					    -user => $user,
-					    -port => $port,
-					    -pass => $pass,
-					    -dbname => $dbname);
+=head1 DESCRIPTION

-#
-# Only run on database with genes
-#
-my $sth = $db->prepare( "select count(*) from gene" );
-$sth->execute();

-my ( $gene_count )  = $sth->fetchrow_array();
+=head1 LICENCE

-if( ! $gene_count ) {
-    print STDERR "No gene density for $dbname.\n";
-    exit();
-}
+This code is distributed under an Apache style licence:
+Please see http://www.ensembl.org/code_licence.html for details

-#
-# and seq_regions
-#
-$sth = $db->prepare( "select count(*) from seq_region" );
-$sth->execute();
-my ( $seq_region_count ) = $sth->fetchrow_array();
-if( ! $seq_region_count ) {
-    print STDERR "No seq_regions for $dbname.\n";
-    exit();
-}
+=head1 AUTHOR

+Patrick Meidl <pm2@sanger.ac.uk>

-#my $snps_present = lite_attach( $db );
-my $snps_present = 0;
+=head1 CONTACT

+Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk

-my $slice_adaptor = $db->get_SliceAdaptor();
-my $attrib_adaptor = $db->get_AttributeAdaptor();
+=cut
+
+use strict;
+BEGIN {
+    $ENV{'ENSEMBL_SERVERROOT'} = "../../..";
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules");
+    unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live");
+}

-my $top_slices = $slice_adaptor->fetch_all( "toplevel" );
+use SiteDefs;
+use EnsWeb;
+use EnsEMBL::DB::Core;
+use Getopt::Long;

+my ($species, $run, $help);
+&GetOptions(
+    "species=s" => \$species,
+    "run=s"	=> \$run,
+    "help"      => \$help,
+    "h"         => \$help,
+);
+
+if($help || !($species && $run)){
+    print qq(Usage:
+    ./vega_seq_region_stats.pl
+        --species=Homo_sapiens
+        --run=gene,snp
+        [--help]\n\n);
+    exit;
+}

-foreach my $slice (@$top_slices) {
-    my $num_known_genes  = 0;
-    my $num_nov_CDS      = 0;
-    my $num_nov_trans    = 0;
-    my $num_tot_pseudo_genes = 0;
-    my $num_unclass_pseudo_genes = 0;
-    my $num_proc_pseudo_genes = 0;
-    my $num_unproc_pseudo_genes = 0;
-    my $num_put_trans = 0;
-    my $num_pred_trans = 0;
-    my $num_pred_Ig_pseudogenes = 0;
-    my $num_Ig_segments = 0;
-    
-    print STDERR "Processing seq_region ", $slice->seq_region_name(), "\n";
-
-    my @genes = @{$slice->get_all_Genes()};
-
-    foreach my $gene (@genes) {
-	my $type = $gene->type;
-	if ($type eq 'Known') {
-	    $num_known_genes++;
-	} elsif ($type eq 'Novel_CDS') {
-	    $num_nov_CDS++;
-	} elsif ($type eq 'Novel_Transcript') {
-	    $num_nov_trans++;
-	} elsif ($type eq 'Putative') {
-	    $num_put_trans++;
-	} elsif ($type eq 'Predicted_Gene') {
-	    $num_pred_trans++;
-	} elsif ($type eq 'Ig_Pseudogene_Segment') {
-	    $num_pred_Ig_pseudogenes++;
-	} elsif ($type eq 'Ig_Segment') {
-	    $num_Ig_segments++;
-	} elsif ($type eq 'Pseudogene') {
-	    $num_unclass_pseudo_genes++;
-	} elsif ($type eq 'Processed_pseudogene') {
-	    $num_proc_pseudo_genes++;
-	} elsif ($type eq 'Unprocessed_pseudogene') {
-	    $num_unproc_pseudo_genes++;
-	}
+$ENV{'ENSEMBL_SPECIES'} = $species;

-	if ($type =~ /seudogene$/) {
-	    $num_tot_pseudo_genes++;
-	}
-    }
+## set db user/pass to allow write access
+my $db_ref = $EnsWeb::species_defs->databases;
+$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER;
+$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS;

-    print "Slice", $slice->seq_region_name(), " has the following features:\n\n";
-    print "known genes = $num_known_genes\n";
-    print "novel coding sequences = $num_nov_CDS\n";
-    print "novel transcripts = $num_nov_trans\n";
-    print "putative transcripts = $num_put_trans\n";
-    print "predicted transcripts = $num_pred_trans\n"; 
-    print "total number of pseudogenes = $num_tot_pseudo_genes\n";
-    print "\tunclassified pseudogenes = $num_unclass_pseudo_genes\n";
-    print "\tprocessed pseudogenes = $num_proc_pseudo_genes\n";
-    print "\tunprocessed pseudogenes = $num_unproc_pseudo_genes\n";
-    print "Ig Pseudogenes = $num_pred_Ig_pseudogenes\n";
-    print "Ig Segments = $num_Ig_segments\n\n\n";
+## connect to databases
+my %run;
+@run{split(/,/, $run)} = 1;
+my @dbs = qw(core);
+push @dbs, 'glovar' if ($run{'snp'});
+my $databases = &EnsEMBL::DB::Core::get_databases(@dbs);

+die "Problem connecting to databases: $databases->{'error'}\n"
+    if  $databases->{'error'} ;
+warn "Database error: $databases->{'non_fatal_error'}\n"
+    if $databases->{'non_fatal_error'};
+
+## get slice and attribute adaptors, loop over all toplevel slices
+my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor();
+my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor();
+my $top_slices = $slice_adaptor->fetch_all("toplevel");
+
+foreach my $slice (@$top_slices) {
+    my %num;
    my @attribs;
+    
+    print STDERR "Processing seq_region ", $slice->seq_region_name(), ":\n";
+
+    ## genes
+    if ($run{'gene'}) {
+        print STDERR "\tGenes...\n";
+        my @genes = @{$slice->get_all_Genes()};
+        foreach my $gene (@genes) {
+            my $type = $gene->type;
+            $num{$type}++;
+            if ($type =~ /seudogene$/) {
+                $num{'Total_Pseudogenes'}++;
+            }
+        }
+
+        print STDERR "Slice", $slice->seq_region_name(),
+            " has the following features:\n\n";
+        foreach my $type (keys %num) {
+            print STDERR "$type = $num{$type}\n";
+        }
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Known genes',
+         -CODE => 'KnownGeneCount',
+         -VALUE => $num{'Known'},
+         -DESCRIPTION => 'Total Number of Known genes');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Novel CDS',
+         -CODE => 'NovelCDSCount',
+         -VALUE => $num{'Novel_CDS'},
+         -DESCRIPTION => 'Total Number of Novel CDSs');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Novel transcripts',
+         -CODE => 'NovelTransCount',
+         -VALUE => $num{'Novel_Transcript'},
+         -DESCRIPTION => 'Total Number of Novel transcripts');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Putative transcripts',
+         -CODE => 'PutTransCount',
+         -VALUE => $num{'Putative'},
+         -DESCRIPTION => 'Total Number of Putative transcripts');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Predicted transcripts',
+         -CODE => 'PredTransCount',
+         -VALUE => $num{'Predicted_Gene'},
+         -DESCRIPTION => 'Total Number of Predicted transcripts');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Total Pseudogenes',
+         -CODE => 'TotPsCount',
+         -VALUE => $num{'Total_Pseudogenes'},
+         -DESCRIPTION => 'Total Number of Pseudogenes');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Unclassified pseudogenes',
+         -CODE => 'UnclassPsCount',
+         -VALUE => $num{'Pseudogene'},
+         -DESCRIPTION => 'Number of Unclassified pseudogenes');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Processed pseudogenes',
+         -CODE => 'ProcPsCount',
+         -VALUE => $num{'Processed_pseudogene'},
+         -DESCRIPTION => 'Number of Processed pseudogenes');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Unprocessed pseudogenes',
+         -CODE => 'UnprocPsCount',
+         -VALUE => $num{'Unprocessed_pseudogene'},
+         -DESCRIPTION => 'Number of Unprocessed pseudogenes');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Ig Segments',
+         -CODE => 'IgSegCount',
+         -VALUE => $num{'Ig_Segment'},
+         -DESCRIPTION => 'Total Number of Ig Segments');
+
+        push @attribs, Bio::EnsEMBL::Attribute->new
+        (-NAME => 'Ig Pseudogene Segments',
+         -CODE => 'IgPsSegCount',
+         -VALUE => $num{'Ig_Pseudogene_Segment'},
+         -DESCRIPTION => 'Total Number of Ig Pseudogene Segments');
+    }

-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Known Genes',
-     -CODE => 'KnownGeneCount',
-     -VALUE => $num_known_genes,
-     -DESCRIPTION => 'Total Number of Known Genes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Novel CDS',
-     -CODE => 'NovelCDSCount',
-     -VALUE => $num_nov_CDS,
-     -DESCRIPTION => 'Total Number of Novel CDSs');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Novel transcripts',
-     -CODE => 'NovelTransCount',
-     -VALUE => $num_nov_trans,
-     -DESCRIPTION => 'Total Number of Novel Transcripts');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Putative transcripts',
-     -CODE => 'PutTransCount',
-     -VALUE => $num_put_trans,
-     -DESCRIPTION => 'Total Number of Putative Transcripts');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Predicted transcripts',
-     -CODE => 'PredTransCount',
-     -VALUE => $num_pred_trans,
-     -DESCRIPTION => 'Total Number of Predicted Transcripts');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Total Pseudogenes',
-     -CODE => 'TotPsCount',
-     -VALUE => $num_tot_pseudo_genes,
-     -DESCRIPTION => 'Total Number of Pseudogenes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Unclassified Pseudogenes',
-     -CODE => 'UnclassPsCount',
-     -VALUE => $num_unclass_pseudo_genes,
-     -DESCRIPTION => 'Number of Unclassified Pseudogenes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Processed Pseudogenes',
-     -CODE => 'ProcPsCount',
-     -VALUE => $num_proc_pseudo_genes,
-     -DESCRIPTION => 'Number of Processed Pseudogenes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Unprocessed Pseudogenes',
-     -CODE => 'UnprocPsCount',
-     -VALUE => $num_unproc_pseudo_genes,
-     -DESCRIPTION => 'Number of Unprocessed Pseudogenes');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Ig Segments',
-     -CODE => 'IgSegCount',
-     -VALUE => $num_Ig_segments,
-     -DESCRIPTION => 'Total Number of Ig Segments');
-
-    push @attribs, Bio::EnsEMBL::Attribute->new
-    (-NAME => 'Ig Pseudogene Segments',
-     -CODE => 'IgPsSegCount',
-     -VALUE => $num_pred_Ig_pseudogenes,
-     -DESCRIPTION => 'Total Number of Ig Pseudogene Segments');
-
-    if( $snps_present ) {
-	my $snps = $slice->get_all_SNPs();
+    ## snps
+    if ($run{'snp'}) {
+        print STDERR "\tSNPs...\n";
+	my $snps = $slice->get_all_ExternalFeatures('GlovarSNP');
 	push @attribs, Bio::EnsEMBL::Attribute->new
-	(-NAME => 'SNP Count',
+	(-NAME => 'SNPs',
 	 -CODE => 'SNPCount',
 	 -VALUE => scalar( @$snps ),
 	 -DESCRIPTION => 'Total Number of SNPs');
@@ -209,43 +204,6 @@ sub print_chromo_stats {
    }
 }

-
-#
-# tries to attach lite.
-#
-
-sub lite_attach {
-    my $db = shift;
-    
-    my $core_db_name;
-    $core_db_name = $db->dbname();
-    if( $core_db_name !~ /_core_/ ) {
-	return 0;
-    }
-    #
-    # get a lost of all databases on that server
-    #
-    my $sth = $db->prepare( "show databases" );
-    $sth->execute();
-    my $all_db_names = $sth->fetchall_arrayref();
-    my %all_db_names = map {( $_->[0] , 1)} @$all_db_names;
-    my $snp_db_name = $core_db_name;
-    $snp_db_name =~ s/_core_/_lite_/;
-    if( ! exists $all_db_names{ $snp_db_name } ) {
-	return 0;
-    }
-    
-    my $snp_db = Bio::EnsEMBL::Lite::DBAdaptor->new
-    ( -host => $db->host(),
-      -user => $db->username(),
-      -pass => $db->password(),
-      -port => $db->port(),
-      -dbname => $snp_db_name );
-    $db->add_db_adaptor( "lite", $snp_db );
-    return 1;
-}
-
-
 1;