diff --git a/misc-scripts/density_feature/vega_seq_region_stats.pl b/misc-scripts/density_feature/vega_seq_region_stats.pl index 9ff66407b059f80c686f165e7fc693beb093f4af..00a96e68affc4ee4e1612dace8807cff047938f9 100644 --- a/misc-scripts/density_feature/vega_seq_region_stats.pl +++ b/misc-scripts/density_feature/vega_seq_region_stats.pl @@ -1,192 +1,187 @@ -use strict; +#!/usr/local/bin/perl -use lib '../../modules/','../../../bioperl-live'; +=head1 NAME -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::Lite::DBAdaptor; -use Getopt::Long; +vega_seq_region_stats.pl - +script to calculate gene and snp numbers for mapview stats -my ( $host, $user, $pass, $port, $dbname ); +=head1 SYNOPSIS -GetOptions( "host=s", \$host, - "user=s", \$user, - "pass=s", \$pass, - "port=i", \$port, - "dbname=s", \$dbname - ); -my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host, - -user => $user, - -port => $port, - -pass => $pass, - -dbname => $dbname); +=head1 DESCRIPTION -# -# Only run on database with genes -# -my $sth = $db->prepare( "select count(*) from gene" ); -$sth->execute(); -my ( $gene_count ) = $sth->fetchrow_array(); +=head1 LICENCE -if( ! $gene_count ) { - print STDERR "No gene density for $dbname.\n"; - exit(); -} +This code is distributed under an Apache style licence: +Please see http://www.ensembl.org/code_licence.html for details -# -# and seq_regions -# -$sth = $db->prepare( "select count(*) from seq_region" ); -$sth->execute(); -my ( $seq_region_count ) = $sth->fetchrow_array(); -if( ! $seq_region_count ) { - print STDERR "No seq_regions for $dbname.\n"; - exit(); -} +=head1 AUTHOR +Patrick Meidl <pm2@sanger.ac.uk> -#my $snps_present = lite_attach( $db ); -my $snps_present = 0; +=head1 CONTACT +Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk -my $slice_adaptor = $db->get_SliceAdaptor(); -my $attrib_adaptor = $db->get_AttributeAdaptor(); +=cut + +use strict; +BEGIN { + $ENV{'ENSEMBL_SERVERROOT'} = "../../.."; + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/conf"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-compara/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-draw/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-external/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl-otter/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/ensembl/modules"); + unshift(@INC,"$ENV{'ENSEMBL_SERVERROOT'}/bioperl-live"); +} -my $top_slices = $slice_adaptor->fetch_all( "toplevel" ); +use SiteDefs; +use EnsWeb; +use EnsEMBL::DB::Core; +use Getopt::Long; +my ($species, $run, $help); +&GetOptions( + "species=s" => \$species, + "run=s" => \$run, + "help" => \$help, + "h" => \$help, +); + +if($help || !($species && $run)){ + print qq(Usage: + ./vega_seq_region_stats.pl + --species=Homo_sapiens + --run=gene,snp + [--help]\n\n); + exit; +} -foreach my $slice (@$top_slices) { - my $num_known_genes = 0; - my $num_nov_CDS = 0; - my $num_nov_trans = 0; - my $num_tot_pseudo_genes = 0; - my $num_unclass_pseudo_genes = 0; - my $num_proc_pseudo_genes = 0; - my $num_unproc_pseudo_genes = 0; - my $num_put_trans = 0; - my $num_pred_trans = 0; - my $num_pred_Ig_pseudogenes = 0; - my $num_Ig_segments = 0; - - print STDERR "Processing seq_region ", $slice->seq_region_name(), "\n"; - - my @genes = @{$slice->get_all_Genes()}; - - foreach my $gene (@genes) { - my $type = $gene->type; - if ($type eq 'Known') { - $num_known_genes++; - } elsif ($type eq 'Novel_CDS') { - $num_nov_CDS++; - } elsif ($type eq 'Novel_Transcript') { - $num_nov_trans++; - } elsif ($type eq 'Putative') { - $num_put_trans++; - } elsif ($type eq 'Predicted_Gene') { - $num_pred_trans++; - } elsif ($type eq 'Ig_Pseudogene_Segment') { - $num_pred_Ig_pseudogenes++; - } elsif ($type eq 'Ig_Segment') { - $num_Ig_segments++; - } elsif ($type eq 'Pseudogene') { - $num_unclass_pseudo_genes++; - } elsif ($type eq 'Processed_pseudogene') { - $num_proc_pseudo_genes++; - } elsif ($type eq 'Unprocessed_pseudogene') { - $num_unproc_pseudo_genes++; - } +$ENV{'ENSEMBL_SPECIES'} = $species; - if ($type =~ /seudogene$/) { - $num_tot_pseudo_genes++; - } - } +## set db user/pass to allow write access +my $db_ref = $EnsWeb::species_defs->databases; +$db_ref->{'ENSEMBL_DB'}{'USER'} = $EnsWeb::species_defs->ENSEMBL_WRITE_USER; +$db_ref->{'ENSEMBL_DB'}{'PASS'} = $EnsWeb::species_defs->ENSEMBL_WRITE_PASS; - print "Slice", $slice->seq_region_name(), " has the following features:\n\n"; - print "known genes = $num_known_genes\n"; - print "novel coding sequences = $num_nov_CDS\n"; - print "novel transcripts = $num_nov_trans\n"; - print "putative transcripts = $num_put_trans\n"; - print "predicted transcripts = $num_pred_trans\n"; - print "total number of pseudogenes = $num_tot_pseudo_genes\n"; - print "\tunclassified pseudogenes = $num_unclass_pseudo_genes\n"; - print "\tprocessed pseudogenes = $num_proc_pseudo_genes\n"; - print "\tunprocessed pseudogenes = $num_unproc_pseudo_genes\n"; - print "Ig Pseudogenes = $num_pred_Ig_pseudogenes\n"; - print "Ig Segments = $num_Ig_segments\n\n\n"; +## connect to databases +my %run; +@run{split(/,/, $run)} = 1; +my @dbs = qw(core); +push @dbs, 'glovar' if ($run{'snp'}); +my $databases = &EnsEMBL::DB::Core::get_databases(@dbs); +die "Problem connecting to databases: $databases->{'error'}\n" + if $databases->{'error'} ; +warn "Database error: $databases->{'non_fatal_error'}\n" + if $databases->{'non_fatal_error'}; + +## get slice and attribute adaptors, loop over all toplevel slices +my $slice_adaptor = $databases->{'core'}->get_SliceAdaptor(); +my $attrib_adaptor = $databases->{'core'}->get_AttributeAdaptor(); +my $top_slices = $slice_adaptor->fetch_all("toplevel"); + +foreach my $slice (@$top_slices) { + my %num; my @attribs; + + print STDERR "Processing seq_region ", $slice->seq_region_name(), ":\n"; + + ## genes + if ($run{'gene'}) { + print STDERR "\tGenes...\n"; + my @genes = @{$slice->get_all_Genes()}; + foreach my $gene (@genes) { + my $type = $gene->type; + $num{$type}++; + if ($type =~ /seudogene$/) { + $num{'Total_Pseudogenes'}++; + } + } + + print STDERR "Slice", $slice->seq_region_name(), + " has the following features:\n\n"; + foreach my $type (keys %num) { + print STDERR "$type = $num{$type}\n"; + } + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Known genes', + -CODE => 'KnownGeneCount', + -VALUE => $num{'Known'}, + -DESCRIPTION => 'Total Number of Known genes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Novel CDS', + -CODE => 'NovelCDSCount', + -VALUE => $num{'Novel_CDS'}, + -DESCRIPTION => 'Total Number of Novel CDSs'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Novel transcripts', + -CODE => 'NovelTransCount', + -VALUE => $num{'Novel_Transcript'}, + -DESCRIPTION => 'Total Number of Novel transcripts'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Putative transcripts', + -CODE => 'PutTransCount', + -VALUE => $num{'Putative'}, + -DESCRIPTION => 'Total Number of Putative transcripts'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Predicted transcripts', + -CODE => 'PredTransCount', + -VALUE => $num{'Predicted_Gene'}, + -DESCRIPTION => 'Total Number of Predicted transcripts'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Total Pseudogenes', + -CODE => 'TotPsCount', + -VALUE => $num{'Total_Pseudogenes'}, + -DESCRIPTION => 'Total Number of Pseudogenes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Unclassified pseudogenes', + -CODE => 'UnclassPsCount', + -VALUE => $num{'Pseudogene'}, + -DESCRIPTION => 'Number of Unclassified pseudogenes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Processed pseudogenes', + -CODE => 'ProcPsCount', + -VALUE => $num{'Processed_pseudogene'}, + -DESCRIPTION => 'Number of Processed pseudogenes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Unprocessed pseudogenes', + -CODE => 'UnprocPsCount', + -VALUE => $num{'Unprocessed_pseudogene'}, + -DESCRIPTION => 'Number of Unprocessed pseudogenes'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Ig Segments', + -CODE => 'IgSegCount', + -VALUE => $num{'Ig_Segment'}, + -DESCRIPTION => 'Total Number of Ig Segments'); + + push @attribs, Bio::EnsEMBL::Attribute->new + (-NAME => 'Ig Pseudogene Segments', + -CODE => 'IgPsSegCount', + -VALUE => $num{'Ig_Pseudogene_Segment'}, + -DESCRIPTION => 'Total Number of Ig Pseudogene Segments'); + } - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Known Genes', - -CODE => 'KnownGeneCount', - -VALUE => $num_known_genes, - -DESCRIPTION => 'Total Number of Known Genes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Novel CDS', - -CODE => 'NovelCDSCount', - -VALUE => $num_nov_CDS, - -DESCRIPTION => 'Total Number of Novel CDSs'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Novel transcripts', - -CODE => 'NovelTransCount', - -VALUE => $num_nov_trans, - -DESCRIPTION => 'Total Number of Novel Transcripts'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Putative transcripts', - -CODE => 'PutTransCount', - -VALUE => $num_put_trans, - -DESCRIPTION => 'Total Number of Putative Transcripts'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Predicted transcripts', - -CODE => 'PredTransCount', - -VALUE => $num_pred_trans, - -DESCRIPTION => 'Total Number of Predicted Transcripts'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Total Pseudogenes', - -CODE => 'TotPsCount', - -VALUE => $num_tot_pseudo_genes, - -DESCRIPTION => 'Total Number of Pseudogenes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Unclassified Pseudogenes', - -CODE => 'UnclassPsCount', - -VALUE => $num_unclass_pseudo_genes, - -DESCRIPTION => 'Number of Unclassified Pseudogenes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Processed Pseudogenes', - -CODE => 'ProcPsCount', - -VALUE => $num_proc_pseudo_genes, - -DESCRIPTION => 'Number of Processed Pseudogenes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Unprocessed Pseudogenes', - -CODE => 'UnprocPsCount', - -VALUE => $num_unproc_pseudo_genes, - -DESCRIPTION => 'Number of Unprocessed Pseudogenes'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Ig Segments', - -CODE => 'IgSegCount', - -VALUE => $num_Ig_segments, - -DESCRIPTION => 'Total Number of Ig Segments'); - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'Ig Pseudogene Segments', - -CODE => 'IgPsSegCount', - -VALUE => $num_pred_Ig_pseudogenes, - -DESCRIPTION => 'Total Number of Ig Pseudogene Segments'); - - if( $snps_present ) { - my $snps = $slice->get_all_SNPs(); + ## snps + if ($run{'snp'}) { + print STDERR "\tSNPs...\n"; + my $snps = $slice->get_all_ExternalFeatures('GlovarSNP'); push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'SNP Count', + (-NAME => 'SNPs', -CODE => 'SNPCount', -VALUE => scalar( @$snps ), -DESCRIPTION => 'Total Number of SNPs'); @@ -209,43 +204,6 @@ sub print_chromo_stats { } } - -# -# tries to attach lite. -# - -sub lite_attach { - my $db = shift; - - my $core_db_name; - $core_db_name = $db->dbname(); - if( $core_db_name !~ /_core_/ ) { - return 0; - } - # - # get a lost of all databases on that server - # - my $sth = $db->prepare( "show databases" ); - $sth->execute(); - my $all_db_names = $sth->fetchall_arrayref(); - my %all_db_names = map {( $_->[0] , 1)} @$all_db_names; - my $snp_db_name = $core_db_name; - $snp_db_name =~ s/_core_/_lite_/; - if( ! exists $all_db_names{ $snp_db_name } ) { - return 0; - } - - my $snp_db = Bio::EnsEMBL::Lite::DBAdaptor->new - ( -host => $db->host(), - -user => $db->username(), - -pass => $db->password(), - -port => $db->port(), - -dbname => $snp_db_name ); - $db->add_db_adaptor( "lite", $snp_db ); - return 1; -} - - 1;