diff --git a/misc-scripts/density_feature/gene_density_calc.pl b/misc-scripts/density_feature/gene_density_calc.pl index 315a410ca7159f6bde2fd5138b35ab523c551be9..cb7e1efd2e5a47f3f03778f58d48fdd73c201dd8 100644 --- a/misc-scripts/density_feature/gene_density_calc.pl +++ b/misc-scripts/density_feature/gene_density_calc.pl @@ -31,18 +31,17 @@ $port = 3306 ; my ( $block_count, $genome_size, $block_size ); GetOptions( - "host=s", \$host, - "user=s", \$user, - "pass=s", \$pass, - "port=i", \$port, - "dbname=s", \$dbname, + "host|h=s", \$host, + "user|u=s", \$user, + "pass|p=s", \$pass, + "port=i", \$port, + "dbname|d=s", \$dbname, "pattern=s", \$pattern, + "help" , \&usage ); -unless ($host || $user || $pass || $dbname || $pattern) { - print "\n\nusage : perl gene_density.pl -host <HOST> -user <USER> -pass <PASS> -port <3306> -dbname <DATABASENAME>|-pattern <PATTERN> \n\n" ; - exit(0) ; -} +usage() if (!$host || !$user || !$pass || (!$dbname && !$pattern)); + my @dbnames; if (! $dbname) { my $dsn = sprintf( 'dbi:mysql:host=%s;port=%d', $host, $port ); @@ -57,7 +56,7 @@ else { foreach my $dbname (@dbnames) { if ( $pattern && ($dbname !~ /$pattern/) ) { next } - printf( "Connecting to '%s'\n", $dbname ); + print STDOUT "Connecting to $dbname\n"; my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $host, @@ -100,7 +99,7 @@ foreach my $dbname (@dbnames) { -port => $port, -pass => $pass, -dbname => $dna_db_name ); - print STDERR "Attaching $dna_db_name to $dbname.\n"; + print STDOUT "Attaching $dna_db_name to $dbname.\n"; $db->dnadb( $dna_db ); } else { print STDERR "No gene density for $dbname, no seq_regions.\n"; @@ -113,7 +112,7 @@ foreach my $dbname (@dbnames) { # - print "Deleting old knownGeneDensity and geneDensity features\n"; + print STDOUT "Deleting old knownGeneDensity and geneDensity features\n"; $sth = $db->dbc->prepare("DELETE df, dt, a, ad FROM density_feature df, density_type dt, analysis a, analysis_description ad WHERE ad.analysis_id = a.analysis_id AND a.analysis_id=dt.analysis_id AND dt.density_type_id=df.density_type_id AND a.logic_name IN ('knowngenedensity', 'genedensity')"); $sth->execute(); @@ -121,12 +120,6 @@ foreach my $dbname (@dbnames) { $sth->execute(); -# $sth = $db->dbc()->prepare( -# qq( -# DELETE ad -# FROM analysis_description ad -# WHERE ad.display_label IN ('knownGeneDensity', 'geneDensity')) ); -# $sth->execute(); my $dfa = $db->get_DensityFeatureAdaptor(); my $dta = $db->get_DensityTypeAdaptor(); @@ -173,7 +166,7 @@ foreach my $dbname (@dbnames) { # # Now the actual feature calculation loop # - + my $total_count = 0; foreach my $known (1, 0) { # @@ -199,13 +192,13 @@ foreach my $dbname (@dbnames) { my $slice_count = 0; my ( $current, $current_start, $current_end ); - foreach my $slice (@sorted_slices){ + while ( my $slice = shift @sorted_slices){ $block_size = $slice->length() / $bin_count; my @density_features;#sf7 - print "Gene densities for ".$slice->seq_region_name(). + print STDOUT "Gene densities for ".$slice->seq_region_name(). " with block size $block_size\n"; $current_end = 0; $current = 0; @@ -244,15 +237,18 @@ foreach my $dbname (@dbnames) { -end => $current_end, -density_type => $dt, -density_value => $count); + + $total_count++; } $dfa->store(@density_features); - print "Created ", scalar @density_features, " gene density features.\n"; last if ( $slice_count++ > $max_slices ); } + } - print "Finished with $dbname"; + print STDOUT "Created $total_count gene density features.\n"; + print STDOUT "Finished with $dbname"; } @@ -289,7 +285,71 @@ sub print_features { } } +sub usage { + my $indent = ' ' x length($0); + print <<EOF; exit(0); + + +What does it do? + +Populates known gene density features in a database as well as gene density +features from genes of any status. + +First it needs gene and seq_region tables to be populated. It then +deletes all knownGeneDensity and geneDensity entries from the +analysis, density_type and density_feature tables. All toplevel +slices are fetched and sorted from longest to shortest. Each slice +is divided into 150 bins (blocks). For each of the blocks or +sub-slices, the number of genes is counted to give geneDensity and +the number of genes with status KNOWN status is counted to give +knownGeneDensity. All biotypes except pseudogene are counted. + +Input data: genes, top level seq regions, xrefs +Output tables: analysis (logic_name: knownGeneDensity and geneDensity), + analysis description, density_type, density_feature + + +When to run it in the release cycle? + +It can be run after compara have handed over homologies and core have +done xref projections. + + +Which databases to run it on? + +It needs to be run across all core databases for every release. + +How long does it take? + +It takes about 10 mins to run for a database in normal queue, + + +Usage: + + $0 -h host [-port port] -u user -p password \\ + $indent -d database | -pattern pattern \\ + $indent [-help] \\ + + + -h|host Database host to connect to + + -port Database port to connect to (default 3306) + + -u|user Database username + + -p|pass Password for user + + -d|dbname Database name + + -pattern Database name regexp + + -help This message + + +EOF + +} diff --git a/misc-scripts/density_feature/percent_gc_calc.pl b/misc-scripts/density_feature/percent_gc_calc.pl index e3ebcb84841242052628ada3953ee4ba2f8cf58b..219a3af6cf072f1511fb07729f7f63fb5ea66bc6 100644 --- a/misc-scripts/density_feature/percent_gc_calc.pl +++ b/misc-scripts/density_feature/percent_gc_calc.pl @@ -20,14 +20,15 @@ use Getopt::Long; my ( $host, $user, $pass, $port, $dbname ); -GetOptions( "host=s", \$host, - "user=s", \$user, - "pass=s", \$pass, - "port=i", \$port, - "dbname=s", \$dbname +GetOptions( "host|h=s", \$host, + "user|u=s", \$user, + "pass|p=s", \$pass, + "port=i", \$port, + "dbname|d=s", \$dbname, + "help" , \&usage ); - +usage() if (!$host || !$user || !$pass || !$dbname); my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host, -user => $user, @@ -52,7 +53,7 @@ if( ! $dna_count ) { -print "Deleting old PercentGC features\n"; +print STDOUT "Deleting old PercentGC features\n"; $sth = $db->dbc->prepare( qq( DELETE df, dt, a, ad @@ -63,13 +64,6 @@ AND dt.density_type_id=df.density_type_id AND a.logic_name='percentgc') ); $sth->execute(); -# $sth = $db->dbc()->prepare( -# qq( -# DELETE ad -# FROM analysis_description ad -# WHERE ad.display_label = 'PercentGC') ); -# $sth->execute(); - # # Get the adaptors needed; # @@ -120,13 +114,13 @@ my ( $current_start, $current_end, $current ); my $slice_count = 0; my $block_size; -foreach my $slice (@sorted_slices){ +while ( my $slice = shift @sorted_slices ){ $block_size = $slice->length() / $bin_count; my @density_features=(); - print "GC percentage for ".$slice->seq_region_name(). + print STDOUT "GC percentage for ".$slice->seq_region_name(). " with block size $block_size\n"; $current_end = 0; @@ -166,11 +160,6 @@ foreach my $slice (@sorted_slices){ - - - - - # # helper to draw an ascii representation of the density features # @@ -206,7 +195,65 @@ sub print_features { } +sub usage { + my $indent = ' ' x length($0); + print <<EOF; exit(0); + + +What does it do? + +First, it needs the dna table to be populated. It then deletes +all PercentGC entries from the analysis, density_type and +density_feature tables. All toplevel slices are fetched and sorted +from longest to shortest. Each slice is divided into 150 bins +(blocks). For each of the blocks or sub-slices, the %gc is +calculated. + +Input data: dna sequence, top level seq regions +Output tables: analysis (logic_name: percentgc), analysis description, + density_type, density_feature + + +When to run it in the release cycle? + +It can be run after genebuilders have finished their Xrefs +(script not affected by projected Xrefs). + + +Which databases to run it on? +Run on core databases for new species or if one of the following changed: + - dna sequence + - assembly + + +How long does it take? + +It takes about 15 mins to run for a database in normal queue, + + +Usage: + + $0 -h host [-port port] -u user -p password \\ + $indent -d database \\ + $indent [-help] \\ + + -h|host Database host to connect to + + -port Database port to connect to (default 3306) + + -u|user Database username + + -p|pass Password for user + + -d|dbname Database name + + -help This message + + +EOF + +} diff --git a/misc-scripts/density_feature/repeat_coverage_calc.pl b/misc-scripts/density_feature/repeat_coverage_calc.pl index ecebe1e2adf7c3fb7d7b9937931ca954d92b8905..2879688a276a8f34a9381463f9626804efbdadf8 100644 --- a/misc-scripts/density_feature/repeat_coverage_calc.pl +++ b/misc-scripts/density_feature/repeat_coverage_calc.pl @@ -24,13 +24,15 @@ use Getopt::Long; my ( $host, $user, $pass, $port, $dbname ); -GetOptions( "host=s", \$host, - "user=s", \$user, - "pass=s", \$pass, - "port=i", \$port, - "dbname=s", \$dbname +GetOptions( "host|h=s", \$host, + "user|u=s", \$user, + "pass|p=s", \$pass, + "port=i", \$port, + "dbname|d=s", \$dbname, + "help" , \&usage ); +usage() if (!$host || !$user || !$pass || !$dbname); my $chunksize = 1_000_000; my $small_blocksize = 1_000; @@ -63,16 +65,10 @@ if( ! $repeat_count ) { # Clean up old features first. Also remove analysis and density type entry as these are recreated. # -print "Deleting old PercentageRepeat features\n"; +print STDOUT "Deleting old PercentageRepeat features\n"; $sth = $db->dbc->prepare("DELETE df, dt, a, ad FROM analysis_description ad, density_feature df, density_type dt, analysis a WHERE ad.analysis_id = a.analysis_id AND a.analysis_id=dt.analysis_id AND dt.density_type_id=df.density_type_id AND a.logic_name='rercentagerepeat'"); $sth->execute(); -# $sth = $db->dbc()->prepare( -# qq( -# DELETE ad -# FROM analysis_description ad -# WHERE ad.display_label = 'percentagerepeat') ); -# $sth->execute(); my $slice_adaptor = $db->get_SliceAdaptor(); my $dfa = $db->get_DensityFeatureAdaptor(); @@ -80,7 +76,6 @@ my $dta = $db->get_DensityTypeAdaptor(); my $aa = $db->get_AnalysisAdaptor(); - # # Create new analysis object for density calculation. # @@ -120,13 +115,12 @@ $dta->store($variable_density_type); my $slice_count = 0; - -foreach my $slice ( @sorted_slices ) { +while ( my $slice = shift @sorted_slices ) { # # do it for small and large blocks # - print STDERR ("Working on seq_region ".$slice->seq_region_name()." length ".$slice->seq_region_length()); + print STDOUT ("Working on seq_region ".$slice->seq_region_name()." length ".$slice->seq_region_length()); my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); my $chunk_end = 0; @@ -226,7 +220,7 @@ foreach my $slice ( @sorted_slices ) { -density_value => $percentage_repeat)); } - print STDERR " DONE.\n"; + print STDOUT " DONE.\n"; } @@ -242,9 +236,6 @@ sub register { - - - # # helper to draw an ascii representation of the density features # @@ -280,6 +271,78 @@ sub print_features { } +sub usage { + my $indent = ' ' x length($0); + print <<EOF; exit(0); + +What does it do? + +Calculates the percentage of repetetive elements for top level seq_regions. + +First it needs repeat_feature table to be populated. It then +deletes all PercentageRepeat entries from the analysis, density_type +and density_feature tables. All toplevel slices are fetched and sorted +from longest to shortest. For each toplevel slice, both the small and +the variable density repeats are calculated. + +Small repeats are done like this: Move along the toplevel slice 1 KB +at a time. Find the %repeat for each of these 1 KB blocks. + +Variable repeats are done like this: Divide each slice into 150 +sub_slices. Move along the toplevel slice 1 MB at a time. Foreach +sub_slice within the 1 MB, calculate the %repeat for that sub_slice. +Variable repeats are only found for the 100 longest toplevel slices. + +Input data: repeat features, top level seq regions +Output tables: analysis (logic_name: percentagerepeat), analysis description, + density_type (two entries, one for small_density type of + length 1 KB and one for variable_density_type of length 1MB), + density_feature + + +When to run it in the release cycle? + +It can be run after genebuilders have finished their Xrefs +(script not affected by projected Xrefs). + + +Which databases to run it on? + +Run on core databases for new species or if one of the following changed: + - dna sequence + - assembly + - repeats + + +How long does it take? + +It takes about 1 hour to run for a database in the long queue. The script is +slowed down considerably for very fragmented genomes with thousands of toplevel +seqs. + + +Usage: + $0 -h host [-port port] -u user -p password \\ + $indent -d database \\ + $indent [-help] \\ + + + -h|host Database host to connect to + + -port Database port to connect to (default 3306) + + -u|user Database username + + -p|pass Password for user + + -d|dbname Database name + + -help This message + + +EOF + +} diff --git a/misc-scripts/density_feature/seq_region_stats.pl b/misc-scripts/density_feature/seq_region_stats.pl index 379a5b757ea665bd756eeb98c420bca92b3bd47e..eeeb6c26f683f9a6031d6d0ea7be065cbc9b9c9c 100644 --- a/misc-scripts/density_feature/seq_region_stats.pl +++ b/misc-scripts/density_feature/seq_region_stats.pl @@ -8,18 +8,20 @@ use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor; use Getopt::Long; -my ( $host, $user, $pass, $port, $dbname, $pattern, $genestats, $snpstats ); +my ( $host, $user, $pass, $port, $dbname, $pattern, $stats ); -GetOptions( "host=s", \$host, - "user=s", \$user, - "pass=s", \$pass, +GetOptions( "host|h=s", \$host, + "user|u=s", \$user, + "pass|p=s", \$pass, "port=i", \$port, - "dbname=s", \$dbname, + "dbname|d=s", \$dbname, "pattern=s", \$pattern, - "genestats", \$genestats, - "snpstats", \$snpstats + "stats|s=s", \$stats, + "help" , \&usage ); +usage() if (!$host || !$user || !$pass || (!$dbname && !$pattern) || !$stats || $stats !~ /^(gene|snp)$/ ); + my %attrib_codes = ( 'miRNA' => 'miRNA', 'snRNA' => 'snRNA', @@ -70,7 +72,26 @@ my %attrib_codes = ( 'miRNA' => 'miRNA', 'IG_J_pseudogene' => 'pseudo', 'retrotransposed' => 'rettran', 'processed_transcript' => 'proc_tr', - 'lincRNA' => 'lincRNA',); + 'lincRNA' => 'lincRNA'); + + + +#get biotypes from the production database when new field attr_code is added to the biotype table + +# Master database location: +# my ( $mhost, $mport ) = ( 'ens-staging1', '3306' ); +# my ( $muser, $mpass ) = ( 'ensro', undef ); +# my $mdbname = 'ensembl_production'; + + +# my $prod_dsn = sprintf( 'DBI:mysql:host=%s;port=%d;database=%s', + # $mhost, $mport, $mdbname ); +# my $prod_dbh = DBI->connect( $prod_dsn, $muser, $mpass, + # { 'PrintError' => 1, 'RaiseError' => 1 } ); + +#my @attrib_codes = map { @_[0]->@_[1] } @{ $prod_dbh->selectall_arrayref('select distinct name, attr_code from biotype where is_current = 1 order by name') }; + +#my %attrib_codes = map { $_=>$_} @attrib_codes; my @dbnames; if (! $dbname) { @@ -83,6 +104,9 @@ else { @dbnames = ( $dbname ) } +my $genestats = 1 if($stats eq 'gene'); +my $snpstats = 1 if($stats eq 'snp'); + foreach my $name (@dbnames) { if ( $pattern && ($name !~ /$pattern/) ) { next } @@ -94,23 +118,18 @@ foreach my $name (@dbnames) { -pass => $pass, -dbname => $name); - - # do both genestats and snpstats by default - $genestats = $snpstats = 1 if(!$genestats && !$snpstats); - - # $snpstats = 0 if ($name =~ /homo|mus/); - - + my $total_count = 0; # delete old attributes before starting - foreach my $code (values %attrib_codes) { - if ($genestats) { - my $sth = $db->dbc()->prepare( "DELETE sa FROM seq_region_attrib sa, attrib_type at WHERE at.attrib_type_id=sa.attrib_type_id AND at.code=?" ); - $sth->execute("GeneNo_$code"); - } - if ($snpstats) { + if ($genestats) { + foreach my $code (values %attrib_codes) { + my $sth = $db->dbc()->prepare( "DELETE sa FROM seq_region_attrib sa, attrib_type at WHERE at.attrib_type_id=sa.attrib_type_id AND at.code=?" ); + $sth->execute("GeneNo_$code"); + } + } + + if ($snpstats) { my $sth = $db->dbc()->prepare( "DELETE sa FROM seq_region_attrib sa, attrib_type at WHERE at.attrib_type_id=sa.attrib_type_id AND at.code=?" ); - $sth->execute("SNPCount"); - } + $sth->execute("SNPCount"); } # @@ -141,9 +160,13 @@ foreach my $name (@dbnames) { exit(); } - my $snp_db = variation_attach( $db ); + my $snps_present; + my $snp_db; - my $snps_present = $snpstats && $snp_db; + if ($snpstats) { + $snp_db = variation_attach( $db ); + if (defined $snp_db) {$snps_present = 1;} + } my $slice_adaptor = $db->get_SliceAdaptor(); my $attrib_adaptor = $db->get_AttributeAdaptor(); @@ -154,10 +177,12 @@ foreach my $name (@dbnames) { while (my $slice = shift(@{$top_slices})) { # print STDERR "Processing seq_region ", $slice->seq_region_name(), "\n"; - + my @attribs; - + if($genes_present) { + + my %attrib_counts; my %counts; my $genes = $slice->get_all_Genes(); @@ -184,23 +209,26 @@ foreach my $name (@dbnames) { next; } - # not used: - # my $no_space = $biotype; - # $no_space =~ s/ /_/g; + $attrib_counts{$attrib_code} += $counts{$biotype}; + } + + foreach my $attrib_code (keys %attrib_counts) { push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => $biotype.' Gene Count', + (-NAME => $attrib_code.' Gene Count', -CODE => 'GeneNo_'.$attrib_code, - -VALUE => $counts{$biotype}, - -DESCRIPTION => 'Number of '.$biotype.' Genes'); + -VALUE => $attrib_counts{$attrib_code}, + -DESCRIPTION => 'Number of '.$attrib_code.' Genes'); + } + } if( $snps_present ) { my $sth = $snp_db->dbc->prepare("SELECT COUNT(*) FROM variation_feature WHERE seq_region_id = ?"); $sth->execute($slice->get_seq_region_id); my $count; - $sth->bind_columns($count); + $sth->bind_columns(undef,\$count); $sth->fetch; push @attribs, Bio::EnsEMBL::Attribute->new @@ -213,8 +241,13 @@ foreach my $name (@dbnames) { } $attrib_adaptor->store_on_Slice($slice, \@attribs); + my $slice_attrib_count = @attribs; + $total_count += $slice_attrib_count; # print_chromo_stats([$slice]); } + + print STDOUT "Written $total_count seq reqion attributes to database $name on server $host.\n"; + } @@ -272,3 +305,96 @@ if( ! exists $all_db_names{ $snp_db_name } ) { return $snp_db; } + + +sub usage { + my $indent = ' ' x length($0); + print <<EOF; exit(0); + + +For each toplevel slice, count the number of genes for each biotype +(gene stats) or count the number of SNPs (snp stats). + +gene stats +What does it do? + +Deletes all seq_region_attrib that have attrib_type code +with a prefix 'GeneNo_'. All toplevel slices are fetched. + +Input data: dna seqence, genes, xrefs, xref projections +Output tables: seq_region_attrib (attrib_type code with prefix 'GeneNo') + + +When to run it in the release cycle? + +After core have finished xref projections + + +Which databases to run it on? + +Run on all core databases (including otherfeatures, cdna etc) for each release. + + +How long does it take? + +It takes about 10 mins to run for a database in normal queue, + + +snp stats + +What does it do? + +Deletes out all seq_region_attrib that have attrib_type code of 'SNPCount'. +Attach variation db if exists. All toplevel slices are fetched. +For each slice, count the number of SNPs. + +This option requires ensembl-variation in perl5lib. + +Input data: top level seq regions, variation db +Output tables: seq_region_attrib (attrib_type code with prefix 'SNPCount') + + +When to run it in the release cycle? + +When variation dbs have been handed over + + +Which databases to run it on? + +Run on core databases only for new species or if the assembly changed, +or if the variation positions have changed in the corresponding variation db. + + +How long does it take? + +It takes about 20 mins to run for a database in normal queue. + + + +Usage: + + $0 -h host [-port port] -u user -p password \\ + $indent -d database | -pattern pattern \\ + $indent -s gene | snp \\ + $indent [-help] \\ + + -h|host Database host to connect to + + -port Database port to connect to (default 3306) + + -u|user Database username + + -p|pass Password for user + + -d|dbname Database name + + -pattern Database name regexp + + -s|stats 'gene' or 'snp' + + -help This message + + +EOF + +} diff --git a/misc-scripts/density_feature/variation_density.pl b/misc-scripts/density_feature/variation_density.pl index 7794fa1aec7931ae8c7938708886f571c2fefb7a..21fc7dffdf773740756adc1813dc1d444157e0bf 100644 --- a/misc-scripts/density_feature/variation_density.pl +++ b/misc-scripts/density_feature/variation_density.pl @@ -5,8 +5,11 @@ use strict; use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor; +use Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor; use Bio::EnsEMBL::DensityType; use Bio::EnsEMBL::DensityFeature; +use Bio::EnsEMBL::Registry; use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor; use Getopt::Long; @@ -20,20 +23,23 @@ my ($host, $user, $pass, $port, $species); my ($block_count, $genome_size, $block_size ); -GetOptions( "host=s", \$host, - "user=s", \$user, - "pass=s", \$pass, - "port=i", \$port, - "species=s", \$species ); +GetOptions( "host|h=s", \$host, + "user|u=s", \$user, + "pass|p=s", \$pass, + "port=i", \$port, + "species|s=s", \$species ); -Bio::EnsEMBL::Registry->load_registry_from_db(-host => $host, -user => $user, -pass => $pass, -port => $port); -my $density_feature_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "DensityFeature") || die "Can't create density feature adaptor"; -my $density_type_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "DensityType") || die "Can't create density type adaptor"; -my $analysis_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "analysis") || die "Can't create analysis adaptor"; -my $slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "slice") || die "Can't create slice adaptor"; +usage() if (!$host || !$user || !$pass || !$species ); -my $variation_feature_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "variation", "VariationFeature") || die "Can't create variation feature adaptor"; +my $reg = Bio::EnsEMBL::Registry->load_registry_from_db(-host => $host, -user => $user, -pass => $pass, -port => $port, -species => $species); + +my $density_feature_adaptor = $reg->get_adaptor($species, "core", "DensityFeature") || die "Can't create density feature adaptor"; +my $density_type_adaptor = $reg->get_adaptor($species, "core", "DensityType") || die "Can't create density type adaptor"; +my $analysis_adaptor = $reg->get_adaptor($species, "core", "analysis") || die "Can't create analysis adaptor"; +my $slice_adaptor = $reg->get_adaptor($species, "core", "slice") || die "Can't create slice adaptor"; + +my $variation_feature_adaptor = $reg->get_adaptor($species, "variation", "VariationFeature") || die "Can't create variation feature adaptor"; # TODO - variation from registry @@ -79,11 +85,13 @@ my ($current, $current_start, $current_end); # prepare statement outside of loop $sth = $variation_feature_adaptor->prepare("SELECT COUNT(*) FROM variation_feature WHERE seq_region_id = ? AND seq_region_start < ? AND seq_region_end > ?"); -foreach my $slice (@sorted_slices){ +my $total_count = 0; + +while ( my $slice = shift @sorted_slices){ $block_size = $slice->length() / $bin_count; - print "Calculating SNP densities for ". $slice->seq_region_name() . " with block size $block_size\n"; + print STDOUT "Calculating SNP densities for ". $slice->seq_region_name() . " with block size $block_size\n"; $current_end = 0; $current = 0; @@ -113,9 +121,76 @@ foreach my $slice (@sorted_slices){ -density_value => $count); $density_feature_adaptor->store($df); + $total_count ++; } last if ( $slice_count++ > $max_slices ); } +print STDOUT "Written $total_count density features for species $species on server $host\n"; + + +sub usage { + my $indent = ' ' x length($0); + print <<EOF; exit(0); + +What does it do? + +Calculates the density of SNP features on top level sequences. +Deletes out all density_feature and density_type entries having +analysis logic_name 'snpDensity'. Deletes analysis_description +where display_label = 'snpDensity'. All toplevel slices are fetched +and sorted from longest to shortest. Each slice is divided into 150 +bins. For each sub_slice, we count and store the number of +variation_features (SNPs) on that sub_slice. + + +Deletes out all seq_region_attrib that have attrib_type code of 'SNPCount'. +Attach variation db if exists. All toplevel slices are fetched. +For each slice, count the number of SNPs. + +Input data: top level seq regions, variation db +Output tables: analysis, analysis_description, density_feature, density_type + +The script requires ensembl-variation in perl5lib. + +When to run it in the release cycle? + +When variation dbs have been handed over + + +Which databases to run it on? + +The script updates a core database using data from the corresponding variation database. +Run it for new species or where the core assembly has changed, or if there are any changes to variation positions in the variation database. + + +How long does it take? + +It takes about 25 mins to run for a database in normal queue. + + + +Usage: + + $0 -h host [-port port] -u user -p password \\ + $indent -s species \\ + $indent [-help] \\ + + -h|host Database host to connect to + + -port Database port to connect to (default 3306) + + -u|user Database username + + -p|pass Password for user + + -s|species Species name + + -help This message + + +EOF + +} diff --git a/misc-scripts/gene_gc.pl b/misc-scripts/gene_gc.pl index 737138b3aa9a7a9274b00f2b69dbac93be52ac6b..6f11ee7695ecd651bacc0422c3a9829377ba689c 100644 --- a/misc-scripts/gene_gc.pl +++ b/misc-scripts/gene_gc.pl @@ -11,17 +11,17 @@ my ( $host, $user, $pass, $port, $dbpattern, $print); $port = 3306; -GetOptions( "dbhost|host=s", \$host, - "dbuser|user=s", \$user, - "dbpass|pass=s", \$pass, - "dbport|port=i", \$port, - "dbpattern|pattern=s", \$dbpattern, - "print", \$print, - "help" , \&usage +GetOptions( "host|h=s", \$host, + "user|u=s", \$user, + "pass|p=s", \$pass, + "port=i", \$port, + "pattern=s", \$dbpattern, + "print", \$print, + "help" , \&usage ); -usage() if (!$host || !$dbpattern); +usage() if (!$host || !$dbpattern || !$user || !$pass); # loop over databases my $dsn = "DBI:mysql:host=$host"; @@ -42,28 +42,33 @@ for my $dbname (@dbnames) { '-dbname' => $dbname, '-species' => $dbname); - print STDERR "$dbname\n"; + print STDOUT "$dbname\n"; delete_existing($dba) if !($print); - print STDERR "Calculating Gene GC attributes\n"; + print STDOUT "Calculating Gene GC attributes\n"; my $attribute_adaptor = $dba->get_AttributeAdaptor(); my $genes = $dba->get_GeneAdaptor()->fetch_all(); + my $total_count = 0; + while (my $gene = shift(@$genes)) { my $gc = $gene->feature_Slice()->get_base_count->{'%gc'}; if (!$print) { - + # attribute types need to be propagated from production database to all dbs + # if the type exists it won't be overwritten my $attribute = Bio::EnsEMBL::Attribute->new(-CODE => 'GeneGC', -NAME => 'Gene GC', -DESCRIPTION => 'Percentage GC content for this gene', -VALUE => $gc); my @attributes = ($attribute); $attribute_adaptor->store_on_Gene($gene->dbID, \@attributes); + + $total_count++; } else { @@ -72,6 +77,10 @@ for my $dbname (@dbnames) { } } + if (!$print) { + print STDOUT "Written $total_count 'GeneGC' gene attributes to database $dbname on server $host.\n"; + } + } # ---------------------------------------------------------------------- @@ -80,7 +89,7 @@ sub delete_existing { my $dba = shift; - print STDERR "Deleting existing 'GeneGC' gene attributes\n"; + print STDOUT "Deleting existing 'GeneGC' gene attributes\n"; my $dsth = $dba->dbc()->prepare("DELETE ga FROM gene_attrib ga, attrib_type at WHERE at.attrib_type_id=ga.attrib_type_id AND at.code='GeneGC'"); $dsth->execute(); @@ -88,25 +97,54 @@ sub delete_existing { sub usage { + my $indent = ' ' x length($0); print <<EOF; exit(0); -Calculate per-gene GC content and store as gene attributes. +What does it do? + +This script calculates per-gene GC content and stores it as gene attributes. +It deletes existing Gene GC attributes. Then fetches all genes in the +core db and calculates the %gc for each gene. + +Input data: dna sequence +Output tables: gene_attrib + + +When to run it in the release cycle? + +It can be run whenever the genes and sequence are stable, i.e. any time after +the genebuild handover, but before the handover to Mart. + + +Which databases to run it on? + +It needs to be run across all core databases for every release. + + +How long does it take? + +It takes a total of about 10 hours to run for all core databases in normal queue, + + +Usage: -Usage: perl $0 <options> + $0 -h host [-port port] -u user -p password \\ + $indent -pattern pattern [-print] \\ + $indent [-help] \\ - -host|dbhost Database host to connect to + -h|host Database host to connect to - -port|dbport Database port to connect to (default 3306) + -port Database port to connect to (default 3306) - -dbpattern Database name regexp + -u|user Database username - -user|dbuser Database username + -p|pass Password for user - -pass|dbpass Password for user + -pattern Database name regexp - -print Just print, don't insert or delete attributes + -print Just print, don't insert or delete attributes - -help This message + -help This message EOF