Commit 6c4b2b61 authored by Steve Trevanion's avatar Steve Trevanion
Browse files

merge from vega-40-dev

parent f0f369a1
......@@ -140,3 +140,7 @@
# label frameshifts modelled as short (1,2,4,5 bp) introns
59 Frameshift Frameshift Frameshift modelled as intron
#more gene counts for Vega
60 PTCount processed_transcript_UNKNOWN Number of Processed Transcripts
61 PredPTCount processed_transcript_PREDICTED Number of Predicted Processed Transcripts
......@@ -28,6 +28,7 @@ General options:
Specific options:
--chromosomes, --chr=LIST only process LIST chromosomes
--limit_file only process gene_ids in this file
=head1 DESCRIPTION
......@@ -79,6 +80,7 @@ use Bio::EnsEMBL::Utils::ConversionSupport;
use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature;
use POSIX;
use Data::Dumper;
$| = 1;
......@@ -86,8 +88,12 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
# parse options
$support->parse_common_options(@_);
$support->parse_extra_options('chromosomes|chr=s@');
$support->allowed_params($support->get_common_params, 'chromosomes');
$support->parse_extra_options(
'chromosomes|chr=s@',
'limit_file=s');
$support->allowed_params($support->get_common_params,
'chromosomes',
'limit_file');
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
......@@ -114,7 +120,17 @@ my $dbh = $dba->dbc->db_handle;
# split chromosomes by size and determine block size
my $chr_slices = $support->split_chromosomes_by_size(5000000);
# known biotype/status pairs and associated density type
#limit to gene_stable_ids ?
my (%gene_ids, %found_genes);
if ($support->param('limit_file')) {
my $in = $support->filehandle('<', $support->param('limit_file'));
while (<$in>) {
my ($gsi) = split ' ', $_;
$gene_ids{$gsi}++;
}
}
# biotype/status pairs and associated density type
my %gene_types = (
"protein_coding_KNOWN" => "knownPCodDensity",
"protein_coding_in_progress_KNOWN" => "knownPCodDensity",
......@@ -130,6 +146,8 @@ my %gene_types = (
"pseudogene_UNKNOWN" => "pseudoGeneDensity",
"processed_pseudogene_UNKNOWN" => "pseudoGeneDensity",
"unprocessed_pseudogene_UNKNOWN" => "pseudoGeneDensity",
# "processed_transcript_UNKNOWN" => "PTransDensity",
"processed_transcript_PREDICTED" => "PredPTransDensity",
);
# check for new biotype/status pairs
......@@ -159,6 +177,7 @@ map { $support->log(sprintf($FMT, $_, $type_status{$_}), 1) }
$support->log("\n");
if ($new) {
$support->log_warning("There are new biotype/status pairs! You might need to adapt Bio::EnsEMBL::ColourMap, EnsEMBL::Sanger_vega::Component::Chromosome and configure mapview to show them.\n\n");
exit;
}
......@@ -185,6 +204,7 @@ foreach my $type (keys %gene_types) {
}
}
# loop over block sizes
foreach my $block_size (keys %{ $chr_slices }) {
$support->log("Available chromosomes using block size of $block_size:\n ");
......@@ -220,7 +240,13 @@ foreach my $block_size (keys %{ $chr_slices }) {
$current_start = $current_end + 1;
next;
}
foreach my $gene (@{$genes}) {
GENE: foreach my $gene (@{$genes}) {
if ($support->param('limit_file')) {
my $gsi = $gene->stable_id;
next GENE unless ($gene_ids{$gsi});
$gene_ids{$gsi} |= 2;
}
# only count genes that don't overlap the subslice start
# (since these were already counted in the last bin)
my $gene_type = $gene->biotype . '_' . $gene->status;
......@@ -267,6 +293,20 @@ foreach my $block_size (keys %{ $chr_slices }) {
-DESCRIPTION => 'Number of Known Processed Transcripts',
);
push @attribs, Bio::EnsEMBL::Attribute->new(
-NAME => 'processed_transcript_PREDICTED',
-CODE => 'PredPTCount',
-VALUE => $total{'processed_transcript_PREDICTED'} || 0,
-DESCRIPTION => 'Number of Predicted Processed Transcripts',
);
push @attribs, Bio::EnsEMBL::Attribute->new(
-NAME => 'processed_transcript_UNKNOWN',
-CODE => 'PTCount',
-VALUE => $total{'processed_transcript_UNKNOWN'} || 0,
-DESCRIPTION => 'Number of Processed Transcripts',
);
push @attribs, Bio::EnsEMBL::Attribute->new(
-NAME => 'protein_coding_NOVEL',
-CODE => 'NovelPCCount',
......@@ -348,7 +388,7 @@ foreach my $block_size (keys %{ $chr_slices }) {
);
$attrib_adaptor->store_on_Slice($slice, \@attribs) unless ($support->param('dry_run'));
# log stats
$support->log("\n");
$support->log("Totals for chr $chr:\n", 1);
......@@ -357,8 +397,13 @@ foreach my $block_size (keys %{ $chr_slices }) {
map { $support->log(sprintf($FMT, $_, $total{$_}), 2) } sort keys %total;
$support->log_stamped("\nDone.\n\n", 1);
}
$support->log_stamped("Done.\n");
}
if (%gene_ids) {
my $no_genes_to_look_for = keys %gene_ids;
my (@genes_not_found) = grep {$gene_ids{$_} == 1} keys %gene_ids;
$support->log("Of the $no_genes_to_look_for genes in ".$support->param('limit_file')." , the following were missing: @genes_not_found\n");
}
$support->log_stamped("Done.\n");
# finish logfile
$support->finish_log;
......
......@@ -25,6 +25,9 @@ General options:
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
--prune undo, i.e. delete from the database changes caused by running the script
=head1 DESCRIPTION
This script calculates the repeat coverage for given database.
......@@ -77,7 +80,10 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
# parse options
$support->parse_common_options(@_);
$support->allowed_params($support->get_common_params);
$support->parse_extra_options('prune');
$support->allowed_params($support->get_common_params, 'prune');
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
......@@ -96,81 +102,103 @@ my $dfa = $dba->get_DensityFeatureAdaptor;
my $dta = $dba->get_DensityTypeAdaptor;
my $aa = $dba->get_AnalysisAdaptor;
# Create Analysis object
my $analysis = new Bio::EnsEMBL::Analysis (
-program => "repeat_coverage_calc.pl",
-database => "ensembl",
-gff_source => "repeat_coverage_calc.pl",
-gff_feature => "density",
-logic_name => "PercentageRepeat",
);
$aa->store($analysis) unless ($support->param('dry_run'));
# split chromosomes by size and determine block size
my $chr_slices = $support->split_chromosomes_by_size(5000000);
# loop over block sizes
foreach my $block_size (keys %{ $chr_slices }) {
$support->log("Available chromosomes using block size of $block_size:\n ");
$support->log(join("\n ", map { $_->seq_region_name } @{ $chr_slices->{$block_size} })."\n");
# create DensityType objects
my $density_type = Bio::EnsEMBL::DensityType->new(
-analysis => $analysis,
-block_size => $block_size,
-value_type => 'ratio',
my $dbh= $dba->dbc->db_handle;
# check for prune option (undo)
if($support->param('prune')){
my $query= "delete analysis, density_type, density_feature from analysis, density_type, density_feature where (analysis.program= 'repeat_coverage_calc.pl') and (analysis.analysis_id= density_type.analysis_id) and (density_type.density_type_id= density_feature.density_type_id)";
if($dbh->do($query)){
$support->log("prune was successfull: any previous entries in the database generated by this script have been deleted\n");
}
else{
$support->log_error("prune failed: any previous entries in the database generated by this script have NOT been deleted\n");
}
}
else{
# Create Analysis object
my $analysis = new Bio::EnsEMBL::Analysis (
-program => "repeat_coverage_calc.pl",
-database => "ensembl",
-gff_source => "repeat_coverage_calc.pl",
-gff_feature => "density",
-logic_name => "PercentageRepeat",
);
$dta->store($density_type) unless ($support->param('dry_run'));
# loop over chromosomes
$support->log_stamped("Looping over chromosomes...\n");
my ($current_start, $current_end);
foreach my $slice (@{ $chr_slices->{$block_size} }) {
$current_start = 1;
my $chr = $slice->seq_region_name;
my $i;
my $bins = POSIX::ceil($slice->end/$block_size);
$support->log_stamped("Chromosome $chr with block size $block_size...\n", 1);
# loop over blocks
while($current_start <= $slice->end) {
$i++;
$current_end = $current_start + $block_size - 1;
if ($current_end > $slice->end) {
$current_end = $slice->end;
}
my $this_block_size = $current_end - $current_start + 1;
my $sub_slice = $slice->sub_Slice($current_start, $current_end);
my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new or die;
foreach my $repeat (@{ $sub_slice->get_all_RepeatFeatures }) {
$rr->check_and_register("1", $repeat->start, $repeat->end);
}
my $count = 0;
my $non_repeats = $rr->check_and_register("1", 1, $this_block_size);
if (defined $non_repeats) {
foreach my $non_repeat (@{ $non_repeats }) {
$count += ($non_repeat->[1] - $non_repeat->[0]) + 1;
$aa->store($analysis) unless ($support->param('dry_run'));
# split chromosomes by size and determine block size
my $chr_slices = $support->split_chromosomes_by_size(5000000);
# loop over block sizes
foreach my $block_size (keys %{ $chr_slices }) {
$support->log("Available chromosomes using block size of $block_size:\n ");
$support->log(join("\n ", map { $_->seq_region_name } @{ $chr_slices->{$block_size} })."\n");
# create DensityType objects
my $density_type = Bio::EnsEMBL::DensityType->new(
-analysis => $analysis,
-block_size => $block_size,
-value_type => 'ratio',
);
$dta->store($density_type) unless ($support->param('dry_run'));
# loop over chromosomes
$support->log_stamped("Looping over chromosomes...\n");
my ($current_start, $current_end);
foreach my $slice (@{ $chr_slices->{$block_size} }) {
$current_start = 1;
my $chr = $slice->seq_region_name;
my $i;
my $bins = POSIX::ceil($slice->end/$block_size);
$support->log_stamped("Chromosome $chr with block size $block_size...\n", 1);
# loop over blocks
while($current_start <= $slice->end) {
$i++;
$current_end = $current_start + $block_size - 1;
if ($current_end > $slice->end) {
$current_end = $slice->end;
}
my $this_block_size = $current_end - $current_start + 1;
my $sub_slice = $slice->sub_Slice($current_start, $current_end);
my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new or die;
foreach my $repeat (@{ $sub_slice->get_all_RepeatFeatures }) {
$rr->check_and_register("1", $repeat->start, $repeat->end);
}
my $count = 0;
my $non_repeats = $rr->check_and_register("1", 1, $this_block_size);
if (defined $non_repeats) {
foreach my $non_repeat (@{ $non_repeats }) {
$count += ($non_repeat->[1] - $non_repeat->[0]) + 1;
}
}
my $percentage_repeat = (($this_block_size-$count)/$this_block_size)*100;
$support->log_verbose("Chr: $chr | Bin: $i/$bins | ", 2);
$support->log_verbose("\%repeat ".sprintf("%.2f", $percentage_repeat)."\n");
my $df = Bio::EnsEMBL::DensityFeature->new(
-seq_region => $slice,
-start => $current_start,
-end => $current_end,
-density_type => $density_type,
-density_value => $percentage_repeat,
);
$dfa->store($df) unless ($support->param('dry_run'));
$current_start = $current_end + 1;
}
my $percentage_repeat = (($this_block_size-$count)/$this_block_size)*100;
$support->log_verbose("Chr: $chr | Bin: $i/$bins | ", 2);
$support->log_verbose("\%repeat ".sprintf("%.2f", $percentage_repeat)."\n");
my $df = Bio::EnsEMBL::DensityFeature->new(
-seq_region => $slice,
-start => $current_start,
-end => $current_end,
-density_type => $density_type,
-density_value => $percentage_repeat,
);
$dfa->store($df) unless ($support->param('dry_run'));
$current_start = $current_end + 1;
$support->log_stamped("Done.\n", 1);
}
$support->log_stamped("Done.\n", 1);
$support->log_stamped("Done.\n");
}
$support->log_stamped("Done.\n");
}
# finish logfile
$support->finish_log;
......@@ -25,6 +25,9 @@ General options:
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
--prune undo, i.e. delete from the database changes caused by running the script
Specific options:
--repeatfile=FILE read repeat class definitions from FILE
......@@ -78,8 +81,8 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
# parse options
$support->parse_common_options(@_);
$support->parse_extra_options('repeatfile');
$support->allowed_params($support->get_common_params, 'repeatfile');
$support->parse_extra_options('repeatfile', 'prune');
$support->allowed_params($support->get_common_params, 'repeatfile', 'prune');
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
......@@ -92,12 +95,119 @@ $support->confirm_params;
# get log filehandle and print heading and parameters to logfile
$support->init_log;
$support->check_required_params('repeatfile');
$support->check_required_params('repeatfile') unless $support->param('prune'); # don't need the repeat file for pruning
# connect to database and get adaptors
my $dba = $support->get_database('ensembl');
my $dbh = $dba->dbc->db_handle;
# unless we are pruning (undo), we should make a backup copy of the repeat_consensus table
if($support->param('prune')){
# prune (undo mode)
# backup table must exist for this to work
if(check_for_backup_table()){
# backup table present
if($support->user_proceed("Replace the current table 'repeat_consensus' with the backup table 'repeat_consensus_backup'?")){
if($dbh->do("drop table repeat_consensus")){
if($dbh->do("create table repeat_consensus select * from repeat_consensus_backup")){
$support->log("prune (undo) was successful\n");
$support->log_stamped("Done.\n");
# finish logfile
$support->finish_log;
exit(0);
}
else{
$support->log_error("prune failed\n");
}
}
else{
$support->log_error("prune failed\n");
}
}
else{
#user is aborting
print "aborting...\n";
$support->log_error("aborting...\n");
}
}
else{
print "Cannot do prune, as no backup table\n";
$support->log_error("Cannot do prune, as no backup table\n");
}
}
else{
# normal run
# check to see if the backup table 'repeat_consensus_backup' already exists
if(check_for_backup_table()){
#table already exists: ask user if OK to overwrite it
if ($support->user_proceed("The backup table 'repeat_consensus_backup' already exists, OK to delete?")) {
if($dbh->do("drop table 'repeat_consensus_backup'")){
$support->log("deleted previous backup table\n");
make_backup_table();
}
else{
$support->log_error("tried but failed to delete previous backup table\n");
}
}
else{
# user won't allow removing the backup table
print "Aborting ...\n";
$support->log_error("User won't allow removal of backup table ... aborting program\n");
}
}else{
# table doesn't exist, therefore we can create it
make_backup_table();
}
}
# mouse fixes
if ($support->species eq 'Mus_musculus') {
$support->log("Making Vega mouse specific changes...\n");
......@@ -186,3 +296,40 @@ $support->log_stamped("Done.\n");
# finish logfile
$support->finish_log;
sub make_backup_table{
if($dbh->do("create table repeat_consensus_backup select * from repeat_consensus")){
$support->log("backup table 'repeat_consensus_backup was created successfully\n");
}
else{
$support->log_error("failed to create backup table 'repeat_consensus_backup'\n");
}
}
sub check_for_backup_table{
# check to see if the backup table 'repeat_consensus_backup' already exists
my @tables = $dbh->tables();
my $found=0;
foreach my $table(@tables){
#print "$table\n";
if($table eq '`repeat_consensus_backup`'){
$found=1;
last;
}
}
return $found;
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment