diff --git a/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl b/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl index 94c229468bf4a2d79c4d3ac54b899b175d18b730..24dcfa83dd72f8d9ee75b1a8b231d9bf7e3e566d 100644 --- a/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl +++ b/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl @@ -1,3 +1,4 @@ + =pod =head1 SYNOPSIS @@ -66,352 +67,433 @@ =cut -#!/usr/local/ensembl/bin/perl +#!/usr/bin/env perl use strict; use warnings; use Carp; use Data::Dumper; use DBI qw(:sql_types); -use Getopt::Long; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::Utils::Exception qw(throw); - -my ($host, $port, $dbname, $user,$pass); -my ($dnahost, $dnadbname, $dnauser); -my ($ccds_host, $ccds_dbname, $ccds_user); - -my $coord_system; -my $seq_region_name; -my $logic_name; # keep as undefined unless you only want to run on a specific analysis -my $write = 0; -my $include_non_ref = 1; -my $include_duplicates; -my $verbose = 0; - -GetOptions( 'dbhost:s' => \$host, - 'dbport:n' => \$port, - 'dbname:s' => \$dbname, - 'dbuser:s' => \$user, - 'dbpass:s' => \$pass, - 'dnahost:s' => \$dnahost, - 'dnadbname:s' => \$dnadbname, - 'dnauser:s' => \$dnauser, - 'ccds_host:s' => \$ccds_host, - 'ccds_dbname:s' => \$ccds_dbname, - 'ccds_user:s' => \$ccds_user, - 'coord_system_name:s' => \$coord_system, - 'seq_region_name:s' => \$seq_region_name, - 'logic_name:s' => \$logic_name, - 'write!' => \$write, - 'include_non_ref!' => \$include_non_ref, - 'include_duplicates' => \$include_duplicates, - 'verbose!' => \$verbose, ); - -unless ($write) { - print "You have not used the -write option " - . "so results will not be written into the database\n"; +use Bio::EnsEMBL::Utils::CliHelper; + +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); + +# get the basic options for connecting to a database server +my $optsd = [ @{ $cli_helper->get_dba_opts() }, + @{ $cli_helper->get_dba_opts('dna') }, + @{ $cli_helper->get_dba_opts('ccds') } ]; +# add the print option +push( @{$optsd}, "coord_system_name:s" ); +push( @{$optsd}, "logic_name:s" ); +push( @{$optsd}, "write!" ); +push( @{$optsd}, "include_non_ref!" ); +push( @{$optsd}, "include_duplicates" ); +push( @{$optsd}, "verbose!" ); +# process the command line with the supplied options plus a help subroutine +my $opts = $cli_helper->process_args( $optsd, \&usage ); + +$opts->{write} ||= 0; +$opts->{include_non_ref} ||= 1; +$opts->{verbose} ||= 0; + +unless ( $opts->{write} ) { + print "You have not used the -write option " + . "so results will not be written into the database\n"; } -my $db = - new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $host, - -user => $user, - -port => $port, - -dbname => $dbname, - -pass => $pass ); -my $ccds_db; - -if ($ccds_dbname) { - if (!$ccds_user || !$ccds_host) { - throw ("You must provide user, host and dbname details to connect to CCDS DB!"); - } - $ccds_db = - new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $ccds_host, - -user => $ccds_user, - -port => $port, - -dbname => $ccds_dbname ); +my $db_args = $cli_helper->get_dba_args_for_opts($opts); +my $dnadb_args = $cli_helper->get_dba_args_for_opts( $opts, 'dna' ); +if ( defined $dnadb_args && scalar(@$dnadb_args) != scalar(@$db_args) ) { + croak "Different number of DBAs found for DB and DNADB"; } - -my $geneDB_has_DNA = check_if_DB_contains_DNA($db); - -my $dnadb; - -if ($geneDB_has_DNA == 0 && !$dnadbname) { - throw ("Your gene DB contains no DNA. You must provide a DNA_DB to connect to"); -} elsif ($geneDB_has_DNA == 0 && $dnadbname) { - $dnadb = new Bio::EnsEMBL::DBSQL::DBAdaptor( - '-host' => $dnahost, - '-user' => $dnauser, - '-dbname' => $dnadbname, - '-port' => $port, - ); - $db->dnadb($dnadb); +my $ccdsdb_args = $cli_helper->get_dba_args_for_opts( $opts, 'ccds' ); +if ( defined $ccdsdb_args && scalar(@$ccdsdb_args) != scalar(@$db_args) ) { + croak "Different number of DBAs found for DB and CCDSDB"; } -my $sa = $db->get_SliceAdaptor; -my $dea = $db->get_DBEntryAdaptor; -my $slices; - -# Only update the canonical transcripts in this region. -if ($seq_region_name) { - print "Only updating genes on chromosome $seq_region_name\n"; - my $slice = - $sa->fetch_by_region( $coord_system, $seq_region_name, $include_non_ref ,$include_duplicates ); - push( @$slices, $slice ); -} else { - $slices = $sa->fetch_all( $coord_system, '', $include_non_ref , $include_duplicates ); -} - -my $gene_update_sql = "update gene set canonical_transcript_id = ? where gene_id = ?"; -my $gene_sth = $db->dbc->prepare($gene_update_sql); - -SLICE: foreach my $slice (@$slices) { - # check whether the slice is reference or not - # this check is important for species that have a ccds database - # and that have non-reference sequence ie. human - my $slice_is_reference; - if (!$slice->is_reference) { - print "Doing non-reference slice ".$slice->name."\n"; - $slice_is_reference = 0; - } else { - print "Doing reference slice ".$slice->name."\n"; - $slice_is_reference = 1; - } - - # Now fetch the genes - print "\nGetting genes for " . $slice->name . "\n" if ($verbose); - my $genes = $slice->get_all_Genes( $logic_name, undef, 1 ); - my %canonical; - - GENE: foreach my $gene (@$genes) { - print "\nLooking at gene: (dbID " . $gene->dbID . ")"; - if ($gene->stable_id) { - print " :: " . $gene->stable_id ; - } - print "\n"; - my $transcripts = $gene->get_all_Transcripts; - if ( @$transcripts == 1 ) { - if ($ccds_db && $slice_is_reference) { - check_Ens_trans_against_CCDS($transcripts->[0], $ccds_db, $verbose); - } - $canonical{ $gene->dbID } = $transcripts->[0]->dbID; - print "Transcript ". $transcripts->[0]->display_id . " of biotype " . $transcripts->[0]->biotype . - " is chosen as the canonical transcript for gene ". $gene->display_id . - " of biotype ". $gene->biotype. "\n"; - next GENE; - } - - # Keep track of the transcript biotypes - my %trans; - foreach my $transcript ( @{$transcripts} ) { - push @{ $trans{ $transcript->biotype } }, $transcript; - } - - # If there is a single protein_coding transcript, make it the - # canonical transcript, if it has a functional translation. - foreach my $key ( keys(%trans) ) { - if ( ( $key eq 'protein_coding' ) && ( @{ $trans{$key} } == 1 ) ) { - my $trans_id = $trans{$key}->[0]->dbID; - if ( $trans{$key}->[0]->translation->seq !~ /\*/ ) { - if ($ccds_db && $slice_is_reference) { - check_Ens_trans_against_CCDS($trans{$key}->[0], $ccds_db, $verbose); - } - $canonical{ $gene->dbID } = $trans_id; - print "Transcript ". $trans{$key}->[0]->display_id . " of biotype " . $trans{$key}->[0]->biotype . - " is chosen as the canonical transcript for gene ". $gene->display_id . - " of biotype ". $gene->biotype. "\n"; - next GENE; - } else { - carp( "Transcript $trans_id has internal stop(s) " - . "and shouldn't if it is protein coding\n" ); - } - } elsif ( !exists( $trans{'protein_coding'} ) - && $key eq 'nonsense_mediated_decay' - && ( @{ $trans{$key} } == 1 ) ) { - my $trans_id = $trans{$key}->[0]->dbID; - if ( ( $trans{$key}->[0]->translation ) - && ( $trans{$key}->[0]->translation->seq !~ /\*/ ) ) { - print "The NMD transcript $trans_id will become " - . "the canonical transcript because there are no " - . "protein coding transcripts.\n"; - if ($ccds_db && $slice_is_reference) { - check_Ens_trans_against_CCDS($trans{$key}->[0], $ccds_db, $verbose); - } - $canonical{ $gene->dbID } = $trans_id; - print "Transcript ". $trans{$key}->[0]->display_id . " of biotype nonsense_mediated_decay" . - " is chosen as the canonical transcript for gene ". $gene->display_id . - " of biotype ". $gene->biotype. "\n"; - next GENE; - } else { - carp( "Transcript $trans_id is an NMD with either " - . " no translation or internal stop(s), skipping it...\n" ); - } - } - } # foreach biotype - - my $has_translation = 0; - my $count = 0; - my @with_translation; - my @no_translation; - - foreach my $transcript ( @{$transcripts} ) { - - if ( ( $gene->biotype ne 'pseudogene' ) - && ( $gene->biotype ne 'processed_transcript' ) - && ( $transcript->biotype ne 'polymorphic_pseudogene' ) - && ( $transcript->biotype ne 'processed_transcript' ) - && $transcript->translation - && ( $transcript->translation->seq !~ /\*/ ) ) - - { - push( @with_translation, $transcript ); - } else { - push( @no_translation, $transcript ); - } - } - - my @sorted; - - if (@with_translation) { - - my ( @ccds_prot_coding_len_and_trans, @merge_prot_coding_len_and_trans, @prot_coding_len_and_trans); - my ( @merge_translateable_len_and_trans, @translateable_len_and_trans ); - my ( @merge_len_and_trans, @len_and_trans ); - - foreach my $trans (@with_translation) { - - print "Looking at Ens trans dbID " . $trans->dbID . " :: biotype " . $trans->biotype . " :: Transl length " . - $trans->translate->length . "\n" if ($verbose); - - my $h = { trans => $trans, len => $trans->translate->length }; - my $ensembl_trans_found_in_CCDS = 0; - - if ( $trans->biotype eq 'protein_coding') { - if ($ccds_db && $slice_is_reference) { - $ensembl_trans_found_in_CCDS = check_Ens_trans_against_CCDS($trans, $ccds_db, $verbose); - } - if ($ensembl_trans_found_in_CCDS) { - push @ccds_prot_coding_len_and_trans, $h; - } elsif ($trans->analysis->logic_name eq 'ensembl_havana_transcript') { - push @merge_prot_coding_len_and_trans, $h; - } else { - push @prot_coding_len_and_trans, $h; - } - next; - } else { # transcript biotype is not protein_coding but the transcript has translation - if ($trans->analysis->logic_name eq 'ensembl_havana_transcript') { - push @merge_translateable_len_and_trans, $h; - } else { - push @translateable_len_and_trans, $h; - } - } - } - - my @tmp_sorted; - if (@ccds_prot_coding_len_and_trans) { - @tmp_sorted = sort { $b->{len} <=> $a->{len} } @ccds_prot_coding_len_and_trans; # sort all the Ens transcripts which matched CCDS by coding length - } elsif (@merge_prot_coding_len_and_trans) { - @tmp_sorted = sort { $b->{len} <=> $a->{len} } @merge_prot_coding_len_and_trans; - } elsif (@prot_coding_len_and_trans) { - @tmp_sorted = sort { $b->{len} <=> $a->{len} } @prot_coding_len_and_trans; - } elsif (@merge_len_and_trans) { - print "There are no 'protein_coding' labelled transcripts, ". - "will take the longest of merged tranlslateable transcripts\n" if ($verbose); - @tmp_sorted = sort { $b->{len} <=> $a->{len} } @merge_translateable_len_and_trans; - } else { - print "There are neither 'protein_coding' labelled transcripts nor merged ". - "translateable transcripts. Will take the longest of any unmerged ". - "translateable transcript.\n" if ($verbose); - @tmp_sorted = sort { $b->{len} <=> $a->{len} } @translateable_len_and_trans; - } - - foreach my $h (@tmp_sorted) { - print "Adding to sorted " . $h->{trans}->dbID . " :: " . $h->{trans}->biotype . "\n" if ($verbose); - push @sorted, $h->{trans}; - } - } else { - # we merge genes without a translation too - my ( @merge_len_and_trans, @len_and_trans ); - foreach my $trans (@no_translation) { - if ($trans->analysis->logic_name eq 'ensembl_havana_transcript') { - push @merge_len_and_trans, $trans; - } else { - push @len_and_trans, $trans; - } - } - if (@merge_len_and_trans) { - @sorted = sort { $b->length <=> $a->length } @merge_len_and_trans; - } else { - @sorted = sort { $b->length <=> $a->length } @len_and_trans; - } - } # end of loop for no translation - - # # # - # set canonical transcirpt - # # # - print "Transcript ". $sorted[0]->display_id . " of biotype " . $sorted[0]->biotype . - " is chosen as the canonical transcript for gene ". $gene->display_id . - " of biotype ". $gene->biotype. "\n"; - $canonical{ $gene->dbID } = $sorted[0]->dbID; - - if (!exists $canonical{ $gene->dbID }) { - throw("No canonical transcript has been set for gene ".$gene->dbID ); - } - } ## end foreach my $gene (@$genes) - - foreach my $gene_id ( keys(%canonical) ) { - my $transcript_id = $canonical{$gene_id}; - print "Updating gene $gene_id " - . "with canonical transcript: $transcript_id.\n"; - $gene_sth->execute( $transcript_id, $gene_id ) if ($write); - } -} ## end foreach my $slice (@$slices) - +for ( my $i = 0; $i < scalar(@$db_args); $i++ ) { + + my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{ $db_args->[$i] } ); + + my $geneDB_has_DNA = check_if_DB_contains_DNA($db); + + my $dnadb; + + if ( $geneDB_has_DNA == 0 && !$dnadb_args ) { + throw( +"Your gene DB contains no DNA. You must provide a DNA_DB to connect to" ); + } elsif ( $geneDB_has_DNA == 0 && $dnadb_args ) { + $dnadb = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{ $dnadb_args->[$i] } ); + $db->dnadb($dnadb); + } + + my $ccds_db; + if ( defined $ccdsdb_args ) { + $ccds_db = + Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{ $ccdsdb_args->[$i] } ) + ; + } + + my $sa = $db->get_SliceAdaptor; + my $dea = $db->get_DBEntryAdaptor; + my $slices; + + # Only update the canonical transcripts in this region. + if ( $opts->{seq_region_name} ) { + print "Only updating genes on chromosome " + . $opts->{seq_region_name} . "\n"; + my $slice = + $sa->fetch_by_region( $opts->{coord_system_name}, + $opts->{seq_region_name}, + $opts->{include_non_ref}, + $opts->{include_duplicates} ); + push( @$slices, $slice ); + } else { + $slices = $sa->fetch_all( $opts->{coord_system_name}, '', + $opts->{include_non_ref}, + $opts->{include_duplicates} ); + } + + my $gene_update_sql = + "update gene set canonical_transcript_id = ? where gene_id = ?"; + my $gene_sth = $db->dbc->prepare($gene_update_sql); + + SLICE: foreach my $slice (@$slices) { + # check whether the slice is reference or not + # this check is important for species that have a ccds database + # and that have non-reference sequence ie. human + my $slice_is_reference; + if ( !$slice->is_reference ) { + print "Doing non-reference slice " . $slice->name . "\n"; + $slice_is_reference = 0; + } else { + print "Doing reference slice " . $slice->name . "\n"; + $slice_is_reference = 1; + } + + # Now fetch the genes + print "\nGetting genes for " . $slice->name . "\n" + if ( $opts->{verbose} ); + my $genes = $slice->get_all_Genes( $opts->{logic_name}, undef, 1 ); + my %canonical; + + GENE: foreach my $gene (@$genes) { + print "\nLooking at gene: (dbID " . $gene->dbID . ")"; + if ( $gene->stable_id ) { + print " :: " . $gene->stable_id; + } + print "\n"; + my $transcripts = $gene->get_all_Transcripts; + if ( @$transcripts == 1 ) { + if ( $ccds_db && $slice_is_reference ) { + check_Ens_trans_against_CCDS( $transcripts->[0], $ccds_db, + $opts->{verbose} ); + } + $canonical{ $gene->dbID } = $transcripts->[0]->dbID; + print "Transcript " + . $transcripts->[0]->display_id + . " of biotype " + . $transcripts->[0]->biotype + . " is chosen as the canonical transcript for gene " + . $gene->display_id + . " of biotype " + . $gene->biotype . "\n"; + next GENE; + } + + # Keep track of the transcript biotypes + my %trans; + foreach my $transcript ( @{$transcripts} ) { + push @{ $trans{ $transcript->biotype } }, $transcript; + } + + # If there is a single protein_coding transcript, make it the + # canonical transcript, if it has a functional translation. + foreach my $key ( keys(%trans) ) { + if ( ( $key eq 'protein_coding' ) + && ( @{ $trans{$key} } == 1 ) ) + { + my $trans_id = $trans{$key}->[0]->dbID; + if ( $trans{$key}->[0]->translation->seq !~ /\*/ ) { + if ( $ccds_db && $slice_is_reference ) { + check_Ens_trans_against_CCDS( $trans{$key}->[0], + $ccds_db, $opts->{verbose} ); + } + $canonical{ $gene->dbID } = $trans_id; + print "Transcript " + . $trans{$key}->[0]->display_id + . " of biotype " + . $trans{$key}->[0]->biotype + . " is chosen as the canonical transcript for gene " + . $gene->display_id + . " of biotype " + . $gene->biotype . "\n"; + next GENE; + } else { + carp( "Transcript $trans_id has internal stop(s) " + . "and shouldn't if it is protein coding\n" ); + } + } elsif ( !exists( $trans{'protein_coding'} ) + && $key eq 'nonsense_mediated_decay' + && ( @{ $trans{$key} } == 1 ) ) + { + my $trans_id = $trans{$key}->[0]->dbID; + if ( ( $trans{$key}->[0]->translation ) + && ( $trans{$key}->[0]->translation->seq !~ /\*/ ) ) + { + print "The NMD transcript $trans_id will become " + . "the canonical transcript because there are no " + . "protein coding transcripts.\n"; + if ( $ccds_db && $slice_is_reference ) { + check_Ens_trans_against_CCDS( $trans{$key}->[0], + $ccds_db, $opts->{verbose} ); + } + $canonical{ $gene->dbID } = $trans_id; + print "Transcript " + . $trans{$key}->[0]->display_id + . " of biotype nonsense_mediated_decay" + . " is chosen as the canonical transcript for gene " + . $gene->display_id + . " of biotype " + . $gene->biotype . "\n"; + next GENE; + } else { + carp( "Transcript $trans_id is an NMD with either " + . " no translation or internal stop(s), skipping it...\n" + ); + } + } ## end elsif ( !exists( $trans{'protein_coding'...})) + } # foreach biotype + + my $has_translation = 0; + my $count = 0; + my @with_translation; + my @no_translation; + + foreach my $transcript ( @{$transcripts} ) { + + if ( ( $gene->biotype ne 'pseudogene' ) + && ( $gene->biotype ne 'processed_transcript' ) + && ( $transcript->biotype ne 'polymorphic_pseudogene' ) + && ( $transcript->biotype ne 'processed_transcript' ) + && $transcript->translation + && ( $transcript->translation->seq !~ /\*/ ) ) + + { + push( @with_translation, $transcript ); + } else { + push( @no_translation, $transcript ); + } + } + + my @sorted; + + if (@with_translation) { + + my ( @ccds_prot_coding_len_and_trans, + @merge_prot_coding_len_and_trans, + @prot_coding_len_and_trans ); + my ( @merge_translateable_len_and_trans, + @translateable_len_and_trans ); + my ( @merge_len_and_trans, @len_and_trans ); + + foreach my $trans (@with_translation) { + + print "Looking at Ens trans dbID " + . $trans->dbID + . " :: biotype " + . $trans->biotype + . " :: Transl length " + . $trans->translate->length . "\n" + if ( $opts->{verbose} ); + + my $h = + { trans => $trans, len => $trans->translate->length }; + my $ensembl_trans_found_in_CCDS = 0; + + if ( $trans->biotype eq 'protein_coding' ) { + if ( $ccds_db && $slice_is_reference ) { + $ensembl_trans_found_in_CCDS = + check_Ens_trans_against_CCDS( $trans, $ccds_db, + $opts->{verbose} ); + } + if ($ensembl_trans_found_in_CCDS) { + push @ccds_prot_coding_len_and_trans, $h; + } elsif ( $trans->analysis->logic_name eq + 'ensembl_havana_transcript' ) + { + push @merge_prot_coding_len_and_trans, $h; + } else { + push @prot_coding_len_and_trans, $h; + } + next; + } else { # transcript biotype is not protein_coding but the transcript has translation + if ( $trans->analysis->logic_name eq + 'ensembl_havana_transcript' ) + { + push @merge_translateable_len_and_trans, $h; + } else { + push @translateable_len_and_trans, $h; + } + } + } ## end foreach my $trans (@with_translation) + + my @tmp_sorted; + if (@ccds_prot_coding_len_and_trans) { + @tmp_sorted = + sort { $b->{len} <=> $a->{len} } + @ccds_prot_coding_len_and_trans + ; # sort all the Ens transcripts which matched CCDS by coding length + } elsif (@merge_prot_coding_len_and_trans) { + @tmp_sorted = + sort { $b->{len} <=> $a->{len} } + @merge_prot_coding_len_and_trans; + } elsif (@prot_coding_len_and_trans) { + @tmp_sorted = + sort { $b->{len} <=> $a->{len} } + @prot_coding_len_and_trans; + } elsif (@merge_len_and_trans) { + print "There are no 'protein_coding' labelled transcripts, " + . "will take the longest of merged tranlslateable transcripts\n" + if ( $opts->{verbose} ); + @tmp_sorted = + sort { $b->{len} <=> $a->{len} } + @merge_translateable_len_and_trans; + } else { + print +"There are neither 'protein_coding' labelled transcripts nor merged " + . "translateable transcripts. Will take the longest of any unmerged " + . "translateable transcript.\n" + if ( $opts->{verbose} ); + @tmp_sorted = + sort { $b->{len} <=> $a->{len} } + @translateable_len_and_trans; + } + + foreach my $h (@tmp_sorted) { + print "Adding to sorted " + . $h->{trans}->dbID . " :: " + . $h->{trans}->biotype . "\n" + if ( $opts->{verbose} ); + push @sorted, $h->{trans}; + } + } else { + # we merge genes without a translation too + my ( @merge_len_and_trans, @len_and_trans ); + foreach my $trans (@no_translation) { + if ( $trans->analysis->logic_name eq + 'ensembl_havana_transcript' ) + { + push @merge_len_and_trans, $trans; + } else { + push @len_and_trans, $trans; + } + } + if (@merge_len_and_trans) { + @sorted = + sort { $b->length <=> $a->length } @merge_len_and_trans; + } else { + @sorted = sort { $b->length <=> $a->length } @len_and_trans; + } + } # end of loop for no translation + + # # # + # set canonical transcirpt + # # # + print "Transcript " + . $sorted[0]->display_id + . " of biotype " + . $sorted[0]->biotype + . " is chosen as the canonical transcript for gene " + . $gene->display_id + . " of biotype " + . $gene->biotype . "\n"; + $canonical{ $gene->dbID } = $sorted[0]->dbID; + + if ( !exists $canonical{ $gene->dbID } ) { + throw( "No canonical transcript has been set for gene " + . $gene->dbID ); + } + } ## end foreach my $gene (@$genes) + + foreach my $gene_id ( keys(%canonical) ) { + my $transcript_id = $canonical{$gene_id}; + print "Updating gene $gene_id " + . "with canonical transcript: $transcript_id.\n"; + $gene_sth->execute( $transcript_id, $gene_id ) + if ( $opts->{write} ); + } + } ## end foreach my $slice (@$slices) + +} ## end for ( my $i = 0; $i < scalar...) sub check_if_DB_contains_DNA { - my ($db) = @_; - my $sql_command = "select count(*) from dna"; - my $sth = $db->dbc->prepare($sql_command); - $sth->execute(); - my @dna_array = $sth->fetchrow_array; - if ($dna_array[0] > 0) { - print "Your DB ". $db->dbc->dbname ." contains DNA sequences. No need to attach a ". - "DNA_DB to it.\n" if ($verbose); - return 1; - } else { - print "Your DB ". $db->dbc->dbname ." does not contain DNA sequences.\n" - if ($verbose); - return 0; - } + my ($db) = @_; + my $sql_command = "select count(*) from dna"; + my $sth = $db->dbc->prepare($sql_command); + $sth->execute(); + my @dna_array = $sth->fetchrow_array; + if ( $dna_array[0] > 0 ) { + print "Your DB " + . $db->dbc->dbname + . " contains DNA sequences. No need to attach a " + . "DNA_DB to it.\n" + if ( $opts->{verbose} ); + return 1; + } else { + print "Your DB " + . $db->dbc->dbname + . " does not contain DNA sequences.\n" + if ( $opts->{verbose} ); + return 0; + } } sub check_Ens_trans_against_CCDS { - my ($ensembl_trans, $ccds_db, $verbose) = @_; - - my @ensembl_translateable_exons = @{ $ensembl_trans->get_all_translateable_Exons }; + my ( $ensembl_trans, $ccds_db, $verbose ) = @_; + + my @ensembl_translateable_exons = + @{ $ensembl_trans->get_all_translateable_Exons }; - my $ext_slice = $ccds_db->get_SliceAdaptor->fetch_by_region( 'toplevel', - $ensembl_trans->slice->seq_region_name, - $ensembl_trans->seq_region_start, $ensembl_trans->seq_region_end ); + my $ext_slice = + $ccds_db->get_SliceAdaptor->fetch_by_region( + 'toplevel', + $ensembl_trans->slice->seq_region_name, + $ensembl_trans->seq_region_start, + $ensembl_trans->seq_region_end ); EXT_GENE: foreach my $ext_gene ( @{ $ext_slice->get_all_Genes } ) { - EXT_TRANS: foreach my $ext_trans ( @{ $ext_gene->get_all_Transcripts } ) { - my @ext_exons = @{ $ext_trans->get_all_Exons }; - - if ( scalar(@ensembl_translateable_exons) == scalar(@ext_exons) ) { - for ( my $i = 0 ; $i < scalar(@ensembl_translateable_exons) ; $i++ ) { - if ( $ensembl_translateable_exons[$i]->coding_region_start($ensembl_trans) != $ext_exons[$i]->seq_region_start - || $ensembl_translateable_exons[$i]->strand != $ext_exons[$i]->strand - || $ensembl_translateable_exons[$i]->coding_region_end($ensembl_trans) != $ext_exons[$i]->seq_region_end ) - { - next EXT_TRANS; - } - } - print "Ensembl transcript " . $ensembl_trans->display_id . " found match " . - $ext_gene->display_id . " in CCDS DB.\n"; - return 1; - } - } # end foreach EXT_TRANS - } # end foreach EXT_GENE -} ## end sub + EXT_TRANS: + foreach my $ext_trans ( @{ $ext_gene->get_all_Transcripts } ) { + my @ext_exons = @{ $ext_trans->get_all_Exons }; + + if ( scalar(@ensembl_translateable_exons) == scalar(@ext_exons) ) { + for ( my $i = 0; + $i < scalar(@ensembl_translateable_exons); + $i++ ) + { + if ( $ensembl_translateable_exons[$i] + ->coding_region_start($ensembl_trans) != + $ext_exons[$i]->seq_region_start + || $ensembl_translateable_exons[$i]->strand != + $ext_exons[$i]->strand + || $ensembl_translateable_exons[$i] + ->coding_region_end($ensembl_trans) != + $ext_exons[$i]->seq_region_end ) + { + next EXT_TRANS; + } + } + print "Ensembl transcript " + . $ensembl_trans->display_id + . " found match " + . $ext_gene->display_id + . " in CCDS DB.\n"; + return 1; + } + } # end foreach EXT_TRANS + } # end foreach EXT_GENE +} ## end sub check_Ens_trans_against_CCDS diff --git a/misc-scripts/density_feature/gene_density_calc.pl b/misc-scripts/density_feature/gene_density_calc.pl index 78e9be5e4669ee4760b7c0968db8cea3865a827a..3994db3bb7196fc0b709da2cded8bbfa72467183 100644 --- a/misc-scripts/density_feature/gene_density_calc.pl +++ b/misc-scripts/density_feature/gene_density_calc.pl @@ -1,4 +1,4 @@ -#!/usr/local/ensembl/bin/perl -w +#!/usr/bin/env perl # # script to calculate the gene density features on a database @@ -13,325 +13,299 @@ # 125 toplevel slices... The website will be happy with about 150 bins I # think. - use strict; +use warnings; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::DensityType; use Bio::EnsEMBL::DensityFeature; -use Getopt::Long; use Bio::EnsEMBL::Utils::ConversionSupport; +use Bio::EnsEMBL::Utils::CliHelper; my $bin_count = 150; my $max_slices = 100; -my ( $host, $user, $pass, $port, $dbname, $pattern ); +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); +my $optsd = + [ @{ $cli_helper->get_dba_opts() }, @{ $cli_helper->get_dba_opts('m') } ]; +# add the print option +push(@{$optsd},"print"); +my $opts = $cli_helper->process_args( $optsd, \&usage ); -my ( $mhost, $mport ) = ( 'ens-staging1', '3306' ); -my ( $muser, $mpass ) = ( 'ensro', undef ); -my $mdbname = 'ensembl_production'; +if(!defined $opts->{mdbname} && !defined $opts->{mdbpattern}) { + $opts->{mdbname} = 'ensembl_production'; +} -$port = 3306 ; +if (! defined $opts->{mhost} ) { + ( $opts->{mhost}, $opts->{mport}, $opts->{mdbname}, $opts->{muser} ) = + ( 'ens-staging1', '3306', 'ensembl_production', 'ensro' ); +} -my ( $block_count, $genome_size, $block_size ); +my ($prod_dba) = @{ $cli_helper->get_dbas_for_opts( $opts, 1, 'm' ) }; -GetOptions( - "host|h=s", \$host, - "user|u=s", \$user, - "pass|p=s", \$pass, - "port=i", \$port, - "dbname|d=s", \$dbname, - "pattern=s", \$pattern, - "mhost=s", \$mhost, - "mport=i", \$mport, - "muser=s", \$muser, - "help" , \&usage -); - -usage() if (!$host || !$user || !$pass || (!$dbname && !$pattern)); - -my @dbnames; -if (! $dbname) { - my $dsn = sprintf( 'dbi:mysql:host=%s;port=%d', $host, $port ); - my $dbh = DBI->connect( $dsn, $user, $pass ); - @dbnames = - map { $_->[0] } @{ $dbh->selectall_arrayref('SHOW DATABASES') }; -} -else { - @dbnames = ( $dbname ); +if ( !defined $prod_dba ) { + usage(); } - -foreach my $dbname (@dbnames) { - if ( $pattern && ($dbname !~ /$pattern/) ) { next } - - print STDOUT "Connecting to $dbname\n"; - - my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor( - -host => $host, - -user => $user, - -port => $port, - -pass => $pass, - -dbname => $dbname); - - my $sth = $db->dbc()->prepare( "select count(*) from gene" ); - $sth->execute(); - - my ( $gene_count ) = $sth->fetchrow_array(); - - if( ! $gene_count ) { - print STDERR "No gene density for $dbname.\n"; - exit(); - } -# -# Could be database without seq_regions -# Then have to try and attach core db -# - $sth = $db->dbc()->prepare( "select count(*) from seq_region" ); - $sth->execute(); - my ( $seq_region_count ) = $sth->fetchrow_array(); - if( ! $seq_region_count ) { - # - # for the time being only do core dbs - # no dbs with no seq regions - # - print STDERR "No gene density for $dbname, no seq_regions.\n"; - exit(); - - if( ($dbname =~ /_est_/ ) || ( $dbname =~ /_vega_/ ) || ( $dbname =~ /_cdna_/ ) ) { - my $dna_db_name = $dbname; - $dna_db_name =~ s/(_estgene_|_vega_|_cdna_)/_core_/; - my $dna_db = new Bio::EnsEMBL::DBSQL::DBAdaptor - (-host => $host, - -user => $user, - -port => $port, - -pass => $pass, - -dbname => $dna_db_name ); - print STDOUT "Attaching $dna_db_name to $dbname.\n"; - $db->dnadb( $dna_db ); - } else { - print STDERR "No gene density for $dbname, no seq_regions.\n"; - exit(); - } - } +my ( $block_count, $genome_size, $block_size ); +for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) { + print STDOUT "Connecting to " . $db_args->{-DBNAME} . "\n"; -# -# Get the adaptors needed; -# + my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(%$db_args); + my $dbname = $db->dbc()->dbname(); + my $helper = $db->dbc()->sql_helper(); - print STDOUT "Deleting old knownGeneDensity and geneDensity features\n"; - - $sth = $db->dbc->prepare("DELETE df, dt FROM density_feature df, density_type dt, analysis a WHERE a.analysis_id=dt.analysis_id AND dt.density_type_id=df.density_type_id AND a.logic_name IN ('knowngenedensity', 'genedensity')"); - $sth->execute(); - - my $dfa = $db->get_DensityFeatureAdaptor(); - my $dta = $db->get_DensityTypeAdaptor(); - my $aa = $db->get_AnalysisAdaptor(); - my $slice_adaptor = $db->get_SliceAdaptor(); - - my $support = 'Bio::EnsEMBL::Utils::ConversionSupport'; - my $analysis1 = $aa->fetch_by_logic_name('knowngenedensity'); - my $analysis2 = $aa->fetch_by_logic_name('genedensity'); - - # Master database location: - my ( $muser, $mpass ) = ( 'ensro', undef ); - my $mdbname = 'ensembl_production'; - my $prod_dsn; - my $prod_dbh; - - if ( !defined($analysis1) || !defined($analysis2) ) { - - #get analyses descriptions from the master database - - $prod_dsn = sprintf( 'DBI:mysql:host=%s;port=%d;database=%s', - $mhost, $mport, $mdbname ); - $prod_dbh = DBI->connect( $prod_dsn, $muser, $mpass, - { 'PrintError' => 1, 'RaiseError' => 1 } ); - } - - if ( !defined($analysis1) ) { - - my ($display_label,$description) = $prod_dbh->selectrow_array("select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'knowngenedensity'"); - - $analysis1 = new Bio::EnsEMBL::Analysis( - -program => "gene_density_calc.pl", - -database => "ensembl", - -gff_source => "gene_density_calc.pl", - -gff_feature => "density", - -logic_name => "knowngenedensity", - -description => $description, - -display_label => $display_label, - -displayable => 1 ); - - $aa->store($analysis1); - - } else { - $analysis1->created($support->date()); - $aa->update($analysis1); - } - - - if ( !defined($analysis2) ) { - - my ($display_label,$description) = $prod_dbh->selectrow_array("select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'genedensity'"); - - $analysis2 = new Bio::EnsEMBL::Analysis( - -program => "gene_density_calc.pl", - -database => "ensembl", - -gff_source => "gene_density_calc.pl", - -gff_feature => "density", - -logic_name => "genedensity", - -description => $description, - -display_label => $display_label, - -displayable => 1 ); - - $aa->store($analysis2); - - } else { - $analysis2->created($support->date()); - $aa->update($analysis2); - } - - if ( defined($prod_dbh) ) { - $prod_dbh->disconnect; - } + my $gene_count = + $helper->execute_single_result( +-SQL=>"select count(*) from gene join seq_region using (seq_region_id) join coord_system using (coord_system_id) where species_id=?", + -PARAMS=>[$db->species_id()] ); -# -# Now the actual feature calculation loop -# - my $total_count = 0; - - foreach my $known (1, 0) { - # - # Create new analysis object for density calculation. - # - my $analysis; - - if($known) { - $analysis = $aa->fetch_by_logic_name('knowngenedensity'); - } else { - $analysis = $aa->fetch_by_logic_name('genedensity'); - } - - # Sort slices by coordinate system rank, then by length. - my @sorted_slices = - sort( { $a->coord_system()->rank() <=> $b->coord_system()->rank() - || $b->seq_region_length() <=> $a->seq_region_length() - } @{ $slice_adaptor->fetch_all('toplevel') } ); - - - # - # Create new density type. - # - - my $dt = Bio::EnsEMBL::DensityType->new(-analysis => $analysis, - -region_features => $bin_count, - -value_type => 'sum'); - - $dta->store($dt); - - my $slice_count = 0; - my ( $current, $current_start, $current_end ); - - while ( my $slice = shift @sorted_slices){ - - $block_size = $slice->length() / $bin_count; - - my @density_features;#sf7 - - print STDOUT "Gene densities for ".$slice->seq_region_name(). - " with block size $block_size\n"; - $current_end = 0; - $current = 0; - - while($current_end < $slice->length) { - $current += $block_size; - $current_start = $current_end+1; - $current_end = int( $current + 1 ); - - if( $current_end < $current_start ) { - $current_end = $current_start; - } - - if( $current_end > $slice->end ) { - $current_end = $slice->end; + if ( !$gene_count ) { + print STDERR "No gene density for $dbname.\n"; + exit(); } - - - my $sub_slice = $slice->sub_Slice( $current_start , $current_end ); - - my $count =0; - # - # Store info for genes (ignore pseudo genes) - # + # + # Could be database without seq_regions + # Then have to try and attach core db + # + my $seq_region_count = + $helper->execute_single_result( +-SQL=>"select count(*) from seq_region join coord_system using (coord_system_id) where species_id=?", + -PARAMS=>[$db->species_id()] ); + if ( !$seq_region_count ) { + # + # for the time being only do core dbs + # no dbs with no seq regions + # + print STDERR "No gene density for $dbname, no seq_regions.\n"; + exit(); + + if ( ( $dbname =~ /_est_/ ) + || ( $dbname =~ /_vega_/ ) + || ( $dbname =~ /_cdna_/ ) ) + { + my $dna_db_name = $dbname; + $dna_db_name =~ s/(_estgene_|_vega_|_cdna_)/_core_/; + $db_args->{dbname} = $dna_db_name; + my $dna_db = new Bio::EnsEMBL::DBSQL::DBAdaptor($db_args); + print STDOUT "Attaching $dna_db_name to $dbname.\n"; + $db->dnadb($dna_db); + } else { + print STDERR "No gene density for $dbname, no seq_regions.\n"; + exit(); + } + } - foreach my $gene (@{$sub_slice->get_all_Genes()}){ - if($gene->biotype() !~ /pseudogene/i and $gene->start >=1 ) { - $count++ if(!$known || $gene->is_known()); - } + # + # Get the adaptors needed; + # + + print STDOUT "Deleting old knownGeneDensity and geneDensity features\n"; + + $helper->execute_update( + -SQL=>qq/DELETE df, dt FROM density_feature df, density_type dt, analysis a, + seq_region s, coord_system cs WHERE + df.seq_region_id=s.seq_region_id AND s.coord_system_id=cs.coord_system_id AND cs.species_id=? + AND a.analysis_id=dt.analysis_id AND dt.density_type_id=df.density_type_id + AND a.logic_name IN ('knowngenedensity', 'genedensity')/, -PARAMS=>[$db->species_id()] ); + + my $dfa = $db->get_DensityFeatureAdaptor(); + my $dta = $db->get_DensityTypeAdaptor(); + my $aa = $db->get_AnalysisAdaptor(); + my $slice_adaptor = $db->get_SliceAdaptor(); + + my $support = 'Bio::EnsEMBL::Utils::ConversionSupport'; + my $analysis1 = $aa->fetch_by_logic_name('knowngenedensity'); + my $analysis2 = $aa->fetch_by_logic_name('genedensity'); + + if ( !defined($analysis1) ) { + + my ( $display_label, $description ) = + @{$prod_dba->dbc()->sql_helper()->execute( +-SQL=>"select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'knowngenedensity'" + ) }; + + $analysis1 = + new Bio::EnsEMBL::Analysis( -program => "gene_density_calc.pl", + -database => "ensembl", + -gff_source => "gene_density_calc.pl", + -gff_feature => "density", + -logic_name => "knowngenedensity", + -description => $description, + -display_label => $display_label, + -displayable => 1 ); + + $aa->store($analysis1); + + } else { + $analysis1->created( $support->date() ); + $aa->update($analysis1); } - push @density_features, Bio::EnsEMBL::DensityFeature->new - (-seq_region => $slice, - -start => $current_start, - -end => $current_end, - -density_type => $dt, - -density_value => $count); - if ($count > 0) { - #density features with value = 0 are not stored - $total_count++; + if ( !defined($analysis2) ) { + + my ( $display_label, $description ) = + @{$prod_dba->dbc()->sql_helper()->execute( +-SQL=>"select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'genedensity'" + ) }; + + $analysis2 = + new Bio::EnsEMBL::Analysis( -program => "gene_density_calc.pl", + -database => "ensembl", + -gff_source => "gene_density_calc.pl", + -gff_feature => "density", + -logic_name => "genedensity", + -description => $description, + -display_label => $display_label, + -displayable => 1 ); + + $aa->store($analysis2); + + } else { + $analysis2->created( $support->date() ); + $aa->update($analysis2); } - } - - $dfa->store(@density_features); - last if ( $slice_count++ > $max_slices ); - } + # + # Now the actual feature calculation loop + # + my $total_count = 0; - } - print STDOUT "Created $total_count gene density features\n"; - print STDOUT "Finished with $dbname"; -} + foreach my $known ( 1, 0 ) { + # + # Create new analysis object for density calculation. + # + my $analysis; + + if ($known) { + $analysis = $aa->fetch_by_logic_name('knowngenedensity'); + } else { + $analysis = $aa->fetch_by_logic_name('genedensity'); + } + + # Sort slices by coordinate system rank, then by length. + my @sorted_slices = + sort( { $a->coord_system()->rank() <=> $b->coord_system()->rank() + || $b->seq_region_length() <=> $a->seq_region_length() + } @{ $slice_adaptor->fetch_all('toplevel') } ); + + # + # Create new density type. + # + + my $dt = + Bio::EnsEMBL::DensityType->new( -analysis => $analysis, + -region_features => $bin_count, + -value_type => 'sum' ); + + $dta->store($dt); + + my $slice_count = 0; + my ( $current, $current_start, $current_end ); + while ( my $slice = shift @sorted_slices ) { + + $block_size = $slice->length()/$bin_count; + + my @density_features; #sf7 + + print STDOUT "Gene densities for " + . $slice->seq_region_name() + . " with block size $block_size\n"; + $current_end = 0; + $current = 0; + + while ( $current_end < $slice->length ) { + $current += $block_size; + $current_start = $current_end + 1; + $current_end = int( $current + 1 ); + + if ( $current_end < $current_start ) { + $current_end = $current_start; + } + + if ( $current_end > $slice->end ) { + $current_end = $slice->end; + } + + my $sub_slice = + $slice->sub_Slice( $current_start, $current_end ); + + my $count = 0; + + # + # Store info for genes (ignore pseudo genes) + # + + foreach my $gene ( @{ $sub_slice->get_all_Genes() } ) { + if ( $gene->biotype() !~ /pseudogene/i + and $gene->start >= 1 ) + { + $count++ if ( !$known || $gene->is_known() ); + } + } + + push @density_features, + Bio::EnsEMBL::DensityFeature->new( -seq_region => $slice, + -start => $current_start, + -end => $current_end, + -density_type => $dt, + -density_value => $count ); + + if ($count > 0) { + #density features with value = 0 are not stored + $total_count++; +} + } ## end while ( $current_end < $slice...) + + $dfa->store(@density_features); + + last if ( $slice_count++ > $max_slices ); + } ## end while ( my $slice = shift...) + + } ## end foreach my $known ( 1, 0 ) + print STDOUT "Created $total_count gene density features.\n"; + print STDOUT "Finished with $dbname"; +} ## end for my $db_args ( @{ $cli_helper...}) # # helper to draw an ascii representation of the density features # sub print_features { - my $features = shift; - - return if(!@$features); - - my $sum = 0; - my $length = 0; -# my $type = $features->[0]->{'density_type'}->value_type(); - - print("\n"); - my $max=0; - foreach my $f (@$features) { - if($f->density_value() > $max){ - $max=$f->density_value(); - } - } - if( !$max ) { $max = 1 }; - - foreach my $f (@$features) { - my $i=1; - for(; $i< ($f->density_value()/$max)*40; $i++){ - print "*"; - } - for(my $j=$i;$j<40;$j++){ - print " "; - } - print " ".$f->density_value()."\t".$f->start()."\n"; - } -} + my $features = shift; + + return if ( !@$features ); + + my $sum = 0; + my $length = 0; + # my $type = $features->[0]->{'density_type'}->value_type(); + + print("\n"); + my $max = 0; + foreach my $f (@$features) { + if ( $f->density_value() > $max ) { + $max = $f->density_value(); + } + } + if ( !$max ) { $max = 1 } + + foreach my $f (@$features) { + my $i = 1; + for ( ; $i < ( $f->density_value()/$max )*40; $i++ ) { + print "*"; + } + for ( my $j = $i; $j < 40; $j++ ) { + print " "; + } + print " " . $f->density_value() . "\t" . $f->start() . "\n"; + } +} ## end sub print_features sub usage { - my $indent = ' ' x length($0); - print <<EOF; exit(0); + my $indent = ' ' x length($0); + print <<EOF; exit(0); What does it do? @@ -401,5 +375,5 @@ Usage: EOF -} +} ## end sub usage diff --git a/misc-scripts/density_feature/percent_gc_calc.pl b/misc-scripts/density_feature/percent_gc_calc.pl index 7458fa66edee8f5194934b3f2568657bb53d8eb8..24f8cb1d19ec1c69a9d16167c3a9e365ba431643 100644 --- a/misc-scripts/density_feature/percent_gc_calc.pl +++ b/misc-scripts/density_feature/percent_gc_calc.pl @@ -1,12 +1,12 @@ -#!/usr/local/ensembl/bin/perl -w +#!/usr/bin/env perl # # Calculate the GC content for top level seq_regions # small regions 500bp to be able to display on contigview # big regions genomesize / 4000 for 4000 features on the genome - use strict; +use warnings; #use dbi; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::DBSQL::SliceAdaptor; @@ -16,218 +16,218 @@ use Bio::EnsEMBL::Slice; use Bio::EnsEMBL::Analysis; use Bio::EnsEMBL::DensityType; use Bio::EnsEMBL::DensityFeature; -use Getopt::Long; use Bio::EnsEMBL::Utils::ConversionSupport; - -my ( $host, $user, $pass, $port, $dbname ); - # Master database location: - my ( $mhost, $mport ) = ( 'ens-staging1', '3306' ); - my ( $muser, $mpass ) = ( 'ensro', undef ); - my $mdbname = 'ensembl_production'; - -GetOptions( "host|h=s", \$host, - "user|u=s", \$user, - "pass|p=s", \$pass, - "port=i", \$port, - "dbname|d=s", \$dbname, - "mhost=s", \$mhost, - "mport=i", \$mport, - "muser=s", \$muser, - "help" , \&usage - ); - -usage() if (!$host || !$user || !$pass || !$dbname); - -my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host, - -user => $user, - -port => $port, - -pass => $pass, - -dbname => $dbname); - - -my $bin_count = 150; -my $max_slices = 100; - -# -# Check wether the script should run on given database -# -my $sth = $db->dbc->prepare( "select count(*) from dna" ); -$sth->execute(); -my ( $dna_count ) = $sth->fetchrow_array(); -if( ! $dna_count ) { - print STDERR "No dna, no gc content for $dbname.\n"; - exit(); +use Bio::EnsEMBL::Utils::CliHelper; +use Data::Dumper; + +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); +my $optsd = + [ @{ $cli_helper->get_dba_opts() }, @{ $cli_helper->get_dba_opts('m') } ]; +push( @{$optsd}, "print" ); +my $opts = $cli_helper->process_args( $optsd, \&usage ); +if ( !defined $opts->{mdbname} && !defined $opts->{mdbpattern} ) { + $opts->{mdbname} = 'ensembl_production'; } +if ( !defined $opts->{mhost} ) { + ( $opts->{mhost}, $opts->{mport}, $opts->{mdbname}, $opts->{muser} ) = + ( 'ens-staging1', '3306', 'ensembl_production', 'ensro' ); +} -print STDOUT "Deleting old PercentGC features\n"; -$sth = $db->dbc->prepare( - qq( -DELETE df, dt -FROM density_feature df, density_type dt, analysis a -WHERE a.analysis_id=dt.analysis_id -AND dt.density_type_id=df.density_type_id -AND a.logic_name='percentgc') ); -$sth->execute(); - -# -# Get the adaptors needed; -# - -my $slice_adaptor = $db->get_SliceAdaptor(); -my $dfa = $db->get_DensityFeatureAdaptor(); -my $dta = $db->get_DensityTypeAdaptor(); -my $aa = $db->get_AnalysisAdaptor(); - -# Sort slices by coordinate system rank, then by length. -my @sorted_slices = - sort( { $a->coord_system()->rank() <=> $b->coord_system()->rank() - || $b->seq_region_length() <=> $a->seq_region_length() - } @{ $slice_adaptor->fetch_all('toplevel') } ); - - -my $analysis = $aa->fetch_by_logic_name('percentgc'); +my ($prod_dba) = @{ $cli_helper->get_dbas_for_opts( $opts, 1, 'm' ) }; -if ( !defined($analysis) ) { +if ( !defined $prod_dba ) { + usage(); +} +for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) { + print STDOUT "Connecting to " . $db_args->{-DBNAME} . "\n"; - my $prod_dsn = sprintf( 'DBI:mysql:host=%s;port=%d;database=%s', - $mhost, $mport, $mdbname ); - my $prod_dbh = DBI->connect( $prod_dsn, $muser, $mpass, - { 'PrintError' => 1, 'RaiseError' => 1 } ); + my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(%$db_args); + my $dbname = $db->dbc()->dbname(); - my ($display_label,$description) = $prod_dbh->selectrow_array("select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'percentgc'"); + my $helper = $db->dbc()->sql_helper(); - $prod_dbh->disconnect; + my $bin_count = 150; + my $max_slices = 100; - $analysis = new Bio::EnsEMBL::Analysis( - -program => "percent_gc_calc.pl", - -database => "ensembl", - -gff_source => "percent_gc_calc.pl", - -gff_feature => "density", - -logic_name => "percentgc", - -description => $description, - -display_label => $display_label, - -displayable => 1 ); + # + # Check wether the script should run on given database + # + my $dna_count = $helper->execute_single_result( + -SQL => q/select count(*) from dna join seq_region +using (seq_region_id) join coord_system using (coord_system_id) where species_id=?/, + -PARAMS => [ $db->species_id() ] ); - $aa->store($analysis); + if ( !$dna_count ) { + print STDERR "No dna, no gc content for $dbname.\n"; + exit(); + } + print STDOUT "Deleting old PercentGC features\n"; + $helper->execute_update( + -SQL => qq( +DELETE df, dt +FROM density_feature df, density_type dt, analysis a, +seq_region sr, coord_system c +WHERE +sr.seq_region_id=df.seq_region_id +AND sr.coord_system_id=c.coord_system_id +AND c.species_id=? +AND a.analysis_id=dt.analysis_id +AND dt.density_type_id=df.density_type_id +AND a.logic_name='percentgc'), + -PARAMS => [ $db->species_id() ] ); -} else { + # + # Get the adaptors needed; + # + + my $slice_adaptor = $db->get_SliceAdaptor(); + my $dfa = $db->get_DensityFeatureAdaptor(); + my $dta = $db->get_DensityTypeAdaptor(); + my $aa = $db->get_AnalysisAdaptor(); + + # Sort slices by coordinate system rank, then by length. + my @sorted_slices = + sort( { $a->coord_system()->rank() <=> $b->coord_system()->rank() + || $b->seq_region_length() <=> $a->seq_region_length() + } @{ $slice_adaptor->fetch_all('toplevel') } ); - my $support = 'Bio::EnsEMBL::Utils::ConversionSupport'; - $analysis->created($support->date()); - $aa->update($analysis); + my $analysis = $aa->fetch_by_logic_name('percentgc'); -} + if ( !defined($analysis) ) { -# -# Create new density type. -# + my ( $display_label, $description ) = + @{$prod_dba->dbc()->sql_helper()->execute( + -SQL => +"select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'percentgc'" + ) }; -my $density_type = Bio::EnsEMBL::DensityType->new - (-analysis => $analysis, - -region_features => $bin_count, - -value_type => 'ratio'); + $analysis = + new Bio::EnsEMBL::Analysis( -program => "percent_gc_calc.pl", + -database => "ensembl", + -gff_source => "percent_gc_calc.pl", + -gff_feature => "density", + -logic_name => "percentgc", + -description => $description, + -display_label => $display_label, + -displayable => 1 ); -$dta->store($density_type); + $aa->store($analysis); + } else { -my ( $current_start, $current_end, $current ); -my $slice_count = 0; -my $block_size; + my $support = 'Bio::EnsEMBL::Utils::ConversionSupport'; + $analysis->created( $support->date() ); + $aa->update($analysis); -my $total_count; + } -while ( my $slice = shift @sorted_slices ){ + # + # Create new density type. + # - $block_size = $slice->length() / $bin_count; + my $density_type = + Bio::EnsEMBL::DensityType->new( -analysis => $analysis, + -region_features => $bin_count, + -value_type => 'ratio' ); - my @density_features=(); + $dta->store($density_type); - print STDOUT "GC percentage for ".$slice->seq_region_name(). - " with block size $block_size\n"; + my ( $current_start, $current_end, $current ); + my $slice_count = 0; + my $block_size; + my $total_count; + while ( my $slice = shift @sorted_slices ) { - $current_end = 0; - $current = 0; + $block_size = $slice->length()/$bin_count; - while ($current_end < $slice->length) { + my @density_features = (); - $current += $block_size; - $current_start = $current_end+1; - $current_end = int( $current + 1 ); + print STDOUT "GC percentage for " + . $slice->seq_region_name() + . " with block size $block_size\n"; - if ( $current_end < $current_start ) { - $current_end = $current_start; - } + $current_end = 0; + $current = 0; - if ( $current_end > $slice->end() ) { - $current_end = $slice->end(); - } + while ( $current_end < $slice->length ) { + $current += $block_size; + $current_start = $current_end + 1; + $current_end = int( $current + 1 ); - my $sub_slice = $slice->sub_Slice( $current_start , $current_end ); + if ( $current_end < $current_start ) { + $current_end = $current_start; + } - my $gc = $sub_slice->get_base_count()->{'%gc'}; - my $df = Bio::EnsEMBL::DensityFeature->new - (-seq_region => $slice, - -start => $current_start, - -end => $current_end, - -density_type => $density_type, - -density_value => $gc); - push(@density_features, $df); - if ($gc > 0) { - #density features with value = 0 are not stored - $total_count++; - } + if ( $current_end > $slice->end() ) { + $current_end = $slice->end(); + } - } + my $sub_slice = $slice->sub_Slice( $current_start, $current_end ); - $dfa->store(@density_features); + my $gc = $sub_slice->get_base_count()->{'%gc'}; + my $df = + Bio::EnsEMBL::DensityFeature->new( -seq_region => $slice, + -start => $current_start, + -end => $current_end, + -density_type => $density_type, + -density_value => $gc ); +#print join ("\t", $slice, $current_start, $current_end, $density_type, $gc, "\n"); + push( @density_features, $df ); + $dfa->store($df); + if ( $gc > 0 ) { + #density features with value = 0 are not stored + $total_count++; + } + } ## end while ( $current_end < $slice...) - last if ( $slice_count++ > $max_slices ); -} + $dfa->store(@density_features); -print STDOUT "Created $total_count percent gc density features\n"; + last if ( $slice_count++ > $max_slices ); + } ## end while ( my $slice = shift...) + print STDOUT "Created $total_count percent gc density features\n"; +} ## end for my $db_args ( @{ $cli_helper...}) # # helper to draw an ascii representation of the density features # sub print_features { - my $features = shift; - - return if(!@$features); - - my $sum = 0; - my $length = 0; -# my $type = $features->[0]->{'density_type'}->value_type(); - - print("\n"); - my $max=0; - foreach my $f (@$features) { - if($f->density_value() > $max){ - $max=$f->density_value(); - } - } - foreach my $f (@$features) { - my $i=1; - for(; $i< ($f->density_value()/$max)*40; $i++){ - print "*"; - } - for(my $j=$i;$j<40;$j++){ - print " "; - } - print " ".$f->density_value()."\t".$f->start()."\n"; - } -# my $avg = undef; -# $avg = $sum/$length if($length < 0); -# print("Sum=$sum, Length=$length, Avg/Base=$sum"); -} - + my $features = shift; + + return if ( !@$features ); + + my $sum = 0; + my $length = 0; + # my $type = $features->[0]->{'density_type'}->value_type(); + + print("\n"); + my $max = 0; + foreach my $f (@$features) { + if ( $f->density_value() > $max ) { + $max = $f->density_value(); + } + } + foreach my $f (@$features) { + my $i = 1; + for ( ; $i < ( $f->density_value()/$max )*40; $i++ ) { + print "*"; + } + for ( my $j = $i; $j < 40; $j++ ) { + print " "; + } + print " " . $f->density_value() . "\t" . $f->start() . "\n"; + } + # my $avg = undef; + # $avg = $sum/$length if($length < 0); + # print("Sum=$sum, Length=$length, Avg/Base=$sum"); +} ## end sub print_features sub usage { - my $indent = ' ' x length($0); - print <<EOF; exit(0); + my $indent = ' ' x length($0); + print <<EOF; exit(0); What does it do? @@ -290,8 +290,5 @@ Usage: EOF -} - - - +} ## end sub usage diff --git a/misc-scripts/density_feature/repeat_coverage_calc.pl b/misc-scripts/density_feature/repeat_coverage_calc.pl index a676271f0a52bc229f59c507db9ec096972fc8bc..d9347f85eb42799b44b7e1e7d98c3e546f72db9c 100644 --- a/misc-scripts/density_feature/repeat_coverage_calc.pl +++ b/misc-scripts/density_feature/repeat_coverage_calc.pl @@ -1,4 +1,4 @@ -#!/usr/local/ensembl/bin/perl -w +#!/usr/bin/env perl # # Calculate the repeat coverage for given database. # condition: 1k blocks to show contigview displays @@ -21,284 +21,300 @@ use Bio::EnsEMBL::Utils::ConversionSupport; use POSIX; use strict; +use warnings; use Getopt::Long; +use Bio::EnsEMBL::Utils::CliHelper; + +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); +my $optsd = + [ @{ $cli_helper->get_dba_opts() }, @{ $cli_helper->get_dba_opts('m') } ]; +# add the print option +push(@{$optsd},"print"); +my $opts = $cli_helper->process_args( $optsd, \&usage ); +if(!defined $opts->{mdbname} && !defined $opts->{mdbpattern}) { + $opts->{mdbname} = 'ensembl_production'; +} -my ( $host, $user, $pass, $port, $dbname ); - -# Master database location: -my ( $mhost, $mport ) = ( 'ens-staging1', '3306' ); -my ( $muser, $mpass ) = ( 'ensro', undef ); -my $mdbname = 'ensembl_production'; - +if (! defined $opts->{mhost} ) { + ( $opts->{mhost}, $opts->{mport}, $opts->{mdbname}, $opts->{muser} ) = + ( 'ens-staging1', '3306', 'ensembl_production', 'ensro' ); +} -GetOptions( "host|h=s", \$host, - "user|u=s", \$user, - "pass|p=s", \$pass, - "port=i", \$port, - "dbname|d=s", \$dbname, - "mhost=s", \$mhost, - "mport=i", \$mport, - "muser=s", \$muser, - "help" , \&usage - ); +my ($prod_dba) = @{ $cli_helper->get_dbas_for_opts( $opts, 1, 'm' ) }; -usage() if (!$host || !$user || !$pass || !$dbname); +if ( !defined $prod_dba ) { + usage(); +} -my $chunksize = 1_000_000; +my $chunksize = 1_000_000; my $small_blocksize = 1_000; -my $bin_count = 150; -my $max_top_slice = 100; +my $bin_count = 150; +my $max_top_slice = 100; -my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host, - -user => $user, - -port => $port, - -pass => $pass, - -dbname => $dbname); +for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) { + print STDOUT "Connecting to " . $db_args->{-DBNAME} . "\n"; + my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(%$db_args); + my $dbname = $db->dbc()->dbname(); + my $helper = $db->dbc()->sql_helper(); -my $sth = $db->dbc()->prepare( "select count(*) from repeat_feature" ); -$sth->execute(); + my $repeat_count = $helper->execute_simple( + -SQL=>q/select count(*) from repeat_feature join seq_region +using (seq_region_id) join coord_system using (coord_system_id) where species_id=?/, + -PARAMS=>[$db->species_id()] ); -my ( $repeat_count ) = $sth->fetchrow_array(); + if ( !$repeat_count ) { + print STDERR "No repeat density for $dbname.\n"; + exit(); + } -if( ! $repeat_count ) { - print STDERR "No repeat density for $dbname.\n"; - exit(); -} - -# -# Get the adaptors needed; -# + # + # Get the adaptors needed; + # # # Clean up old features first. Also remove analysis and density type entry as these are recreated. # -print STDOUT "Deleting old PercentageRepeat features\n"; -$sth = $db->dbc->prepare("DELETE df, dt FROM density_feature df, density_type dt, analysis a WHERE a.analysis_id=dt.analysis_id AND dt.density_type_id=df.density_type_id AND a.logic_name='percentagerepeat'"); -$sth->execute(); - - -my $slice_adaptor = $db->get_SliceAdaptor(); -my $dfa = $db->get_DensityFeatureAdaptor(); -my $dta = $db->get_DensityTypeAdaptor(); -my $aa = $db->get_AnalysisAdaptor(); - - -my $analysis = $aa->fetch_by_logic_name('percentagerepeat'); - - -if ( !defined($analysis) ) { - - - my $prod_dsn = sprintf( 'DBI:mysql:host=%s;port=%d;database=%s', - $mhost, $mport, $mdbname ); - my $prod_dbh = DBI->connect( $prod_dsn, $muser, $mpass, - { 'PrintError' => 1, 'RaiseError' => 1 } ); - - my ($display_label,$description) = $prod_dbh->selectrow_array("select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'percentagerepeat'"); - - $prod_dbh->disconnect; - - $analysis = new Bio::EnsEMBL::Analysis( - -program => "repeat_coverage_calc.pl", - -database => "ensembl", - -gff_source => "repeat_coverage_calc.pl", - -gff_feature => "density", - -logic_name => "percentagerepeat", - -description => $description, - -display_label => $display_label, - -displayable => 1 ); - - $aa->store($analysis); -} else { - - my $support = 'Bio::EnsEMBL::Utils::ConversionSupport'; - $analysis->created($support->date()); - $aa->update($analysis); - -} - -my $slices = $slice_adaptor->fetch_all( "toplevel" ); -my @sorted_slices = sort { $b->seq_region_length() <=> $a->seq_region_length() } @$slices; - -my $small_density_type = Bio::EnsEMBL::DensityType->new - (-analysis => $analysis, - -block_size => $small_blocksize, - -value_type => 'ratio'); - -my $variable_density_type = Bio::EnsEMBL::DensityType->new - (-analysis => $analysis, - -region_features => $bin_count, - -value_type => 'ratio'); - -$dta->store($small_density_type); -$dta->store($variable_density_type); - - -my $slice_count = 0; - - -while ( my $slice = shift @sorted_slices ) { - - # - # do it for small and large blocks - # - print STDOUT ("Working on seq_region ".$slice->seq_region_name()." length ".$slice->seq_region_length()); - - my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); - my $chunk_end = 0; - my $variable_end = 0; - my $small_end = 0; - my $small_start = 0; - my $repeat_size; - my $variable_start = 0; - my $variable_blocksize = POSIX::ceil( $slice->seq_region_length() / - $variable_density_type->region_features()); - $slice_count++; - - while( $chunk_end < $slice->length() ) { - my $chunk_start = $chunk_end+1; - $chunk_end += $chunksize; - $chunk_end = $slice->length() if $chunk_end > $slice->length(); - - register( $rr, $slice, $chunk_start, $chunk_end ); - - my @dfs = (); - - if( $slice_count < $max_top_slice ) { - while ( $variable_end+$variable_blocksize <= $chunk_end ) { - # here we can do the variable sized repeat densities - $variable_start = $variable_end+1; - $variable_end += $variable_blocksize; - - $repeat_size = $rr->overlap_size( "1", $variable_start, $variable_end ); - my $percentage_repeat = $repeat_size / $variable_blocksize * 100; - - push( @dfs, Bio::EnsEMBL::DensityFeature->new - (-seq_region => $slice, - -start => $variable_start, - -end => $variable_end, - -density_type => $variable_density_type, - -density_value => $percentage_repeat)); - - } - } - - while ( $small_end + $small_blocksize <= $chunk_end ) { - # here we can do the small sized density features - $small_start = $small_end+1; - $small_end += $small_blocksize; - - $repeat_size = $rr->overlap_size( "1", $small_start, $small_end ); - my $percentage_repeat = $repeat_size / $small_blocksize * 100; - - push( @dfs, Bio::EnsEMBL::DensityFeature->new - (-seq_region => $slice, - -start => $small_start, - -end => $small_end, - -density_type => $small_density_type, - -density_value => $percentage_repeat)); - } - - if (@dfs) { - $dfa->store(@dfs); - } else { - warning("No repeat density calculated for ".$slice->name." (chunk start $chunk_start, chunk end $chunk_end)."); - } - - my $used_lower_limit = $small_start < $variable_start ? $small_start : $variable_start; - - # here some rr cleanup - $rr->check_and_register( "1", 0, $used_lower_limit ); - } - - # missing the last bits - if( $small_end < $slice->length() ) { - $small_start = $small_end+1; - $small_end = $slice->length(); - - $repeat_size = $rr->overlap_size( "1", $small_start, $small_end ); - my $percentage_repeat = $repeat_size / ($small_end - $small_start + 1 ) * 100; - - $dfa->store( Bio::EnsEMBL::DensityFeature->new - (-seq_region => $slice, - -start => $small_start, - -end => $small_end, - -density_type => $small_density_type, - -density_value => $percentage_repeat)); - } - - if( $variable_end < $slice->length() && $slice_count < $max_top_slice ) { - $variable_start = $variable_end+1; - $variable_end = $slice->length(); - - $repeat_size = $rr->overlap_size( "1", $variable_start, $variable_end ); - my $percentage_repeat = $repeat_size / ($variable_end - $variable_start+1) * 100; - - $dfa->store( Bio::EnsEMBL::DensityFeature->new - (-seq_region => $slice, - -start => $variable_start, - -end => $variable_end, - -density_type => $variable_density_type, - -density_value => $percentage_repeat)); - } - - print STDOUT " DONE.\n"; -} - + print STDOUT "Deleting old PercentageRepeat features\n"; + $helper->execute_update( +-SQL=>q/DELETE df, dt FROM density_feature df, density_type dt, analysis a, seq_region sr, coord_system c +WHERE +sr.seq_region_id=df.seq_region_id AND c.coord_system_id=sr.coord_system_id AND c.species_id=? +AND a.analysis_id=dt.analysis_id AND dt.density_type_id=df.density_type_id AND a.logic_name='percentagerepeat'/, + -PARAMS=>[$db->species_id()] ); + + my $slice_adaptor = $db->get_SliceAdaptor(); + my $dfa = $db->get_DensityFeatureAdaptor(); + my $dta = $db->get_DensityTypeAdaptor(); + my $aa = $db->get_AnalysisAdaptor(); + + my $analysis = $aa->fetch_by_logic_name('percentagerepeat'); + + if ( !defined($analysis) ) { + + my ( $display_label, $description ) = + @{$prod_dba->dbc()->sql_helper()->execute( +-SQL=>"select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'percentagerepeat'" + ) }; + + $analysis = + new Bio::EnsEMBL::Analysis( -program => "repeat_coverage_calc.pl", + -database => "ensembl", + -gff_source => "repeat_coverage_calc.pl", + -gff_feature => "density", + -logic_name => "percentagerepeat", + -description => $description, + -display_label => $display_label, + -displayable => 1 ); + + $aa->store($analysis); + } else { + + my $support = 'Bio::EnsEMBL::Utils::ConversionSupport'; + $analysis->created( $support->date() ); + $aa->update($analysis); + + } + + my $slices = $slice_adaptor->fetch_all("toplevel"); + my @sorted_slices = + sort { $b->seq_region_length() <=> $a->seq_region_length() } @$slices; + + my $small_density_type = Bio::EnsEMBL::DensityType->new( + -analysis => $analysis, + -block_size => $small_blocksize, + -value_type => 'ratio' ); + + my $variable_density_type = + Bio::EnsEMBL::DensityType->new( -analysis => $analysis, + -region_features => $bin_count, + -value_type => 'ratio' ); + + $dta->store($small_density_type); + $dta->store($variable_density_type); + + my $slice_count = 0; + + while ( my $slice = shift @sorted_slices ) { + + # + # do it for small and large blocks + # + print STDOUT ( "Working on seq_region " + . $slice->seq_region_name() + . " length " + . $slice->seq_region_length() ); + + my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + my $chunk_end = 0; + my $variable_end = 0; + my $small_end = 0; + my $small_start = 0; + my $repeat_size; + my $variable_start = 0; + my $variable_blocksize = + POSIX::ceil( $slice->seq_region_length()/ + $variable_density_type->region_features() ); + $slice_count++; + + while ( $chunk_end < $slice->length() ) { + my $chunk_start = $chunk_end + 1; + $chunk_end += $chunksize; + $chunk_end = $slice->length() if $chunk_end > $slice->length(); + + register( $rr, $slice, $chunk_start, $chunk_end ); + + my @dfs = (); + + if ( $slice_count < $max_top_slice ) { + while ( $variable_end + $variable_blocksize <= $chunk_end ) { + # here we can do the variable sized repeat densities + $variable_start = $variable_end + 1; + $variable_end += $variable_blocksize; + + $repeat_size = + $rr->overlap_size( "1", $variable_start, $variable_end ); + my $percentage_repeat = + $repeat_size/$variable_blocksize*100; + + push( @dfs, + Bio::EnsEMBL::DensityFeature->new( + -seq_region => $slice, + -start => $variable_start, + -end => $variable_end, + -density_type => $variable_density_type, + -density_value => $percentage_repeat ) + ); + + } + } + + while ( $small_end + $small_blocksize <= $chunk_end ) { + # here we can do the small sized density features + $small_start = $small_end + 1; + $small_end += $small_blocksize; + + $repeat_size = + $rr->overlap_size( "1", $small_start, $small_end ); + my $percentage_repeat = $repeat_size/$small_blocksize*100; + + push( @dfs, + Bio::EnsEMBL::DensityFeature->new( + -seq_region => $slice, + -start => $small_start, + -end => $small_end, + -density_type => $small_density_type, + -density_value => $percentage_repeat + ) ); + } + + if (@dfs) { + $dfa->store(@dfs); + } else { + warning( "No repeat density calculated for " + . $slice->name + . " (chunk start $chunk_start, chunk end $chunk_end)." ); + } + + my $used_lower_limit = + $small_start < $variable_start ? $small_start : $variable_start; + + # here some rr cleanup + $rr->check_and_register( "1", 0, $used_lower_limit ); + } ## end while ( $chunk_end < $slice...) + + # missing the last bits + if ( $small_end < $slice->length() ) { + $small_start = $small_end + 1; + $small_end = $slice->length(); + + $repeat_size = $rr->overlap_size( "1", $small_start, $small_end ); + my $percentage_repeat = + $repeat_size/( $small_end - $small_start + 1 )*100; + + $dfa->store( Bio::EnsEMBL::DensityFeature->new( + -seq_region => $slice, + -start => $small_start, + -end => $small_end, + -density_type => $small_density_type, + -density_value => $percentage_repeat + ) ); + } + + if ( $variable_end < $slice->length() && $slice_count < $max_top_slice ) + { + $variable_start = $variable_end + 1; + $variable_end = $slice->length(); + + $repeat_size = + $rr->overlap_size( "1", $variable_start, $variable_end ); + my $percentage_repeat = + $repeat_size/( $variable_end - $variable_start + 1 )*100; + + $dfa->store( Bio::EnsEMBL::DensityFeature->new( + -seq_region => $slice, + -start => $variable_start, + -end => $variable_end, + -density_type => $variable_density_type, + -density_value => $percentage_repeat ) + ); + } + + print STDOUT " DONE.\n"; + } ## end while ( my $slice = shift...) +} ## end for my $db_args ( @{ $cli_helper...}) sub register { - my ($rr, $slice, $start, $end ) = @_; - - my $subSlice = $slice->sub_Slice( $start, $end ); - my $repeats = $subSlice->get_all_RepeatFeatures(); - for my $repeat ( @$repeats ) { - $rr->check_and_register( "1", $repeat->seq_region_start(), $repeat->seq_region_end() ); - } + my ( $rr, $slice, $start, $end ) = @_; + + my $subSlice = $slice->sub_Slice( $start, $end ); + my $repeats = $subSlice->get_all_RepeatFeatures(); + for my $repeat (@$repeats) { + $rr->check_and_register( "1", + $repeat->seq_region_start(), + $repeat->seq_region_end() ); + } } - - # # helper to draw an ascii representation of the density features # sub print_features { - my $features = shift; - - return if(!@$features); - - my $sum = 0; - my $length = 0; -# my $type = $features->[0]->{'density_type'}->value_type(); - - print("\n"); - my $max=0; - foreach my $f (@$features) { - if($f->density_value() > $max){ - $max=$f->density_value(); - } - } - foreach my $f (@$features) { - my $i=1; - for(; $i< ($f->density_value()/$max)*40; $i++){ - print "*"; - } - for(my $j=$i;$j<40;$j++){ - print " "; - } - print " ".$f->density_value()."\t".$f->start()."\n"; - } -# my $avg = undef; -# $avg = $sum/$length if($length < 0); -# print("Sum=$sum, Length=$length, Avg/Base=$sum"); -} - + my $features = shift; + + return if ( !@$features ); + + my $sum = 0; + my $length = 0; + # my $type = $features->[0]->{'density_type'}->value_type(); + + print("\n"); + my $max = 0; + foreach my $f (@$features) { + if ( $f->density_value() > $max ) { + $max = $f->density_value(); + } + } + foreach my $f (@$features) { + my $i = 1; + for ( ; $i < ( $f->density_value()/$max )*40; $i++ ) { + print "*"; + } + for ( my $j = $i; $j < 40; $j++ ) { + print " "; + } + print " " . $f->density_value() . "\t" . $f->start() . "\n"; + } + # my $avg = undef; + # $avg = $sum/$length if($length < 0); + # print("Sum=$sum, Length=$length, Avg/Base=$sum"); +} ## end sub print_features sub usage { - my $indent = ' ' x length($0); - print <<EOF; exit(0); + my $indent = ' ' x length($0); + print <<EOF; exit(0); What does it do? @@ -374,9 +390,5 @@ Usage: EOF -} - - - - +} ## end sub usage diff --git a/misc-scripts/density_feature/seq_region_stats.pl b/misc-scripts/density_feature/seq_region_stats.pl index 619db2ecf4ce6befa7e8f4a7507234b74555040e..8c36d5b99a1873ddf225c6b1c84d0e4f581269a9 100644 --- a/misc-scripts/density_feature/seq_region_stats.pl +++ b/misc-scripts/density_feature/seq_region_stats.pl @@ -6,266 +6,251 @@ use warnings; use Bio::EnsEMBL::Registry; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor; -use Getopt::Long; +use Bio::EnsEMBL::Utils::CliHelper; +use Data::Dumper; + +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); +my $optsd = + [ @{ $cli_helper->get_dba_opts() }, @{ $cli_helper->get_dba_opts('m') } ]; +# add the print option +push( @{$optsd}, "stats|s:s" ); +push(@{$optsd},"print"); +my $opts = $cli_helper->process_args( $optsd, \&usage ); + +usage() if ( !$opts->{stats} || $opts->{stats} !~ /^(gene|snp)$/ ); +if(!defined $opts->{mdbname} && !defined $opts->{mdbpattern}) { + $opts->{mdbname} = 'ensembl_production'; +} -my ( $host, $user, $pass, $port, $dbname, $pattern, $stats ); -# Master database location: -my ( $mhost, $mport ) = ( 'ens-staging1', '3306' ); -my ( $muser, $mpass ) = ( 'ensro', undef ); -my $mdbname = 'ensembl_production'; +if ( !defined $opts->{mhost} ) { + ( $opts->{mhost}, $opts->{mport}, $opts->{mdbname}, $opts->{muser} ) = + ( 'ens-staging1', '3306', 'ensembl_production', 'ensro' ); +} -GetOptions( "host|h=s", \$host, - "user|u=s", \$user, - "pass|p=s", \$pass, - "port=i", \$port, - "dbname|d=s", \$dbname, - "pattern=s", \$pattern, - "stats|s=s", \$stats, - "mhost=s", \$mhost, - "mport=i", \$mport, - "muser=s", \$muser, - "help" , \&usage - ); +my ($prod_dba) = @{ $cli_helper->get_dbas_for_opts( $opts, 1, 'm' ) }; -usage() if (!$host || !$user || !$pass || (!$dbname && !$pattern) || !$stats || $stats !~ /^(gene|snp)$/ ); +if ( !defined $prod_dba ) { + usage(); +} +my %attrib_codes = %{ + $prod_dba->dbc()->sql_helper()->execute_into_hash( + -SQL => +"select distinct b.name, code from biotype b join attrib_type using(attrib_type_id) where is_current = 1 and db_type like '%core%' and object_type = 'gene' order by b.name" + ), + , + { Columns => [ 1, 2 ] } }; -#get biotypes and attrib codes from the production database +#add known and novel protein coding attrib types +$attrib_codes{'known protein_coding'} = 'GeneNo_knwCod'; +$attrib_codes{'novel protein_coding'} = 'GeneNo_novCod'; -my $prod_dsn = sprintf( 'DBI:mysql:host=%s;port=%d;database=%s', - $mhost, $mport, $mdbname ); -my $prod_dbh = DBI->connect( $prod_dsn, $muser, $mpass, - { 'PrintError' => 1, 'RaiseError' => 1 } ); +my $genestats = 1 if ( $opts->{stats} eq 'gene' ); +my $snpstats = 1 if ( $opts->{stats} eq 'snp' ); +for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) { + + my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(%$db_args); + my $dbname = $db->dbc()->dbname(); + + my $total_count = 0; + # delete old attributes before starting + if ($genestats) { + foreach my $code ( values %attrib_codes ) { + $db->dbc()->sql_helper()->execute_update( + -SQL => +"DELETE sa FROM seq_region_attrib sa, attrib_type at, seq_region s, coord_system c WHERE s.seq_region_id=sa.seq_region_id AND c.coord_system_id=s.coord_system_id AND at.attrib_type_id=sa.attrib_type_id AND at.code=? AND c.species_id=?", + -PARAMS => [ $code, $db->species_id() ] ); + } + } -my $attrib_codes_ref = $prod_dbh->selectcol_arrayref("select distinct b.name, code from biotype b join attrib_type using(attrib_type_id) where is_current = 1 and db_type like '%core%' and object_type = 'gene' order by b.name", { Columns=>[1,2] }); + if ($snpstats) { + $db->dbc()->sql_helper()->execute_update( + -SQL => +"DELETE sa FROM seq_region_attrib sa, attrib_type at, seq_region s, coord_system c WHERE s.seq_region_id=sa.seq_region_id AND c.coord_system_id=s.coord_system_id AND at.attrib_type_id=sa.attrib_type_id AND at.code=? AND c.species_id=?", + -PARAMS => [ "SNPCount", $db->species_id() ] ); + } -my %attrib_codes = @$attrib_codes_ref; + # + # Only run on database with genes + # + + my $genes_present; + + if ($genestats) { + my $gene_count = + $db->dbc()->sql_helper()->execute_single_result( + -SQL => +"select count(*) from gene join seq_region using (seq_region_id) join coord_system using (coord_system_id) where species_id=?", + -PARAMs => [ $db->species_id() ] ); + $genes_present = ($gene_count) ? 1 : 0; + } else { + $genes_present = 0; + } -$prod_dbh->disconnect; + # + # and seq_regions + # + my $seq_region_count = + $db->dbc()->sql_helper()->execute_single_result( + -SQL => +"select count(*) from seq_region join coord_system using (coord_system_id) where species_id=?", + -PARAMS => [ $db->species_id() ] ); + if ( !$seq_region_count ) { + print STDERR "No seq_regions for $dbname\n"; + exit(); + } -#add known and novel protein coding attrib types -$attrib_codes{'known protein_coding'} = 'GeneNo_knwCod'; -$attrib_codes{'novel protein_coding'} = 'GeneNo_novCod'; + my $snps_present; + my $snp_db; -my @dbnames; -if (! $dbname) { - my $dsn = sprintf( 'dbi:mysql:host=%s;port=%d', $host, $port ); - my $dbh = DBI->connect( $dsn, $user, $pass ); - @dbnames = - map { $_->[0] } @{ $dbh->selectall_arrayref('SHOW DATABASES') }; -} -else { - @dbnames = ( $dbname ) -} + if ($snpstats) { + $snp_db = variation_attach($db); + if ( defined $snp_db ) { $snps_present = 1; } + } -my $genestats = 1 if($stats eq 'gene'); -my $snpstats = 1 if($stats eq 'snp'); - -foreach my $name (@dbnames) { - if ( $pattern && ($name !~ /$pattern/) ) { next } - - printf( "\nConnecting to '%s'\n", $name ); - - my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host, - -user => $user, - -port => $port, - -pass => $pass, - -dbname => $name); - - my $total_count = 0; - # delete old attributes before starting - if ($genestats) { - foreach my $code (values %attrib_codes) { - my $sth = $db->dbc()->prepare( "DELETE sa FROM seq_region_attrib sa, attrib_type at WHERE at.attrib_type_id=sa.attrib_type_id AND at.code=?" ); - $sth->execute($code); - } - } - - if ($snpstats) { - my $sth = $db->dbc()->prepare( "DELETE sa FROM seq_region_attrib sa, attrib_type at WHERE at.attrib_type_id=sa.attrib_type_id AND at.code=?" ); - $sth->execute("SNPCount"); - } + my $slice_adaptor = $db->get_SliceAdaptor(); + my $attrib_adaptor = $db->get_AttributeAdaptor(); -# -# Only run on database with genes -# + # Do not include non-reference sequences ie. haplotypes for human + #my $top_slices = $slice_adaptor->fetch_all( "toplevel" , undef, 1); + my $top_slices = $slice_adaptor->fetch_all("toplevel"); - my $genes_present; - - if($genestats) { - my $sth = $db->dbc()->prepare( "select count(*) from gene" ); - $sth->execute(); - - my ( $gene_count ) = $sth->fetchrow_array(); - - $genes_present = ($gene_count) ? 1 : 0; - } else { - $genes_present = 0; - } - -# -# and seq_regions -# - my $sth = $db->dbc()->prepare( "select count(*) from seq_region" ); - $sth->execute(); - my ( $seq_region_count ) = $sth->fetchrow_array(); - if( ! $seq_region_count ) { - print STDERR "No seq_regions for $dbname.\n"; - exit(); - } - - my $snps_present; - my $snp_db; - - if ($snpstats) { - $snp_db = variation_attach( $db ); - if (defined $snp_db) {$snps_present = 1;} - } - - my $slice_adaptor = $db->get_SliceAdaptor(); - my $attrib_adaptor = $db->get_AttributeAdaptor(); - -# Do not include non-reference sequences ie. haplotypes for human -#my $top_slices = $slice_adaptor->fetch_all( "toplevel" , undef, 1); - my $top_slices = $slice_adaptor->fetch_all( "toplevel" ); - - while (my $slice = shift(@{$top_slices})) { -# print STDERR "Processing seq_region ", $slice->seq_region_name(), "\n"; - - my @attribs; - - if($genes_present) { - - my %attrib_counts; - my %counts; - - my $genes = $slice->get_all_Genes(); - - while (my $gene = shift(@{$genes})) { - - my $biotype = $gene->biotype(); - if( $biotype =~ /coding/i && $biotype !~ /non_/i) { - if($gene->is_known()) { - $biotype = "known ".$biotype; - } else { - $biotype = "novel ".$biotype; - } - } + while ( my $slice = shift( @{$top_slices} ) ) { + # print STDERR "Processing seq_region ", $slice->seq_region_name(), "\n"; - $counts{$biotype}++; + my @attribs; - } + if ($genes_present) { - for my $biotype ( keys %counts ) { - my $attrib_code = $attrib_codes{$biotype}; - if( !$attrib_code ) { - print STDERR "Unspecified biotype \"$biotype\" in database $name.\n"; - next; - } + my %attrib_counts; + my %counts; - $attrib_counts{$attrib_code} += $counts{$biotype}; - - } - - foreach my $attrib_code (keys %attrib_counts) { - my $attrib_code_desc = $attrib_code; - $attrib_code_desc =~ s/GeneNo_//; - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => $attrib_code_desc.' Gene Count', - -CODE => $attrib_code, - -VALUE => $attrib_counts{$attrib_code}, - -DESCRIPTION => 'Number of '.$attrib_code_desc.' Genes'); - - } - - } - - if( $snps_present ) { - my $sth = $snp_db->dbc->prepare("SELECT COUNT(*) FROM variation_feature WHERE seq_region_id = ?"); - $sth->execute($slice->get_seq_region_id); - my $count; - $sth->bind_columns(undef,\$count); - $sth->fetch; - - push @attribs, Bio::EnsEMBL::Attribute->new - (-NAME => 'SNP Count', - -CODE => 'SNPCount', - -VALUE => $count, - -DESCRIPTION => 'Total Number of SNPs'); - - $sth->finish; - } - - $attrib_adaptor->store_on_Slice($slice, \@attribs); - my $slice_attrib_count = @attribs; - $total_count += $slice_attrib_count; -# print_chromo_stats([$slice]); - } - - print STDOUT "Written $total_count seq reqion attributes to database $name on server $host.\n"; + my $genes = $slice->get_all_Genes(); -} + while ( my $gene = shift( @{$genes} ) ) { + my $biotype = $gene->biotype(); + if ( $biotype =~ /coding/i && $biotype !~ /non_/i ) { + if ( $gene->is_known() ) { + $biotype = "known " . $biotype; + } else { + $biotype = "novel " . $biotype; + } + } + $counts{$biotype}++; + + } + + for my $biotype ( keys %counts ) { + my $attrib_code = $attrib_codes{$biotype}; + if ( !$attrib_code ) { + print STDERR + "Unspecified biotype \"$biotype\" in database $dbname.\n"; + next; + } + + $attrib_counts{$attrib_code} += $counts{$biotype}; + + } + + foreach my $attrib_code ( keys %attrib_counts ) { + my $attrib_code_desc = $attrib_code; + $attrib_code_desc =~ s/GeneNo_//; + push @attribs, + Bio::EnsEMBL::Attribute->new( + -NAME => $attrib_code_desc . ' Gene Count', + -CODE => $attrib_code, + -VALUE => $attrib_counts{$attrib_code}, + -DESCRIPTION => 'Number of ' . $attrib_code_desc . ' Genes' + ); + + } + + } ## end if ($genes_present) + + if ($snps_present) { + my $count = + $snp_db->dbc()->sql_helper()->execute_single_result( + -SQL => +"SELECT COUNT(*) FROM variation_feature WHERE seq_region_id = ?", + -PARAMS => [ $slice->get_seq_region_id ] ); + + push @attribs, + Bio::EnsEMBL::Attribute->new( + -NAME => 'SNP Count', + -CODE => 'SNPCount', + -VALUE => $count, + -DESCRIPTION => 'Total Number of SNPs' + ); + } + + $attrib_adaptor->store_on_Slice( $slice, \@attribs ); + my $slice_attrib_count = @attribs; + $total_count += $slice_attrib_count; + # print_chromo_stats([$slice]); + } ## end while ( my $slice = shift...) + + print STDOUT +"Written $total_count seq reqion attributes to database $dbname on server " + . $db->dbc()->host() . "\n"; + +} ## end for my $db_args ( @{ $cli_helper...}) sub print_chromo_stats { - my $chromosomes = shift; - - foreach my $chr (@$chromosomes) { - print "\nchromosome: ",$chr->seq_region_name(),"\n"; - foreach my $attrib (@{$chr->get_all_Attributes()}) { - print " ", $attrib->name(), ": ", $attrib->value(), "\n"; - } - } -} + my $chromosomes = shift; + foreach my $chr (@$chromosomes) { + print "\nchromosome: ", $chr->seq_region_name(), "\n"; + foreach my $attrib ( @{ $chr->get_all_Attributes() } ) { + print " ", $attrib->name(), ": ", $attrib->value(), "\n"; + } + } +} # # tries to attach variation database. # sub variation_attach { - my $db = shift; - - my $core_db_name; - $core_db_name = $db->dbc->dbname(); - if( $core_db_name !~ /_core_/ ) { - return 0; - } - # - # get a lost of all databases on that server - # - my $sth = $db->dbc->prepare( "show databases" ); - $sth->execute(); - my $all_db_names = $sth->fetchall_arrayref(); - my %all_db_names = map {( $_->[0] , 1)} @$all_db_names; - my $snp_db_name = $core_db_name; - $snp_db_name =~ s/_core_/_variation_/; - - - -if( ! exists $all_db_names{ $snp_db_name } ) { - return 0; - } - - # this should register the dbadaptor with the Registry - my $snp_db = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new - ( -host => $db->dbc()->host(), - -user => $db->dbc()->username(), - -pass => $db->dbc()->password(), - -port => $db->dbc()->port(), - -dbname => $snp_db_name, - -group => "variation", - -species => "DEFAULT" - ); - - return $snp_db; -} + my $db = shift; + my $core_db_name; + $core_db_name = $db->dbc->dbname(); + if ( $core_db_name !~ /_core_/ ) { + return 0; + } + # + # get a lost of all databases on that server + # + my $sth = $db->dbc->prepare("show databases"); + $sth->execute(); + my $all_db_names = $sth->fetchall_arrayref(); + my %all_db_names = map { ( $_->[0], 1 ) } @$all_db_names; + my $snp_db_name = $core_db_name; + $snp_db_name =~ s/_core_/_variation_/; + + if ( !exists $all_db_names{$snp_db_name} ) { + return 0; + } + + # this should register the dbadaptor with the Registry + my $snp_db = + Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new( + -host => $db->dbc()->host(), + -user => $db->dbc()->username(), + -pass => $db->dbc()->password(), + -port => $db->dbc()->port(), + -dbname => $snp_db_name, + -group => "variation", + -species => "DEFAULT" ); + + return $snp_db; +} ## end sub variation_attach sub usage { - my $indent = ' ' x length($0); - print <<EOF; exit(0); + my $indent = ' ' x length($0); + print <<EOF; exit(0); For each toplevel slice, count the number of genes for each biotype @@ -359,5 +344,5 @@ Usage: EOF - -} + +} ## end sub usage diff --git a/misc-scripts/gene_gc.pl b/misc-scripts/gene_gc.pl index 6f11ee7695ecd651bacc0422c3a9829377ba689c..770a5367cbbd34dced6887a60b8188058a199137 100644 --- a/misc-scripts/gene_gc.pl +++ b/misc-scripts/gene_gc.pl @@ -1,104 +1,89 @@ +#!/usr/bin/env perl # Calculate per-gene GC content and store as gene attributes +use warnings; use strict; -use DBI; - -use Getopt::Long; - use Bio::EnsEMBL::DBSQL::DBAdaptor; - -my ( $host, $user, $pass, $port, $dbpattern, $print); - -$port = 3306; - -GetOptions( "host|h=s", \$host, - "user|u=s", \$user, - "pass|p=s", \$pass, - "port=i", \$port, - "pattern=s", \$dbpattern, - "print", \$print, - "help" , \&usage - ); - - -usage() if (!$host || !$dbpattern || !$user || !$pass); - -# loop over databases -my $dsn = "DBI:mysql:host=$host"; -$dsn .= ";port=$port" if ($port); - -my $db = DBI->connect($dsn, $user, $pass); - -my @dbnames = map {$_->[0] } @{$db->selectall_arrayref("show databases")}; - -for my $dbname (@dbnames) { - - next if ($dbname !~ /$dbpattern/); - - my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor('-host' => $host, - '-port' => $port, - '-user' => $user, - '-pass' => $pass, - '-dbname' => $dbname, - '-species' => $dbname); - - print STDOUT "$dbname\n"; - - delete_existing($dba) if !($print); - - print STDOUT "Calculating Gene GC attributes\n"; - - my $attribute_adaptor = $dba->get_AttributeAdaptor(); - - my $genes = $dba->get_GeneAdaptor()->fetch_all(); - - my $total_count = 0; - - while (my $gene = shift(@$genes)) { - - my $gc = $gene->feature_Slice()->get_base_count->{'%gc'}; - - if (!$print) { - # attribute types need to be propagated from production database to all dbs - # if the type exists it won't be overwritten - my $attribute = Bio::EnsEMBL::Attribute->new(-CODE => 'GeneGC', - -NAME => 'Gene GC', - -DESCRIPTION => 'Percentage GC content for this gene', - -VALUE => $gc); - my @attributes = ($attribute); - $attribute_adaptor->store_on_Gene($gene->dbID, \@attributes); - - $total_count++; - - } else { - - print $gene->stable_id() . " " . $gc . "\n"; - - } - - } - if (!$print) { - print STDOUT "Written $total_count 'GeneGC' gene attributes to database $dbname on server $host.\n"; - } - -} +use Bio::EnsEMBL::Utils::CliHelper; + +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); + +# get the basic options for connecting to a database server +my $optsd = $cli_helper->get_dba_opts(); +# add the print option +push( @{$optsd}, "print|p" ); +# process the command line with the supplied options plus a help subroutine +my $opts = $cli_helper->process_args( $optsd, \&usage ); + +# use the command line options to get an array of database details +for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) { + + # use the args to create a DBA + my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args} ); + + print STDOUT "Processing species " + . $dba->species_id() + . " from database " + . $dba->dbc()->dbname() + . " on server " + . $dba->dbc()->host() . "\n"; + delete_existing($dba) unless ( $opts->{print} ); + + print STDOUT "Calculating Gene GC attributes\n"; + my $attribute_adaptor = $dba->get_AttributeAdaptor(); + my $total_count = 0; + for my $gene ( @{ $dba->get_GeneAdaptor()->fetch_all() } ) { + my $gc = $gene->feature_Slice()->get_base_count->{'%gc'}; + if ( !$opts->{print} ) { + # attribute types need to be propagated from production database to all dbs + # if the type exists it won't be overwritten + my $attribute = + Bio::EnsEMBL::Attribute->new( + -CODE => 'GeneGC', + -NAME => 'Gene GC', + -DESCRIPTION => 'Percentage GC content for this gene', + -VALUE => $gc ); + my @attributes = ($attribute); + $attribute_adaptor->store_on_Gene( $gene->dbID, \@attributes ); + $total_count++; + } else { + print $gene->stable_id() . " " . $gc . "\n"; + } + } + + if ( !$opts->{print} ) { + print STDOUT "Written $total_count 'GeneGC' gene attributes to species " + . $dba->species_id() + . " from database " + . $dba->dbc()->dbname() + . " on server " + . $dba->dbc()->host() . "\n"; + } + +} ## end for my $db_args ( @{ $cli_helper...}) # ---------------------------------------------------------------------- sub delete_existing { - my $dba = shift; + my $dba = shift; - print STDOUT "Deleting existing 'GeneGC' gene attributes\n"; - my $dsth = $dba->dbc()->prepare("DELETE ga FROM gene_attrib ga, attrib_type at WHERE at.attrib_type_id=ga.attrib_type_id AND at.code='GeneGC'"); - $dsth->execute(); + print STDOUT "Deleting existing 'GeneGC' gene attributes\n"; + $dba->dbc()->sql_helper()->execute_update( + -SQL => +q/DELETE ga FROM gene_attrib ga, attrib_type at, gene g, seq_region s, coord_system c +WHERE at.attrib_type_id=ga.attrib_type_id AND at.code='GeneGC' +AND ga.gene_id=g.gene_id AND g.seq_region_id=s.seq_region_id +AND c.coord_system_id=s.coord_system_id AND c.species_id=? +/, + PARAMS => [ $dba->species_id() ] ); + return; } - sub usage { - my $indent = ' ' x length($0); - print <<EOF; exit(0); + my $indent = ' ' x length($0); + print <<EOF; exit(0); What does it do? @@ -149,4 +134,4 @@ Usage: EOF -} +} ## end sub usage diff --git a/misc-scripts/meta_coord/update_meta_coord.pl b/misc-scripts/meta_coord/update_meta_coord.pl index 6160556d0ee5debfd55a658df46c73399486e85b..b985b18a750238e870b3651efbbd2e6335200c74 100755 --- a/misc-scripts/meta_coord/update_meta_coord.pl +++ b/misc-scripts/meta_coord/update_meta_coord.pl @@ -5,6 +5,7 @@ use warnings; use Bio::EnsEMBL::DBSQL::DBConnection; use Getopt::Long; +use Bio::EnsEMBL::Utils::CliHelper; my $help = 0; my ( $host, $port, $user, $pass, $dbpattern ); @@ -32,7 +33,7 @@ my @table_names = qw( ); sub usage { - print <<USAGE_END; + print <<USAGE_END; USAGE: $0 --dbhost=ens-staging1 [--dbport=3306] \\ @@ -52,81 +53,74 @@ data in the following tables: USAGE_END - print( "\t", join( "\n\t", @table_names ), "\n" ); + print( "\t", join( "\n\t", @table_names ), "\n" ); } if ( scalar(@ARGV) == 0 ) { - usage(); - exit 0; + usage(); + exit 0; } -if ( !GetOptions( 'help!' => \$help, - 'dbhost|host=s' => \$host, - 'dbport|port=i' => \$port, - 'dbuser|user=s' => \$user, - 'dbpass|password|pass=s' => \$pass, - 'dbpattern=s' => \$dbpattern - ) || - $help ) -{ - usage(); - exit; -} - -my $dsn = "DBI:mysql:host=$host;port=$port"; - -my $db = DBI->connect( $dsn, $user, $pass ); - -my @dbnames = - map { $_->[0] } @{ $db->selectall_arrayref("SHOW DATABASES") }; - -for my $dbname (@dbnames) { - - if ( $dbname !~ /$dbpattern/ ) { next } - - print("==> Looking at $dbname...\n"); - - my $dbc = - new Bio::EnsEMBL::DBSQL::DBConnection( -host => $host, - -port => $port, - -user => $user, - -pass => $pass, - -dbname => $dbname ); - - if ( system( "mysql --host=$host --port=$port " . - "--user=$user --password='$pass' " . - "--database=$dbname --skip-column-names " . - "--execute='SELECT * FROM meta_coord'" . - ">$dbname.meta_coord.backup" ) ) - { - warn( "Can't dump the original meta_coord for back up " . - "(skipping this database)\n" ); - next; - } - else { - print( "Original meta_coord table backed up in \n" . - "\t$dbname.meta_coord.backup\n" ); - } - - foreach my $table_name (@table_names) { - print("Updating $table_name table entries... "); - - my $sql = "DELETE FROM meta_coord WHERE table_name = '$table_name'"; - $dbc->do($sql); - - $sql = - "INSERT INTO meta_coord " . - "SELECT '$table_name', s.coord_system_id, " . - "MAX( t.seq_region_end - t.seq_region_start + 1 ) " . - "FROM $table_name t JOIN seq_region s USING (seq_region_id) " . - "GROUP BY s.coord_system_id"; - $dbc->do($sql); - - print("done\n"); - } - - print("==> Done with $dbname\n"); -} ## end for my $dbname (@dbnames) +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); + +# get the basic options for connecting to a database server +my $optsd = $cli_helper->get_dba_opts(); +# process the command line with the supplied options plus a help subroutine +my $opts = $cli_helper->process_args( $optsd, \&usage ); + +# use the command line options to get an array of database details +# only process each database name once (to avoid duplication for multispecies dbs) +for my $db_args ( @{ $cli_helper->get_dba_args_for_opts( $opts, 1 ) } ) { + + my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor(%$db_args); + + my $file = + $dba->dbc()->dbname() . "_" . $dba->species_id() . ".meta_coord.backup"; + my $sys_call = sprintf( "mysql " + . "--host=%s " + . "--port=%d " + . "--user=%s " + . "--pass='%s' " + . "--database=%s " + . "--skip-column-names " + . " --execute='SELECT * FROM meta_coord'" + . " > $file", + $dba->dbc->host(), $dba->dbc->port(), + $dba->dbc->username(), $dba->dbc->password(), + $dba->dbc->dbname() ); + unless ( system($sys_call) == 0 ) { + warn "Can't dump the original meta_coord for back up " + . "(skipping this species)\n"; + next; + } else { + print "Original meta_coord table backed up in $file\n"; + } + + foreach my $table_name (@table_names) { + print("Updating $table_name table entries... "); + + $dba->dbc()->helper()->execute_update( + -SQL => +"DELETE mc.* FROM meta_coord mc, coord_system cs WHERE cs.coord_system_id=mc.coord_system_id AND table_name = ? AND cs.species_id=?", + -PARAMS => [ $table_name, $dba->species_id() ] ); + + $dba->dbc()->helper()->execute_update( + -SQL => "INSERT INTO meta_coord " + . "SELECT '$table_name', s.coord_system_id, " + . "MAX( t.seq_region_end - t.seq_region_start + 1 ) " + . "FROM $table_name t, seq_region s, coord_system c " + . "WHERE t.seq_region_id = s.seq_region_id AND c.coord_system_id=s.coord_system_id AND c.species_id=?" + . "GROUP BY s.coord_system_id", + -PARAMS => [ $dba->species_id() ] ); + + print("done\n"); + } + + print( "==> Done with " + . $dba->dbc->dbname() . "/" + . $dba->species_id() + . "\n" ); +} ## end for my $db_args ( @{ $cli_helper...}) print("==> All done.\n"); diff --git a/misc-scripts/meta_levels.pl b/misc-scripts/meta_levels.pl index 91f75a34e12729463904fdf7298cc68780de5019..b76e559a87cd1663cab6948ed7edcb802dde6f0e 100644 --- a/misc-scripts/meta_levels.pl +++ b/misc-scripts/meta_levels.pl @@ -10,87 +10,71 @@ use DBI; use Getopt::Long; use Bio::EnsEMBL::DBSQL::DBAdaptor; - -my ( $host, $user, $pass, $port, $dbpattern, $print, $sing_db_name ); - -$port = 3306; - -GetOptions( "dbhost|host=s", \$host, - "dbuser|user=s", \$user, - "dbpass|pass=s", \$pass, - "dbport|port=i", \$port, - "dbpattern|pattern=s", \$dbpattern, - "dbname=s", \$sing_db_name, - "print", \$print, - "help", \&usage ); - -if ( !$host || ( !$dbpattern && !$sing_db_name ) ) { - usage(); +use Bio::EnsEMBL::Utils::CliHelper; + +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); + +our @feature_types = + qw[gene transcript exon repeat_feature dna_align_feature protein_align_feature simple_feature prediction_transcript prediction_exon]; + +# get the basic options for connecting to a database server +my $optsd = $cli_helper->get_dba_opts(); +# add the print option +push(@{$optsd},"print"); +# process the command line with the supplied options plus a help subroutine +my $opts = $cli_helper->process_args($optsd,\&usage); + +# use the command line options to get an array of database details +for my $db_args (@{$cli_helper->get_dba_args_for_opts($opts)}) { + # use the args to create a DBA + my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(%{$db_args}); + if($dba) { + process_dba($dba,$opts->{print}); + } } -$dbpattern = $sing_db_name unless $dbpattern; - -my @feature_types = qw[gene transcript exon repeat_feature dna_align_feature protein_align_feature simple_feature prediction_transcript prediction_exon]; - -# loop over databases - -my $dsn = "DBI:mysql:host=$host"; -$dsn .= ";port=$port" if ($port); +sub process_dba { -my $db = DBI->connect( $dsn, $user, $pass ); + my ($dba,$print) = @_; + my $ma = $dba->get_MetaContainer(); -my @dbnames = - map { $_->[0] } @{ $db->selectall_arrayref("show databases") }; + my @inserted; + my @not_inserted; -for my $dbname (@dbnames) { + foreach my $type (@feature_types) { - next if ( $dbname !~ /$dbpattern/ ); + delete_existing( $ma, $type ) if ( !$print ); - my $dba = - new Bio::EnsEMBL::DBSQL::DBAdaptor( '-host' => $host, - '-port' => $port, - '-user' => $user, - '-pass' => $pass, - '-dbname' => $dbname, - '-species' => $dbname ); + if ( can_use_key( $dba, $type ) ) { - my $ma = $dba->get_MetaContainer(); + insert_key( $ma, $type ) if ( !$print ); + push @inserted, $type; - my @inserted; - my @not_inserted; + } else { - foreach my $type (@feature_types) { + push @not_inserted, $type; - delete_existing( $ma, $type ) if ( !$print ); + } - if ( can_use_key( $dba, $type ) ) { + } - insert_key( $ma, $type ) if ( !$print ); - push @inserted, $type; - - } else { - - push @not_inserted, $type; - - } - - } - - print "$dbname inserted keys for " . join( ", ", @inserted ) . ".\n" - if (@inserted); - print "$dbname did not insert keys for " - . join( ", ", @not_inserted ) . ".\n" - if (@not_inserted); - -} ## end for my $dbname (@dbnames) + print $dba->dbc()->dbname() + . " (species_id " + . $dba->species_id() + . ") inserted keys for " + . join( ", ", @inserted ) . ".\n" + if (@inserted); + print "Did not insert keys for " . join( ", ", @not_inserted ) . ".\n" + if (@not_inserted); +} ## end sub process_dba #------------------------------------------------------------------------------ sub delete_existing { - my ( $ma, $type ) = @_; + my ( $ma, $type ) = @_; - $ma->delete_key( $type . "build.level" ); + $ma->delete_key( $type . "build.level" ); } @@ -98,41 +82,41 @@ sub delete_existing { sub can_use_key { - my ( $dba, $type ) = @_; + my ( $dba, $type ) = @_; -# compare total count of typewith number of toplevel type, if they're the same, -# then we can use the key + # compare total count of typewith number of toplevel type, if they're the same, + # then we can use the key - my $sth = $dba->dbc()->prepare("SELECT COUNT(*) FROM $type"); - $sth->execute(); - my $total = ( $sth->fetchrow_array() )[0]; + my $sth = $dba->dbc()->prepare("SELECT COUNT(*) FROM $type"); + $sth->execute(); + my $total = ( $sth->fetchrow_array() )[0]; - $sth = - $dba->dbc() - ->prepare( "SELECT COUNT(*) " - . "FROM $type t, seq_region_attrib sra, attrib_type at " - . "WHERE t.seq_region_id=sra.seq_region_id " - . "AND sra.attrib_type_id=at.attrib_type_id " - . "AND at.code='toplevel'" ); - $sth->execute(); - my $toplevel = ( $sth->fetchrow_array() )[0]; + $sth = + $dba->dbc() + ->prepare( "SELECT COUNT(*) " + . "FROM $type t, seq_region_attrib sra, attrib_type at " + . "WHERE t.seq_region_id=sra.seq_region_id " + . "AND sra.attrib_type_id=at.attrib_type_id " + . "AND at.code='toplevel'" ); + $sth->execute(); + my $toplevel = ( $sth->fetchrow_array() )[0]; - if ( $toplevel > 0 ) { - return $total == $toplevel; - } -} + if ( $toplevel > 0 ) { + return $total == $toplevel; + } +} ## end sub can_use_key #------------------------------------------------------------------------------ sub insert_key { - my ( $ma, $type ) = @_; - $ma->store_key_value( $type . "build.level", "toplevel" ); + my ( $ma, $type ) = @_; + $ma->store_key_value( $type . "build.level", "toplevel" ); } #------------------------------------------------------------------------------ sub usage { - print <<EOF; exit(0); + print <<EOF; exit(0); Populate meta table with (e.g.) genebuild.level = toplevel if all genes are top level. Using v41 API code this can speed fetching and dumping diff --git a/misc-scripts/repeats/repeat-types.pl b/misc-scripts/repeats/repeat-types.pl index c2e811d40653adbe4822213292831a0834ad3e97..db39eabccd9a20983c87206ec7f305b4bcb5cf1e 100644 --- a/misc-scripts/repeats/repeat-types.pl +++ b/misc-scripts/repeats/repeat-types.pl @@ -1,92 +1,75 @@ -# +#!/usr/bin/env perl # Repeat classification script -# -# This script is used to do the repeat classification for web display -# on newer v32 databases. +# +# This script is used to do the repeat classification for web display +# on newer v32 databases. # use strict; - -use DBI; -use Getopt::Long; - -my ( $host, $user, $pass, $port, $expression, $dbpattern, $help ); - -GetOptions( "dbhost|host=s", \$host, - "dbuser|user=s", \$user, - "dbpass|pass=s", \$pass, - "dbport|port=i", \$port, - "dbname|dbpattern=s", \$dbpattern, - "help", \$help - ); - -if($help) { - usage(); -} - -if( !$host ) { - print STDERR "-host argument is required\n"; - usage(); -} - -if( !$dbpattern ) { - print STDERR "-dbpattern argument is required\n"; - usage(); -} - -my $dsn = "DBI:mysql:host=$host"; -if( $port ) { - $dsn .= ";port=$port"; -} - -my $dbh = DBI->connect( $dsn, $user, $pass ); - -my @dbnames = map {$_->[0] } @{ $dbh->selectall_arrayref( "show databases" ) }; - -my @dbs = grep {$_ =~ /$dbpattern/} @dbnames; -die("Haven't found any real dbs from $dbpattern") if(!@dbs); -foreach my $db (@dbs) { - warn "using $db"; - $dbh->do("use $db"); - - print STDERR " Setting repeat types\n"; - - my %mappings = ( - 'Low_Comp%' => 'Low complexity regions', - 'LINE%' => 'Type I Transposons/LINE', - 'SINE%' => 'Type I Transposons/SINE', - 'DNA%' => 'Type II Transposons', - 'LTR%' => 'LTRs', - 'Other%' => 'Other repeats', - 'Satelli%' => 'Satellite repeats', - 'Simple%' => 'Simple repeats', - 'Other%' => 'Other repeats', - 'Tandem%' => 'Tandem repeats', - 'TRF%' => 'Tandem repeats', - 'Waterman' => 'Waterman', - 'Recon' => 'Recon', - 'Tet_repeat' => 'Tetraodon repeats', - 'MaskRegion' => 'Mask region', - 'dust%' => 'Dust', - 'Unknown%' => 'Unknown', - '%RNA' => 'RNA repeats', - ); - foreach (keys %mappings) { - $dbh->do(qq(update repeat_consensus set repeat_type = '$mappings{$_}' where repeat_class like '$_')); - } - - # type all remaining repeats as unknown - $dbh->do(qq(update repeat_consensus set repeat_type = 'Unknown' where repeat_type = '')); - $dbh->do(qq(update repeat_consensus set repeat_type = 'Unknown' where repeat_type = null)); -} +use warnings; + +use Bio::EnsEMBL::Utils::CliHelper; + +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); + +# get the basic options for connecting to a database server +my $optsd = $cli_helper->get_dba_opts(); +# add the print option +push( @{$optsd}, "print|p" ); +# process the command line with the supplied options plus a help subroutine +my $opts = $cli_helper->process_args( $optsd, \&usage ); + +# use the command line options to get an array of database details +for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) { + + # use the args to create a DBA + my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args} ); + my $helper = $dba->dbc()->sql_helper(); + print STDOUT "Processing species " + . $dba->species_id() + . " from database " + . $dba->dbc()->dbname() + . " on server " + . $dba->dbc()->host() . "\n"; + print STDERR " Setting repeat types\n"; + + my %mappings = ( 'Low_Comp%' => 'Low complexity regions', + 'LINE%' => 'Type I Transposons/LINE', + 'SINE%' => 'Type I Transposons/SINE', + 'DNA%' => 'Type II Transposons', + 'LTR%' => 'LTRs', + 'Other%' => 'Other repeats', + 'Satelli%' => 'Satellite repeats', + 'Simple%' => 'Simple repeats', + 'Other%' => 'Other repeats', + 'Tandem%' => 'Tandem repeats', + 'TRF%' => 'Tandem repeats', + 'Waterman' => 'Waterman', + 'Recon' => 'Recon', + 'Tet_repeat' => 'Tetraodon repeats', + 'MaskRegion' => 'Mask region', + 'dust%' => 'Dust', + 'Unknown%' => 'Unknown', + '%RNA' => 'RNA repeats', ); + foreach ( keys %mappings ) { + $helper->execute_update( -SQL => +qq(update repeat_consensus set repeat_type = '$mappings{$_}' where repeat_class like '$_') + ); + } + + # type all remaining repeats as unknown + $helper->execute_update( -SQL => +qq(update repeat_consensus set repeat_type = 'Unknown' where repeat_type = '') + ); + $helper->execute_update( -SQL => +qq(update repeat_consensus set repeat_type = 'Unknown' where repeat_type is null) + ); +} ## end for my $db_args ( @{ $cli_helper...}) print STDERR "All done.\n"; -$dbh->disconnect; - - sub usage { - print STDERR <<EOF + print STDERR <<EOF This program classifies the repeats stored in a core database into some somewhat sensible categories. It does this through a combination of a @@ -100,6 +83,6 @@ example: perl repeat-types.pl -user ensadmin -pass secret -host ecs1g \\ -port 3306 -dbpattern '^homo_sapiens_(core|vega)_20_34c$' EOF -; - exit; + ; + exit; } diff --git a/misc-scripts/translation_attribs.pl b/misc-scripts/translation_attribs.pl index 2b97d6b93cee47ff0f8ac1386701670ad45fa683..aee078fdc694e27c0b7238c75f3d63d4e3765be7 100644 --- a/misc-scripts/translation_attribs.pl +++ b/misc-scripts/translation_attribs.pl @@ -1,4 +1,4 @@ -#!/usr/local/ensembl/bin/perl +#!/software/bin/perl =head1 NAME @@ -70,20 +70,33 @@ Daniel Rios <dani@ebi.ac.uk>, Ensembl core API team use strict; use warnings; - use Getopt::Long; use Pod::Usage; - use Bio::EnsEMBL::Translation; use Bio::EnsEMBL::Registry; use Bio::EnsEMBL::Attribute; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Data::Dumper; use DBI; - use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::CliHelper; + +my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new(); -##global variable containing all possible pepstats and the codes used +# get the basic options for connecting to a database server +my $optsd = $cli_helper->get_dba_opts(); +# add the print option +push( @{$optsd}, "binpath:s" ); +push( @{$optsd}, "tmpdir:s" ); +push( @{$optsd}, "verbose|v" ); +# process the command line with the supplied options plus a help subroutine +my $opts = $cli_helper->process_args( $optsd, \&pod2usage ); + +$opts->{binpath} ||= '/software/pubseq/bin/emboss'; +$opts->{tmpdir} ||= '/tmp'; +$opts->{port} ||= '3306'; +$opts->{host} ||= 'ens-staging'; +$opts->{user} ||= 'ensro'; my %PEPSTATS_CODES = ( 'Number of residues' => 'NumResidues', 'Molecular weight' => 'MolecularWeight', @@ -97,221 +110,140 @@ my %MET_AND_STOP = ( 'Starts with methionine' => 'starts_met', ); -## Command line options - -my $binpath = '/software/pubseq/bin/emboss'; -my $tmpdir = '/tmp'; -my $host = 'ens-staging'; -my $dbname = undef; -my $user = undef; -my $pass = undef; -my $port = 3306; -my $help = undef; -my $pattern = undef; - -GetOptions('binpath=s' => \$binpath, - 'tmpdir=s' => \$tmpdir, - 'host=s' => \$host, - 'dbname=s' => \$dbname, - 'user=s' => \$user, - 'pass=s' => \$pass, - 'port=s' => \$port, - 'help' => \$help, - 'pattern=s' => \$pattern - ); - -pod2usage(1) if($help); -throw("--user argument required") if (!defined($user)); -throw("--pass argument required") if (!defined($pass)); - -my $dbas; -#load registry with all databses when no database defined -if (!defined ($dbname) && !defined ($pattern)){ - Bio::EnsEMBL::Registry->load_registry_from_db(-host => $host, - -user => $user, - -pass => $pass, - -port => $port - ); - $dbas = Bio::EnsEMBL::Registry->get_all_DBAdaptors(-group=>'core'); #get all core adaptors for all species -} -elsif(defined ($pattern)){ - #will only load core databases matching the pattern - my $database = 'information_schema'; - my $dbh = DBI->connect("DBI:mysql:database=$database;host=$host;port=$port",$user,$pass); - #fetch all databases matching the pattern - my $sth = $dbh->prepare("SHOW DATABASES WHERE `database` REGEXP \'$pattern\'"); - $sth->execute(); - my $dbs = $sth->fetchall_arrayref(); - foreach my $db_name (@{$dbs}){ - #this is a core database - #TODO Fix this; we DO NOT want to do this and it will break for trinomials - my ($species) = ( $db_name->[0] =~ /(^[a-z]+_[a-z]+)_(core|vega|otherfeatures)_\d+/ ); - next unless $species; - my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $host, - -user => $user, - -pass => $pass, - -port => $port, - -group => 'core', - -species => $species, - -dbname => $db_name->[0] - ); - if ($db_name->[0] =~ /(vega|otherfeatures)/){ - my $other_dbname = $db_name->[0]; - $other_dbname =~ s/$1/core/; - #for vega databases, add the core as the dna database - my $core_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $host, - -user => $user, - -pass => $pass, - -port => $port, - -species => $species, - -dbname => $other_dbname - ); - $dba->dnadb($core_db); - } - push @{$dbas},$dba; - } -} -elsif(defined ($dbname)){ -#only get a single DBAdaptor, the one for the database specified - my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $host, - -user => $user, - -pass => $pass, - -port => $port, - -dbname => $dbname - ); - if ($dbname =~ /(vega|otherfeatures)/){ - my $other_dbname = $dbname; - $other_dbname =~ s/$1/core/; - #for vega databases, add the core as the dna database - my $core_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $host, - -user => $user, - -pass => $pass, - -port => $port, - -dbname => $other_dbname - ); - $dba->dnadb($core_db); - } - push @{$dbas},$dba; -} -else{ - thrown("Not entered properly database connection param. Read docs\n"); -} - -my %attributes_to_delete; #hash containing attributes to be removed from the database -#from release 54, only PEPSTATS_CODES will be calculated, but we will leave the MET_AND_STOP -#removal in case the database run is very old -%attributes_to_delete = (%PEPSTATS_CODES,%MET_AND_STOP); +my %attributes_to_delete = (%PEPSTATS_CODES); my $translation_attribs = {}; my $translation; my $dbID; -#foreach of the species, calculate the pepstats -foreach my $dba (@{$dbas}){ - next if (defined $dbname and $dba->dbc->dbname ne $dbname); - print "Removing attributes from database ", $dba->dbc->dbname,"\n"; - remove_old_attributes($dba,\%attributes_to_delete); - - my $translationAdaptor = $dba->get_TranslationAdaptor(); - my $transcriptAdaptor = $dba->get_TranscriptAdaptor(); - my $attributeAdaptor = $dba->get_AttributeAdaptor(); - print "Going to update translation_attribs for ", $dba->dbc->dbname,"\n"; - #for all the translations in the database, run pepstats and update the translation_attrib table - my $sth = $dba->dbc->prepare("SELECT translation_id from translation"); - $sth->execute(); - $sth->bind_columns(\$dbID); - while($sth->fetch()){ - #foreach translation, retrieve object - $translation = $translationAdaptor->fetch_by_dbID($dbID); - #calculate pepstats - get_pepstats($translation,$binpath,$tmpdir,$translation_attribs); - #and store results in database - store_translation_attribs($attributeAdaptor,$translation_attribs,$translation,\%PEPSTATS_CODES); - $translation_attribs = {}; - } -} + +# use the command line options to get an array of database details +for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) { + + # use the args to create a DBA + my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args} ); + my $dbname = $dba->dbc()->dbname(); + print STDOUT "Processing species " + . $dba->species_id() + . " from database " + . $dba->dbc()->dbname() + . " on server " + . $dba->dbc()->host() . "\n"; + + if ( $dbname =~ /(vega|otherfeatures)/ ) { + my $other_dbname = $dbname; + $other_dbname =~ s/$1/core/; + + $opts->{dbname} = $other_dbname; + #for vega databases, add the core as the dna database + my $core_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args} ); + $dba->dnadb($core_db); + } + + print "Removing attributes from database ", $dba->dbc->dbname, "\n"; + remove_old_attributes( $dba, \%attributes_to_delete); + my $translationAdaptor = $dba->get_TranslationAdaptor(); + my $attributeAdaptor = $dba->get_AttributeAdaptor(); + print + "Going to update translation_attribs in", + $dba->dbc->dbname, "\n"; + +#for all the translations in the database, run pepstats and update the translation_attrib table + my @translation_ids = @{ + $dba->dbc()->sql_helper()->execute_simple( +-SQL=>"SELECT tl.translation_id FROM translation tl, transcript tr, seq_region s, coord_system c WHERE tl.transcript_id = tr.transcript_id AND tr.seq_region_id = s.seq_region_id AND s.coord_system_id = c.coord_system_id AND c.species_id = ? order by tl.translation_id", + -PARAMS=>[$dba->species_id()] ) }; + my $translations = {}; + my $tmpfile = $opts->{tmpdir} . "/$$.pep"; + open( TMP, "> $tmpfile" ) || warn "PEPSTAT: $!"; + print "Retrieving translations\n"; + for my $dbID (@translation_ids) { + + #foreach translation, retrieve object + my $translation = $translationAdaptor->fetch_by_dbID($dbID); + if ( $opts->{verbose} ) { + print "Dumping translation dbID, $dbID...\n"; + } + $translations->{$dbID} = $translation; + my $peptide_seq = $translation->seq(); + if ( !( $peptide_seq =~ m/[BZX]/ig ) ) { + if ( $peptide_seq !~ /\n$/ ) { $peptide_seq .= "\n" } + $peptide_seq =~ s/\*$//; + print TMP ">$dbID\n$peptide_seq"; + } else { + print "Skipping translation dbID $dbID due to ambiguity codes...\n"; + } + } + close(TMP); + print "Running pepstat\n"; + + my $PEPSTATS = $opts->{binpath} . '/bin/pepstats'; + throw("pepstats binary at $PEPSTATS cannot be executed") + if ( !-x $PEPSTATS ); + open( OUT, "$PEPSTATS -filter < $tmpfile 2>&1 |" ) + || warn "PEPSTAT: $!"; + my @lines = <OUT>; + close(OUT); + unlink($tmpfile); + my $attribs = {}; + my $tId; + print "Parsing pepstat\n"; + + foreach my $line (@lines) { + if ( $line =~ /PEPSTATS of ([^ ]+)/ ) { + $tId = $1; + } elsif ( defined $tId ) { + if ( $line =~ /^Molecular weight = (\S+)(\s+)Residues = (\d+).*/ ) { + $attribs->{$tId}{'Number of residues'} = $3; + $attribs->{$tId}{'Molecular weight'} = $1; + } elsif ( $line =~ +/^Average(\s+)(\S+)(\s+)(\S+)(\s+)=(\s+)(\S+)(\s+)(\S+)(\s+)=(\s+)(\S+)/ ) + { + $attribs->{$tId}{'Ave. residue weight'} = $7; + $attribs->{$tId}{'Charge'} = $12; + } elsif ( $line =~ /^Isoelectric(\s+)(\S+)(\s+)=(\s+)(\S+)/ ) { + $attribs->{$tId}{'Isoelectric point'} = $5; + } elsif ( $line =~ /FATAL/ ) { + print STDERR "pepstats: $line\n"; + } + } + } + for my $id ( keys %$attribs ) { + my $translation = $translations->{$id}; + my $aas = $attribs->{$id}; + if ( $opts->{verbose} ) { + print "Storing attribs for translation dbID, $id...\n"; + } + store_translation_attrib( $attributeAdaptor, $translation, $aas ); + } +} ## end for my $db_args ( @{ $cli_helper...}) #will remove any entries in the translation_attrib table for the attributes, if any #this method will try to remove the old starts_met and has_stop_codon attributes, if present #this is to allow to be run on old databases, but removing the not used attributes -sub remove_old_attributes{ - my $dba = shift; - my $attributes = shift; - - my $sth = $dba->dbc()->prepare("DELETE ta FROM translation_attrib ta, attrib_type at WHERE at.attrib_type_id = ta.attrib_type_id AND at.code = ?"); - #remove all possible entries in the translation_attrib table for the attributes - foreach my $value (values %{$attributes}){ - $sth->execute($value); - } - $sth->finish; +sub remove_old_attributes { + my $dba = shift; + my $attributes = shift; + print "removing all translation attributes for db, " + . $dba->{_dbc}->{_dbname} . "\n"; + foreach my $value ( values %{$attributes} ) { + my $sql = "delete ta FROM translation_attrib ta, attrib_type at, translation tl, transcript tr, seq_region s, coord_system c WHERE at.attrib_type_id = ta.attrib_type_id AND ta.translation_id=tl.translation_id and tl.transcript_id = tr.transcript_id AND tr.seq_region_id = s.seq_region_id AND s.coord_system_id = c.coord_system_id AND c.species_id = ? and at.code=?"; + $dba->dbc()->sql_helper()->execute_update(-SQL=>$sql,-PARAMS=>[$dba->species_id(),$value]); + } + return; + +} ## end sub remove_old_attributes + +sub store_translation_attrib { + my ( $attributeAdaptor, $translation, $attribs ) = @_; + my @attribs; + for my $key ( keys %$attribs ) { + my $value = $attribs->{$key}; + push @attribs, + Bio::EnsEMBL::Attribute->new( '-code' => $PEPSTATS_CODES{$key}, + '-name' => $key, + '-value' => $value ); + } + $attributeAdaptor->store_on_Translation( $translation, \@attribs ); + return; } -#method that retrieves the pepstatistics for a translation - -sub get_pepstats { - my $translation = shift; - my $binpath = shift; - my $tmpdir = shift; - my $translation_attribs = shift; - - my $peptide_seq ; - eval { $peptide_seq = $translation->seq}; - - if ($@) { - warn("PEPSTAT: eval() failed: $!"); - return {}; - } elsif ( $peptide_seq =~ m/[BZX]/ig ) { - return {}; - } - - return {} if ($@ || $peptide_seq =~ m/[BZX]/ig); - if( $peptide_seq !~ /\n$/ ){ $peptide_seq .= "\n" } - $peptide_seq =~ s/\*$//; - - my $tmpfile = $tmpdir."/$$.pep"; - open( TMP, "> $tmpfile" ) || warn "PEPSTAT: $!"; - print TMP "$peptide_seq"; - close(TMP); - my $PEPSTATS = $binpath.'/bin/pepstats'; - open (OUT, "$PEPSTATS -filter < $tmpfile 2>&1 |") || warn "PEPSTAT: $!"; - my @lines = <OUT>; - close(OUT); - unlink($tmpfile); - foreach my $line (@lines){ - if($line =~ /^Molecular weight = (\S+)(\s+)Residues = (\d+).*/){ - $translation_attribs->{'Number of residues'} = $3 ; - $translation_attribs->{'Molecular weight'} = $1; - } - if($line =~ /^Average(\s+)(\S+)(\s+)(\S+)(\s+)=(\s+)(\S+)(\s+)(\S+)(\s+)=(\s+)(\S+)/){ - $translation_attribs->{'Ave. residue weight'} = $7; - $translation_attribs->{'Charge'} = $12; - } - if($line =~ /^Isoelectric(\s+)(\S+)(\s+)=(\s+)(\S+)/){ - $translation_attribs->{'Isoelectric point'} = $5; - } - if ($line =~ /FATAL/){ - print STDERR "pepstats: $line\n"; - $translation_attribs = {}; - } - } -} - -sub store_translation_attribs{ - my $attributeAdaptor = shift; - my $translation_attribs = shift; - my $translation = shift; - my $attributes = shift; - - my $attribute; - my @attributes; - #each of the keys in the pepstats is an attribute for the translation - foreach my $key (keys %{$translation_attribs}){ - - $attribute = Bio::EnsEMBL::Attribute->new('-code' => $attributes->{$key}, - '-name' => $key, - '-value' => $translation_attribs->{$key} - ); - push @attributes, $attribute; - - } - $attributeAdaptor->store_on_Translation($translation,\@attributes); -}