diff --git a/misc-scripts/db/cleanup_tmp_tables.pl b/misc-scripts/db/cleanup_tmp_tables.pl new file mode 100755 index 0000000000000000000000000000000000000000..362e5d0576c25bdb613a130440c3e085aa129968 --- /dev/null +++ b/misc-scripts/db/cleanup_tmp_tables.pl @@ -0,0 +1,128 @@ +#!/usr/local/bin/perl + +=head1 NAME + +cleanup_tmp_tables.pl - delete temporary and backup tables from a database + +=head1 SYNOPSIS + +cleanup_tmp_tables.pl [arguments] + +Required arguments: + + --dbname, db_name=NAME database name NAME + --host, --dbhost, --db_host=HOST database host HOST + --port, --dbport, --db_port=PORT database port PORT + --user, --dbuser, --db_user=USER database username USER + --pass, --dbpass, --db_pass=PASS database passwort PASS + +Optional arguments: + + --conffile, --conf=FILE read parameters from FILE + (default: conf/Conversion.ini) + + --logfile, --log=FILE log to FILE (default: *STDOUT) + --logpath=PATH write logfile to PATH (default: .) + --logappend, --log_append append to logfile (default: truncate) + + -v, --verbose=0|1 verbose logging (default: false) + -i, --interactive=0|1 run script interactively (default: true) + -n, --dry_run, --dry=0|1 don't write results to database + -h, --help, -? print help (this message) + +=head1 DESCRIPTION + + +=head1 LICENCE + +This code is distributed under an Apache style licence: +Please see http://www.ensembl.org/code_licence.html for details + +=head1 AUTHOR + +Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<ensembl-dev@ebi.ac.uk> + +=cut + +use strict; +use warnings; +no warnings 'uninitialized'; + +use FindBin qw($Bin); +use vars qw($SERVERROOT); + +BEGIN { + $SERVERROOT = "$Bin/../.."; + unshift(@INC, "$SERVERROOT/ensembl/modules"); + unshift(@INC, "$SERVERROOT/bioperl-live"); +} + +use Getopt::Long; +use Pod::Usage; +use Bio::EnsEMBL::Utils::ConversionSupport; + +$| = 1; + +my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); + +# parse options +$support->parse_common_options(@_); +$support->allowed_params( + $support->get_common_params, +); + +if ($support->param('help') or $support->error) { + warn $support->error if $support->error; + pod2usage(1); +} + +# ask user to confirm parameters to proceed +$support->confirm_params; + +# get log filehandle and print heading and parameters to logfile +$support->init_log; + +$support->check_required_params; + +# connect to database and get adaptors +my $dba = $support->get_database('ensembl'); +my $dbh = $dba->dbc->db_handle; + +# find all temporary and backup tables +my @tables; + +foreach my $pattern (qw(tmp temp bak backup)) { + my $sql = qq(SHOW TABLES LIKE '\%$pattern\%'); + my $sth = $dbh->prepare($sql); + $sth->execute; + while (my ($table) = $sth->fetchrow_array) { + push @tables, $table; + } +} + +if ($support->param('dry_run')) { + # for a dry run, only show which databases would be deleted + $support->log("Temporary and backup tables found:\n"); + foreach my $table (sort @tables) { + $support->log("$table\n", 1); + } + +} else { + # delete tables + foreach my $table (sort @tables) { + if ($support->user_proceed("Drop table $table?")) { + $support->log("Dropping table $table...\n"); + $dbh->do("DROP TABLE $table"); + $support->log("Done.\n"); + } + } +} + +# finish logfile +$support->finish_log; + diff --git a/misc-scripts/utilities/dna_compress.pl b/misc-scripts/utilities/dna_compress.pl new file mode 100755 index 0000000000000000000000000000000000000000..3dcd5104f012e574e7944698a32b7229822c8456 --- /dev/null +++ b/misc-scripts/utilities/dna_compress.pl @@ -0,0 +1,294 @@ +#!/usr/local/bin/perl -- # -*-Perl-*- +# +# Copyright (c) 2003 Tim Hubbard (th@sanger.ac.uk) +# Sanger Centre, Wellcome Trust Genome Campus, Cambs, UK +# All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation +# +# $Header$ + +# This is a driver script to populate and test the experimental dnac +# (compressed dna) table of ensembl. Use -T for pod based tutorial + +use strict; +use Getopt::Std; +use vars qw($opt_H $opt_T + $opt_u $opt_d $opt_P $opt_h $opt_p $opt_C $opt_U); + +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +getopts("u:HTd:p:P:h:CU"); + +$|=1; + +# specify defaults +my $def_u='ensadmin'; +my $def_P='3306'; +my $def_h='127.0.0.1'; + +if($opt_H){ + &help; +}elsif($opt_T){ + &help2; +} + +sub help { + print <<ENDHELP; + +usage: +dna_compress.pl [options] + + -H for help + -T tutorial + -d database + -u user ensembl db user [$def_u] + -d db ensembl db + -P port port of mysql [$def_P] + -h host host of mysql [$def_h] + -p pass passwd for mysqluser + -C compress + -U uncompress +ENDHELP + + exit 0; +} + +sub help2 { + exec('perldoc', $0); +} + +# defaults or options +$opt_u ||= $def_u; +$opt_h ||= $def_h; +$opt_P ||= $def_P unless $opt_P; + + +if(!$opt_d) { + print STDERR "ensembl db (-d) argument is required\n"; + help(); +} + +if(!$opt_C && !$opt_U) { + print STDERR "either compress (-C) or uncompress (-U) is required\n"; + help(); +} + + +# db connection +my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $opt_h, + -user => $opt_u, + -port => $opt_P, + -dbname => $opt_d, + -pass => $opt_p); + + + +my $meta_container = $db->get_MetaContainer(); +my $comp_seq_adaptor = $db->get_CompressedSequenceAdaptor(); + +my ($compressed) = + @{$meta_container->list_value_by_key('sequence.compression')}; + +if($opt_C) { + #compress the dna in the database + if($compressed) { + throw("The database meta table indicates this database already contains" + ." sequence compression.\nIf this is not the case do the following:\n" + ." mysql> delete from meta where meta_key = 'sequence.compression'\n"); + } + + #one more sanity check - make sure that dna table is populated and dnac table + #is not + if(get_dna_count($db) == 0) { + print STDERR "There is no uncompressed dna in this database to compress."; + exit(0); + } + if(get_dnac_count($db) != 0) { + throw("Unexpected: the dnac table already contains some rows.\n"); + } + + print STDERR "Compressing sequence and writing to dnac table.\n"; + + #get every slice that contains sequence + my $slice_adaptor = $db->get_SliceAdaptor(); + foreach my $slice (@{$slice_adaptor->fetch_all('seqlevel')}) { + my $seq_region_id = $slice_adaptor->get_seq_region_id($slice); + $comp_seq_adaptor->store($seq_region_id, $slice->seq); + print STDERR "."; + } + + print STDERR "\nSetting sequence.compression meta table entry.\n"; + $meta_container->delete_key('sequence.compression'); + $meta_container->store_key_value('sequence.compression', 1); + + print STDERR "Deleting uncompressed sequence.\n"; + my $sth = $db->prepare("delete from dna"); + $sth->execute(); + + print STDERR "All done.\n"; + +} else { + #uncompress the dna in the database + if(!$compressed) { + throw("The database meta table indicates that the sequence in this " + ." database is already uncompressed.\nIf this is not the case do " + ." the following:\n" + ." mysql> delete from meta where meta_key = 'sequence.compression';\n" + ." mysql> insert into meta (meta_key,meta_value) " + . "values('sequence.compression', 1);\n"); + } + + if(get_dnac_count($db) == 0) { + print STDERR "There is no compressed dna in this database to uncompress."; + exit(0); + } + if(get_dna_count($db) != 0) { + throw("Unexpected: the dna table already contains some rows.\n"); + } + + print STDERR "Setting sequence.compression meta table entry.\n"; + $meta_container->delete_key('sequence.compression'); + $meta_container->store_key_value('sequence.compression', 0); + + print STDERR "Uncompressing sequence and writing to dna table.\n"; + + #get every slice that contains sequence + my $slice_adaptor = $db->get_SliceAdaptor(); + my $seq_adaptor = $db->get_SequenceAdaptor(); + + if($seq_adaptor->isa('CompressedSequenceAdaptor')) { + throw("Tried to get SequenceAdaptor but got CompressedSequenceAdaptor."); + } + + foreach my $slice (@{$slice_adaptor->fetch_all('seqlevel')}) { + my $seq_region_id = $slice_adaptor->get_seq_region_id($slice); + my $seq = $comp_seq_adaptor->fetch_by_Slice_start_end_strand($slice); + $seq_adaptor->store($seq_region_id, $$seq); + print STDERR "."; + } + + print STDERR "\nDeleting compressed sequence.\n"; + my $sth = $db->prepare("delete from dnac"); + $sth->execute(); + + print STDERR "All done.\n"; +} + + +sub get_dna_count { + my $db = shift; + + my $sth = $db->prepare("SELECT count(*) from dna"); + $sth->execute(); + my ($count) = $sth->fetchrow_array(); + $sth->finish(); + + return $count; +} + + +sub get_dnac_count { + my $db = shift; + + my $sth = $db->prepare("SELECT count(*) from dnac"); + $sth->execute(); + my ($count) = $sth->fetchrow_array(); + $sth->finish(); + + return $count; +} + + + +__END__ + +=pod + +=head1 NAME - name + +dna_compress.pl + +=head1 DESCRIPTION + +Converts compressed sequence in an ensembl database to uncompressed sequence +or converts uncompressed sequence in an ensembl database to compressed +sequence. + +=head1 SYNOPSIS + + + +=head1 EXAMPLES + +Compress the contents of the dna table in an ensembl database: + + dna_compress.pl -C -u ensadmin -p password -h host -d homo_sapiens_core_18_34 + +Uncompress the contents of the dnac table in an ensembl database + + dna_compress.pl -U -u ensadmin -p password -h host -d homo_sapiens_core_18_34 + + +=head1 FLAGS + +=over 4 + +=item -h + +Displays short help + +=item -T + +Displays this help message + +=item -C + +Compress the dna in the database + +=item -U + +Uncompress the dna in the database + +=item -h + +The database host + +=item -p + +The database password + +=item -P + +The database port + +=item -u + +The database user + +=back + +=head1 VERSION HISTORY + +=over 4 + +=item 14-Jul-2003 + +B<th> initial release + +=item 08-Oct-2003 + +B<mcvicker> rewrote for new schema and newly implemented sequence compression + +=back + +=head1 BUGS + +=head1 AUTHOR + +B<Tim Hubbard> Email th@sanger.ac.uk + +=cut diff --git a/misc-scripts/utilities/parse_embl.pl b/misc-scripts/utilities/parse_embl.pl new file mode 100644 index 0000000000000000000000000000000000000000..e9d0f788d91c899d2a0ff3aecb1b1239296e9882 --- /dev/null +++ b/misc-scripts/utilities/parse_embl.pl @@ -0,0 +1,509 @@ + +=head1 NAME + +parse_embl.pl + +=head1 SYNOPSIS + + parse_embl -dbname <db name> [-host <db host>] [-port <db port>] + [-user <db user>] [-pass <db pass>] + [-genetype <gene type>] + -codontable <int> -emblfile <file name> + +=head1 DESCRIPTION + +This is a script originally written by Keith James to load a pathogen +genome from an EMBL flat file into an Otter (modified ensembl) database. +It has been has been stripped down and modified to make it more general +purpose. + +This script will peform simple loading of an annotated genome in EMBL +flat file format into an Ensembl database. The script loads a fairly +minimal set of gene information from an EMBL file, but could be +extended to do more. + +Certain assumptions are made about the format of the EMBL file such +that it may require pre-processing before loading - these are listed +below. A number of non-standard tags (EMBL qualifiers) are used. I +recommend using BioPerl to write a suitable pre-processor. + +=over + +=item * + +There may be multiple chromosomes/plasmids per EMBL file. + +=item * + +Each chromosome/plasmid must have a source feature defining the +chromosome/plasmid name and type. They must all have different +names. See EMBL feature table definition for 'chromosome' and +'plasmid' tags. + +=item * + +Protein coding genes are annotated as CDS features. + +=item * + +All gene synonyms are defined by multiple 'gene' and/or 'synonym' tags. + +=item * + +The frame of a CDS feature is defined by a 'codon_start' tag if it is +not in frame 0. + +=item * + +If a CDS feature has a 'pseudo' tag it is taken to be a pseudogene. + +=item * + +If a CDS feature has a 'partial' tag but fuzzy ends are not defined in +its location it is assumed the ends have not been found. + +=item * + +Non-standard translation tables are currently not handled. + +=item * + +Annotator remarks about genes are taken from 'note' tags of +features. Remarks about transcripts are taken from 'product', 'class' +and 'EC_number' tags of features. + +=back + +Redirect STDERR to a file to obtain a rough log of the loading +operations. + +Some functions in the script are simply stubs which may be implemented +later. Some user assembly may be required. + +=head1 AUTHOR - Keith James + +Email kdj@sanger.ac.uk + +=head1 CONTRIBUTORS + +Stephen Searle, sjms@sanger.ac.uk + +Graham McVicker + +=head1 METHODS + +=cut + +use strict; +use Getopt::Long; + +use Bio::SeqIO; +use Bio::Location::Simple; + +use Bio::EnsEMBL::DBSQL::DBAdaptor; + +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::Gene; +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::CoordSystem; +use Bio::EnsEMBL::Transcript; +use Bio::EnsEMBL::Translation; +use Bio::EnsEMBL::Analysis; + +use Bio::EnsEMBL::Utils::Exception qw(throw); + +### +### Script globals. +### + +my $MAX_SEQUENCE_LEN = 2e6; +my $CURATED_LOGIC_NAME = 'Genome Reviews'; + +### +### Command line options +### +my $emblfile = undef; +my $host = '127.0.0.1'; +my $user = undef; +my $pass = undef; +my $port = 3306; +my $dbname = undef; +my $gene_type = 'known'; + +GetOptions('emblfile:s' => \$emblfile, + 'dbname:s' => \$dbname, + 'user:s' => \$user, + 'pass:s' => \$pass, + 'port:n' => \$port, + 'host:s' => \$host, + 'genetype:s' => \$gene_type); + +throw("-dbname argument required") if(!defined($dbname)); +throw("-emblfile argument required") if(!defined($emblfile)); + +### +### Get Ensembl DB adaptor +### +my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -dbname => $dbname); + + +### +### Create new analysis object indicating a curated origin +### +my $analysis = Bio::EnsEMBL::Analysis->new(-logic_name => $CURATED_LOGIC_NAME); + +### +### Create sequence object and load it +### +my $seqi = Bio::SeqIO->new(-file => $emblfile, + -format => 'embl'); +### +### Main loading loop +### +while (my $seq = $seqi->next_seq) { + # Split sequence and return a slice + my $slice = load_sequence($dba, $seq); + + foreach my $f ($seq->get_SeqFeatures) { + if ($f->primary_tag =~ /CDS/) { + store_gene($dba, $f, $slice); + } + + if ($f->primary_tag =~ /repeat/) { + # store_repeat($f, $slice); + } + } +} + + +=head2 store_gene + + Title : store_gene + Function : Stores a gene feature (both protein- and RNA-coding) + Returns : void + Argument : Bio::SeqFeatureI, Bio::EnsEMBL::DBSQL::SliceAdaptor + +=cut + +sub store_gene { + my ($db, $f, $slice) = @_; + + my $gene_stable_id = get_gene_stable_id($f); + + my $gene = Bio::EnsEMBL::Gene->new(); + + $gene->analysis($analysis); + $gene->type($gene_type); + $gene->stable_id($gene_stable_id); + $gene->version(1); + $gene->slice($slice); + + print STDERR sprintf("Found CDS with ID %s\n", $gene_stable_id); + + my $tcount = 0; + my $transcript_id = sprintf("%s.%d", $gene_stable_id, $tcount++); + + my $transcript = Bio::EnsEMBL::Transcript->new; + $transcript->stable_id($transcript_id); + $transcript->version(1); + $transcript->slice($slice); + + $gene->add_Transcript($transcript); + + # Add the exons + my @exons = create_exons($f); + my $ecount = 0; + foreach my $exon (@exons) { + $exon->stable_id(sprintf("%s.%d", $gene_stable_id, $ecount++)); + $exon->version(1); + $exon->slice($slice); + $transcript->add_Exon($exon); + } + + if ($f->primary_tag =~ /RNA/) { + foreach my $exon (@exons) { + $exon->phase(-1); + $exon->end_phase(-1); + } + } + + # Handle protein CDS features + if ($f->primary_tag eq 'CDS') { + # Based on /codon_start EMBL qualifier + my $frame = get_initial_frame($f); + + @exons = @{ $transcript->get_all_Exons }; + + # This code assumes no UTRs + my $translation = Bio::EnsEMBL::Translation->new; + my $rcount = 0; + $translation->stable_id(sprintf("%s.%d", $gene_stable_id, $rcount++)); + $translation->version(1); + $translation->start_Exon($exons[0]); + $translation->start(1 + $frame); + $translation->end_Exon($exons[$#exons]); + $translation->end($translation->end_Exon->length); + + set_exon_phases($translation, @exons); + + foreach my $exon (@exons) { + print STDERR sprintf("Added exon start: %d end: %d strand: %d phase: %d\n", + $exon->start, $exon->end, $exon->strand, $exon->phase); + } + + $transcript->translation($translation); + + my $mrna_seq = + Bio::Seq->new(-seq => $transcript->translateable_seq, + -moltype => "dna", + -alphabet => 'dna', + -id => $translation->stable_id); + + # Translate args: stop char, unknown aa char, frame, + # table, full CDS, throw + my $aa_seq = $transcript->translate()->seq(); + + print STDERR sprintf("Translation is: %s\n", $aa_seq); + + if ($aa_seq =~ /\*/) { + print STDERR sprintf("Failed translating %s after phase setting\n", + $translation->stable_id); + } + } + + eval { + $db->get_GeneAdaptor->store($gene); + }; + throw(sprintf("Failed loading %s\n%s\n", $gene_stable_id, $@)) if($@); +} + + + +=head2 get_gene_stable_id + + Arg [1] : Bio::SeqFeatureI $f + Example : my $stable_id = get_gene_stable_id($f); + Description: Tries to get a sensible stable identifier from a bioperl feature + created from an embl flat file. + Returntype : string + Exceptions : throw if cannot determine stable identifier + Caller : + +=cut + +sub get_gene_stable_id { + my $f = shift; + + my $stable_id; + + if($f->has_tag('locus_tag')) { + my @vals = $f->get_tag_values('locus_tag'); + ($stable_id) = split(/\s+/,$vals[0]); + return $stable_id if($stable_id); + } + + if($f->has_tag('gene')) { + my @vals = $f->get_tag_values('gene'); + ($stable_id) = split(/\s+/,$vals[0]); + return $stable_id if($stable_id); + } + + throw("Could not determine gene identifier\n"); +} + + +=head2 create_exons + + Title : create_exons + Function : Returns a list of exons created from the location of + feature. + Returns : List of Bio::EnsEMBL::Exons + Argument : Bio::SeqFeatureI + +=cut + +sub create_exons { + my $f = shift; + + my @exons; + + foreach my $loc ($f->location->each_Location) { + push(@exons, Bio::EnsEMBL::Exon->new(-start => $loc->start, + -end => $loc->end, + -strand => $loc->strand)); + + print STDERR sprintf("Creating exon at %d..%d on strand %d\n", + $loc->start, $loc->end, $loc->strand); + } + + if ($f->has_tag('pseudo')) { + foreach my $exon (@exons) { + $exon->phase(-1); + $exon->end_phase(-1); + } + } + else { + foreach my $exon (@exons) { + $exon->end_phase(0); + } + } + + return @exons; +} + +=head2 get_initial_frame + + Title : get_initial_frame + Function : Returns the frame specified by the codon_start tag of + feature. + Returns : int + Argument : Bio::SeqFeatureI + +=cut + +sub get_initial_frame { + my $f = shift; + + my $frame = 0; + + if ($f->has_tag('codon_start')) { + my @vals = $f->get_tag_values('codon_start'); + $frame = $vals[0] - 1; + } + + return $frame; +} + +=head2 set_exon_phases + + Title : set_exon_phases + Function : Sets the start and end phases of exons. + Returns : void + Argument : Bio::Otter::AnnotatedTranscript, Bio::Ensembl::Exons + +=cut + +sub set_exon_phases { + my $translation = shift; + my @exons = @_; + + my $found_start = 0; + my $found_end = 0; + my $phase = 0; + + foreach my $exon (@exons) { + # Internal and end exons + if ($found_start && ! $found_end) { + $exon->phase($phase); + $exon->end_phase(($exon->length + $exon->phase) % 3); + $phase = $exon->end_phase; + } + + if ($translation->start_Exon == $exon) { + $exon->phase($phase); + $exon->end_phase((($exon->length - $translation->start + 1) + + $exon->phase) % 3); + $phase = $exon->end_phase; + $found_start = 1; + } + + if ($translation->end_Exon == $exon) { + $found_end = 1; + } + } +} + + + +=head2 get_seq_region_data + + Arg [1] : Bio::SeqI + Example : my ($csname, $seq_region_name) = get_seq_region_data($seq); + Description: Gets the coordinate system name (e.g. 'chromosome', 'plasmid') + and the name of the seq_region + Returntype : pair of strings + Exceptions : none + Caller : load_sequence + +=cut + +sub get_seq_region_data { + my $seq = shift; + my $type; + my $name; + + foreach my $f ($seq->get_SeqFeatures) { + if ($f->primary_tag eq 'source') { + if ($f->start == 1 and $f->end == $seq->length) { + if ($f->has_tag('chromosome')){ + $type = 'chromosome'; + my @vals = $f->get_tag_values('chromosome'); + $name = $vals[0]; + $name =~ s/chr(omosome)\s+//i; #strip off chromosome prefix if present + last; + } + + if ($f->has_tag('plasmid')) { + $type = 'plasmid'; + my @vals = $f->get_tag_values('plasmid'); + $name = $vals[0]; + $name =~ s/plasmid\s+//i; #strip off plasmid prefix if present + last; + } + } + } + } + + return ($type, $name); +} + + +=head2 load_sequence + + Arg [1] : Bio::EnsEMBL::DBAdaptor $db + Arg [2] : Bio::SeqI $seq + Example : load_sequence + Description: Loads the sequence for this organism into the database, creating + Coordinate system and seq_region as necessary + Returntype : none + Exceptions : none + Caller : general + +=cut + +sub load_sequence { + my $db = shift; + my $seq = shift; + + my ($csname, $seq_region) = get_seq_region_data($seq); + + my $csa = $db->get_CoordSystemAdaptor(); + my $cs = $csa->fetch_by_name($csname); + + if(!$cs) { + $cs = Bio::EnsEMBL::CoordSystem->new(-NAME => $csname, + -SEQUENCE_LEVEL => 1, + -RANK => 1, + -DEFAULT => 1); + $csa->store($cs); + } + + my $slice = Bio::EnsEMBL::Slice->new(-SEQ_REGION_NAME => $seq_region, + -COORD_SYSTEM => $cs, + -START => 1, + -END => $seq->length(), + -SEQ_REGION_LENGTH => $seq->length()); + + + my $slice_adaptor = $db->get_SliceAdaptor(); + + my $sequence = $seq->seq(); + $slice_adaptor->store($slice, \$sequence); + + return $slice; +} +