moved to ensembl/misc-scripts/utilities/

fdb3dc37 · Patrick Meidl · 7515622c · 7515622c · 7515622c
Commit fdb3dc37 authored 18 years ago by Patrick Meidl
--- a/misc-scripts/dna_compress.pl
+++ b/misc-scripts/dna_compress.pl
-#!/usr/local/bin/perl --    # -*-Perl-*-
-#
-# Copyright (c) 2003 Tim Hubbard (th@sanger.ac.uk)
-# Sanger Centre, Wellcome Trust Genome Campus, Cambs, UK
-# All rights reserved.
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation
-#
-# $Header$
-
-# This is a driver script to populate and test the experimental dnac
-# (compressed dna) table of ensembl.  Use -T for pod based tutorial
-
-use strict;
-use Getopt::Std;
-use vars qw($opt_H $opt_T
-	    $opt_u $opt_d $opt_P $opt_h $opt_p $opt_C $opt_U);
-
-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-use Bio::EnsEMBL::Utils::Exception qw(throw warning);
-
-getopts("u:HTd:p:P:h:CU");
-
-$|=1;
-
-# specify defaults
-my $def_u='ensadmin';
-my $def_P='3306';
-my $def_h='127.0.0.1';
-
-if($opt_H){
-    &help;
-}elsif($opt_T){
-    &help2;
-}
-
-sub help {
-    print <<ENDHELP;
-
-usage:
-dna_compress.pl [options]
-
-  -H       for help
-  -T       tutorial
-  -d       database
-  -u user  ensembl db user [$def_u]
-  -d db    ensembl db
-  -P port  port of mysql   [$def_P]
-  -h host  host of mysql   [$def_h]
-  -p pass  passwd for mysqluser
-  -C       compress
-  -U       uncompress
-ENDHELP
-
-    exit 0;
-}
-
-sub help2 {
-    exec('perldoc', $0);
-}
-
-# defaults or options
-$opt_u ||= $def_u;
-$opt_h ||= $def_h;
-$opt_P ||= $def_P unless $opt_P;
-
-
-if(!$opt_d) {
-  print STDERR "ensembl db (-d) argument is required\n";
-  help();
-}
-
-if(!$opt_C && !$opt_U) {
-  print STDERR "either compress (-C) or uncompress (-U) is required\n";
-  help();
-}
-
-
-# db connection
-my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host   => $opt_h,
-                                            -user   => $opt_u,
-                                            -port   => $opt_P,
-                                            -dbname => $opt_d,
-                                            -pass   => $opt_p);
-
-
-
-my $meta_container = $db->get_MetaContainer();
-my $comp_seq_adaptor = $db->get_CompressedSequenceAdaptor();
-
-my ($compressed) = 
-  @{$meta_container->list_value_by_key('sequence.compression')};
-
-if($opt_C) {
-  #compress the dna in the database
-  if($compressed) {
-    throw("The database meta table indicates this database already contains"
-     ." sequence compression.\nIf this is not the case do the following:\n"
-     ."   mysql>  delete from meta where meta_key = 'sequence.compression'\n");
-  }
-
-  #one more sanity check - make sure that dna table is populated and dnac table
-  #is not
-  if(get_dna_count($db) == 0) {
-    print STDERR "There is no uncompressed dna in this database to compress.";
-    exit(0);
-  }
-  if(get_dnac_count($db) != 0) {
-    throw("Unexpected: the dnac table already contains some rows.\n");
-  }
-  
-  print STDERR "Compressing sequence and writing to dnac table.\n";
-
-  #get every slice that contains sequence
-  my $slice_adaptor = $db->get_SliceAdaptor();
-  foreach my $slice (@{$slice_adaptor->fetch_all('seqlevel')}) {
-    my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);
-    $comp_seq_adaptor->store($seq_region_id, $slice->seq);
-    print STDERR ".";
-  }
-
-  print STDERR "\nSetting sequence.compression meta table entry.\n";
-  $meta_container->delete_key('sequence.compression');
-  $meta_container->store_key_value('sequence.compression', 1);
-
-  print STDERR "Deleting uncompressed sequence.\n";
-  my $sth = $db->prepare("delete from dna");
-  $sth->execute();
-
-  print STDERR "All done.\n";
-
-} else {
-  #uncompress the dna in the database
-  if(!$compressed) {
-    throw("The database meta table indicates that the sequence in this "
-     ." database is already uncompressed.\nIf this is not the case do "
-     ." the following:\n"
-     ."   mysql>  delete from meta where meta_key = 'sequence.compression';\n"
-     ."   mysql>  insert into meta (meta_key,meta_value) "
-     . "values('sequence.compression', 1);\n");
-  }
-
-  if(get_dnac_count($db) == 0) {
-    print STDERR "There is no compressed dna in this database to uncompress.";
-    exit(0);
-  }
-  if(get_dna_count($db) != 0) {
-    throw("Unexpected: the dna table already contains some rows.\n");
-  }
-
-  print STDERR "Setting sequence.compression meta table entry.\n";
-  $meta_container->delete_key('sequence.compression');
-  $meta_container->store_key_value('sequence.compression', 0);
-
-  print STDERR "Uncompressing sequence and writing to dna table.\n";
-
-  #get every slice that contains sequence
-  my $slice_adaptor = $db->get_SliceAdaptor();
-  my $seq_adaptor   = $db->get_SequenceAdaptor();
-  
-  if($seq_adaptor->isa('CompressedSequenceAdaptor')) {
-    throw("Tried to get SequenceAdaptor but got CompressedSequenceAdaptor.");
-  }
-
-  foreach my $slice (@{$slice_adaptor->fetch_all('seqlevel')}) {
-    my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);
-    my $seq = $comp_seq_adaptor->fetch_by_Slice_start_end_strand($slice);
-    $seq_adaptor->store($seq_region_id, $$seq);
-    print STDERR ".";
-  }
-
-  print STDERR "\nDeleting compressed sequence.\n";
-  my $sth = $db->prepare("delete from dnac");
-  $sth->execute();
-
-  print STDERR "All done.\n";
-}
-
-
-sub get_dna_count {
-  my $db = shift;
-
-  my $sth = $db->prepare("SELECT count(*) from dna");
-  $sth->execute();
-  my ($count) = $sth->fetchrow_array();
-  $sth->finish();
-
-  return $count;
-}
-
-
-sub get_dnac_count {
-  my $db = shift;
-
-  my $sth = $db->prepare("SELECT count(*) from dnac");
-  $sth->execute();
-  my ($count) = $sth->fetchrow_array();
-  $sth->finish();
-  
-  return $count;
-}
-
-
-
-__END__
-
-=pod
-
-=head1 NAME - name
-
-dna_compress.pl
-
-=head1 DESCRIPTION
-
-Converts compressed sequence in an ensembl database to uncompressed sequence
-or converts uncompressed sequence in an ensembl database to compressed
-sequence.
-
-=head1 SYNOPSIS
-
-    
-
-=head1 EXAMPLES
-
-Compress the contents of the dna table in an ensembl database:
-
-    dna_compress.pl -C -u ensadmin -p password -h host -d homo_sapiens_core_18_34 
-
-Uncompress the contents of the dnac table in an ensembl database
-
-    dna_compress.pl -U -u ensadmin -p password -h host -d homo_sapiens_core_18_34 
-
-
-=head1 FLAGS
-
-=over 4
-
-=item -h
-
-Displays short help
-
-=item -T
-
-Displays this help message
-
-=item -C
-
-Compress the dna in the database
-
-=item -U
-
-Uncompress the dna in the database
-
-=item -h
-
-The database host
-
-=item -p
-
-The database password
-
-=item -P
-
-The database port
-
-=item -u
-
-The database user
-
-=back
-
-=head1 VERSION HISTORY
-
-=over 4
-
-=item 14-Jul-2003
-
-B<th> initial release
-
-=item 08-Oct-2003
-
-B<mcvicker> rewrote for new schema and newly implemented sequence compression
-
-=back
-
-=head1 BUGS
-
-=head1 AUTHOR
-
-B<Tim Hubbard> Email th@sanger.ac.uk
-
-=cut
--- a/misc-scripts/parse_embl.pl
+++ b/misc-scripts/parse_embl.pl
-
-=head1 NAME
-
-parse_embl.pl
-
-=head1 SYNOPSIS
-
- parse_embl -dbname <db name> [-host <db host>] [-port <db port>]
-  [-user <db user>] [-pass <db pass>]
-  [-genetype <gene type>]
-  -codontable <int> -emblfile <file name>
-
-=head1 DESCRIPTION
-
-This is a script originally written by Keith James to load a pathogen
-genome from an EMBL flat file into an Otter (modified ensembl) database.
-It has been has been stripped down and modified to make it more general
-purpose.
-
-This script will peform simple loading of an annotated genome in EMBL
-flat file format into an Ensembl database. The script loads a fairly
-minimal set of gene information from an EMBL file, but could be
-extended to do more.
-
-Certain assumptions are made about the format of the EMBL file such
-that it may require pre-processing before loading - these are listed
-below. A number of non-standard tags (EMBL qualifiers) are used. I
-recommend using BioPerl to write a suitable pre-processor.
-
-=over
-
-=item *
-
-There may be multiple chromosomes/plasmids per EMBL file.
-
-=item *
-
-Each chromosome/plasmid must have a source feature defining the
-chromosome/plasmid name and type. They must all have different
-names. See EMBL feature table definition for 'chromosome' and
-'plasmid' tags.
-
-=item *
-
-Protein coding genes are annotated as CDS features.
-
-=item *
-
-All gene synonyms are defined by multiple 'gene' and/or 'synonym' tags.
-
-=item *
-
-The frame of a CDS feature is defined by a 'codon_start' tag if it is
-not in frame 0.
-
-=item *
-
-If a CDS feature has a 'pseudo' tag it is taken to be a pseudogene.
-
-=item *
-
-If a CDS feature has a 'partial' tag but fuzzy ends are not defined in
-its location it is assumed the ends have not been found.
-
-=item *
-
-Non-standard translation tables are currently not handled.
-
-=item *
-
-Annotator remarks about genes are taken from 'note' tags of
-features. Remarks about transcripts are taken from 'product', 'class'
-and 'EC_number' tags of features.
-
-=back
-
-Redirect STDERR to a file to obtain a rough log of the loading
-operations.
-
-Some functions in the script are simply stubs which may be implemented
-later. Some user assembly may be required.
-
-=head1 AUTHOR - Keith James
-
-Email kdj@sanger.ac.uk
-
-=head1 CONTRIBUTORS
-
-Stephen Searle, sjms@sanger.ac.uk
-
-Graham McVicker
-
-=head1 METHODS
-
-=cut
-
-use strict;
-use Getopt::Long;
-
-use Bio::SeqIO;
-use Bio::Location::Simple;
-
-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-
-use Bio::EnsEMBL::Exon;
-use Bio::EnsEMBL::Gene;
-use Bio::EnsEMBL::Slice;
-use Bio::EnsEMBL::CoordSystem;
-use Bio::EnsEMBL::Transcript;
-use Bio::EnsEMBL::Translation;
-use Bio::EnsEMBL::Analysis;
-
-use Bio::EnsEMBL::Utils::Exception qw(throw);
-
-###
-### Script globals.
-###
-
-my $MAX_SEQUENCE_LEN = 2e6;
-my $CURATED_LOGIC_NAME = 'Genome Reviews';
-
-###
-### Command line options
-###
-my $emblfile      = undef;
-my $host          = '127.0.0.1';
-my $user          = undef;
-my $pass          = undef;
-my $port          = 3306;
-my $dbname        = undef;
-my $gene_type     = 'known';
-
-GetOptions('emblfile:s'     => \$emblfile,
-           'dbname:s'       => \$dbname,
-           'user:s'         => \$user,
-           'pass:s'         => \$pass,
-           'port:n'         => \$port,
-           'host:s'         => \$host,
-           'genetype:s'     => \$gene_type);
-
-throw("-dbname argument required") if(!defined($dbname));
-throw("-emblfile argument required") if(!defined($emblfile));
-
-###
-### Get Ensembl DB adaptor
-###
-my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host   => $host,
-                                             -user   => $user,
-                                             -pass   => $pass,
-                                             -port   => $port,
-                                             -dbname => $dbname);
-
-
-###
-### Create new analysis object indicating a curated origin
-###
-my $analysis = Bio::EnsEMBL::Analysis->new(-logic_name => $CURATED_LOGIC_NAME);
-
-###
-### Create sequence object and load it
-###
-my $seqi = Bio::SeqIO->new(-file   => $emblfile,
-                           -format => 'embl');
-###
-### Main loading loop
-###
-while (my $seq = $seqi->next_seq) {
-  # Split sequence and return a slice
-  my $slice = load_sequence($dba, $seq);
-
-  foreach my $f ($seq->get_SeqFeatures) {
-    if ($f->primary_tag =~ /CDS/) {
-      store_gene($dba, $f, $slice);
-    }
-
-    if ($f->primary_tag =~ /repeat/) {
-      # store_repeat($f, $slice);
-    }
-  }
-}
-
-
-=head2 store_gene
-
- Title    : store_gene
- Function : Stores a gene feature (both protein- and RNA-coding)
- Returns  : void
- Argument : Bio::SeqFeatureI, Bio::EnsEMBL::DBSQL::SliceAdaptor
-
-=cut
-
-sub store_gene {
-  my ($db, $f, $slice) = @_;
-
-  my $gene_stable_id = get_gene_stable_id($f);
-
-  my $gene = Bio::EnsEMBL::Gene->new();
-
-  $gene->analysis($analysis);
-  $gene->type($gene_type);
-  $gene->stable_id($gene_stable_id);
-  $gene->version(1);
-  $gene->slice($slice);
-
-  print STDERR sprintf("Found CDS with ID %s\n", $gene_stable_id);
-
-  my $tcount = 0;
-  my $transcript_id = sprintf("%s.%d", $gene_stable_id, $tcount++);
-
-  my $transcript = Bio::EnsEMBL::Transcript->new;
-  $transcript->stable_id($transcript_id);
-  $transcript->version(1);
-  $transcript->slice($slice);
-
-  $gene->add_Transcript($transcript);
-
-  # Add the exons
-  my @exons = create_exons($f);
-  my $ecount = 0;
-  foreach my $exon (@exons) {
-    $exon->stable_id(sprintf("%s.%d", $gene_stable_id, $ecount++));
-    $exon->version(1);
-    $exon->slice($slice);
-    $transcript->add_Exon($exon);
-  }
-
-  if ($f->primary_tag =~ /RNA/) {
-    foreach my $exon (@exons) {
-      $exon->phase(-1);
-      $exon->end_phase(-1);
-    }
-  }
-
-  # Handle protein CDS features
-  if ($f->primary_tag eq 'CDS') {
-    # Based on /codon_start EMBL qualifier
-    my $frame = get_initial_frame($f);
-
-    @exons = @{ $transcript->get_all_Exons };
-
-    # This code assumes no UTRs
-    my $translation = Bio::EnsEMBL::Translation->new;
-    my $rcount = 0;
-    $translation->stable_id(sprintf("%s.%d", $gene_stable_id, $rcount++));
-    $translation->version(1);
-    $translation->start_Exon($exons[0]);
-    $translation->start(1 + $frame);
-    $translation->end_Exon($exons[$#exons]);
-    $translation->end($translation->end_Exon->length);
-
-    set_exon_phases($translation, @exons);
-
-    foreach my $exon (@exons) {
-      print STDERR sprintf("Added exon start: %d end: %d strand: %d phase: %d\n",
-                           $exon->start, $exon->end, $exon->strand, $exon->phase);
-    }
-
-    $transcript->translation($translation);
-
-    my $mrna_seq =
-      Bio::Seq->new(-seq      => $transcript->translateable_seq,
-                    -moltype  => "dna",
-                    -alphabet => 'dna',
-                    -id       => $translation->stable_id);
-
-    # Translate args: stop char, unknown aa char, frame,
-    # table, full CDS, throw
-    my $aa_seq = $transcript->translate()->seq();
-
-    print STDERR sprintf("Translation is: %s\n", $aa_seq);
-
-    if ($aa_seq =~ /\*/) {
-      print STDERR sprintf("Failed translating %s after phase setting\n",
-                           $translation->stable_id);
-    }
-  }
-
-  eval {
-    $db->get_GeneAdaptor->store($gene);
-  };
-  throw(sprintf("Failed loading %s\n%s\n", $gene_stable_id, $@)) if($@);
-}
-
-
-
-=head2 get_gene_stable_id
-
-  Arg [1]    : Bio::SeqFeatureI $f
-  Example    : my $stable_id = get_gene_stable_id($f);
-  Description: Tries to get a sensible stable identifier from a bioperl feature
-               created from an embl flat file.
-  Returntype : string
-  Exceptions : throw if cannot determine stable identifier
-  Caller     : 
-
-=cut
-
-sub get_gene_stable_id {
-  my $f = shift;
-
-  my $stable_id;
-
-  if($f->has_tag('locus_tag')) {
-    my @vals = $f->get_tag_values('locus_tag');
-    ($stable_id) = split(/\s+/,$vals[0]);
-    return $stable_id if($stable_id);
-  }
-
-  if($f->has_tag('gene')) {
-    my @vals = $f->get_tag_values('gene');
-    ($stable_id) = split(/\s+/,$vals[0]);
-    return $stable_id if($stable_id);
-  }
-
-  throw("Could not determine gene identifier\n");
-}
-
-
-=head2 create_exons
-
- Title    : create_exons
- Function : Returns a list of exons created from the location of
-            feature.
- Returns  : List of Bio::EnsEMBL::Exons
- Argument : Bio::SeqFeatureI
-
-=cut
-
-sub create_exons {
-  my $f = shift;
-
-  my @exons;
-
-  foreach my $loc ($f->location->each_Location) {
-    push(@exons, Bio::EnsEMBL::Exon->new(-start  => $loc->start,
-                                         -end    => $loc->end,
-                                         -strand => $loc->strand));
-
-    print STDERR sprintf("Creating exon at %d..%d on strand %d\n",
-                         $loc->start, $loc->end, $loc->strand);
-  }
-
-  if ($f->has_tag('pseudo')) {
-    foreach my $exon (@exons) {
-      $exon->phase(-1);
-      $exon->end_phase(-1);
-    }
-  }
-  else {
-    foreach my $exon (@exons) {
-      $exon->end_phase(0);
-    }
-  }
-
-  return @exons;
-}
-
-=head2 get_initial_frame
-
- Title    : get_initial_frame
- Function : Returns the frame specified by the codon_start tag of
-            feature.
- Returns  : int
- Argument : Bio::SeqFeatureI
-
-=cut
-
-sub get_initial_frame {
-  my $f = shift;
-
-  my $frame = 0;
-
-  if ($f->has_tag('codon_start')) {
-    my @vals = $f->get_tag_values('codon_start');
-    $frame = $vals[0] - 1;
-  }
-
-  return $frame;
-}
-
-=head2 set_exon_phases
-
- Title    : set_exon_phases
- Function : Sets the start and end phases of exons.
- Returns  : void
- Argument : Bio::Otter::AnnotatedTranscript, Bio::Ensembl::Exons
-
-=cut
-
-sub set_exon_phases {
-  my $translation = shift;
-  my @exons = @_;
-
-  my $found_start = 0;
-  my $found_end   = 0;
-  my $phase       = 0;
-
-  foreach my $exon (@exons) {
-    # Internal and end exons
-    if ($found_start && ! $found_end) {
-      $exon->phase($phase);
-      $exon->end_phase(($exon->length + $exon->phase) % 3);
-      $phase = $exon->end_phase;
-    }
-
-    if ($translation->start_Exon == $exon) {
-      $exon->phase($phase);
-      $exon->end_phase((($exon->length - $translation->start + 1)
-                        + $exon->phase) % 3);
-      $phase = $exon->end_phase;
-      $found_start = 1;
-    }
-
-    if ($translation->end_Exon == $exon) {
-      $found_end = 1;
-    }
-  }
-}
-
-
-
-=head2 get_seq_region_data
-
-  Arg [1]    : Bio::SeqI
-  Example    : my ($csname, $seq_region_name) = get_seq_region_data($seq);
-  Description: Gets the coordinate system name (e.g. 'chromosome', 'plasmid')
-               and the name of the seq_region
-  Returntype : pair of strings
-  Exceptions : none
-  Caller     : load_sequence
-
-=cut
-
-sub get_seq_region_data {
-  my $seq = shift;
-  my $type;
-  my $name;
-
-  foreach my $f ($seq->get_SeqFeatures) {
-    if ($f->primary_tag eq 'source') {
-      if ($f->start == 1 and $f->end == $seq->length) {
-        if ($f->has_tag('chromosome')){
-          $type = 'chromosome';
-          my @vals = $f->get_tag_values('chromosome');
-          $name = $vals[0];
-          $name =~ s/chr(omosome)\s+//i; #strip off chromosome prefix if present
-          last;
-        }
-
-        if ($f->has_tag('plasmid')) {
-          $type = 'plasmid';
-          my @vals = $f->get_tag_values('plasmid');
-          $name = $vals[0];
-          $name =~ s/plasmid\s+//i; #strip off plasmid prefix if present
-          last;
-        }
-      }
-    }
-  }
-
-  return ($type, $name);
-}
-
-
-=head2 load_sequence
-
-  Arg [1]    : Bio::EnsEMBL::DBAdaptor $db
-  Arg [2]    : Bio::SeqI $seq
-  Example    : load_sequence
-  Description: Loads the sequence for this organism into the database, creating
-               Coordinate system and seq_region as necessary
-  Returntype : none
-  Exceptions : none
-  Caller     : general
-
-=cut
-
-sub load_sequence {
-  my $db = shift;
-  my $seq = shift;
-
-  my ($csname, $seq_region) = get_seq_region_data($seq);
-
-  my $csa = $db->get_CoordSystemAdaptor();
-  my $cs = $csa->fetch_by_name($csname);
-
-  if(!$cs) {
-    $cs = Bio::EnsEMBL::CoordSystem->new(-NAME           => $csname,
-                                          -SEQUENCE_LEVEL => 1,
-                                          -RANK           => 1,
-                                          -DEFAULT        => 1);
-    $csa->store($cs);
-  }
-
-  my $slice = Bio::EnsEMBL::Slice->new(-SEQ_REGION_NAME => $seq_region,
-                                       -COORD_SYSTEM    => $cs,
-                                       -START           => 1,
-                                       -END             => $seq->length(),
-                                       -SEQ_REGION_LENGTH => $seq->length());
-
-
-  my $slice_adaptor = $db->get_SliceAdaptor();
-
-  my $sequence = $seq->seq();
-  $slice_adaptor->store($slice, \$sequence);
-
-  return $slice;
-}
-