Skip to content
Snippets Groups Projects
Commit fdb3dc37 authored by Patrick Meidl's avatar Patrick Meidl
Browse files

moved to ensembl/misc-scripts/utilities/

parent 7515622c
No related branches found
No related tags found
No related merge requests found
#!/usr/local/bin/perl -- # -*-Perl-*-
#
# Copyright (c) 2003 Tim Hubbard (th@sanger.ac.uk)
# Sanger Centre, Wellcome Trust Genome Campus, Cambs, UK
# All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation
#
# $Header$
# This is a driver script to populate and test the experimental dnac
# (compressed dna) table of ensembl. Use -T for pod based tutorial
use strict;
use Getopt::Std;
use vars qw($opt_H $opt_T
$opt_u $opt_d $opt_P $opt_h $opt_p $opt_C $opt_U);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
getopts("u:HTd:p:P:h:CU");
$|=1;
# specify defaults
my $def_u='ensadmin';
my $def_P='3306';
my $def_h='127.0.0.1';
if($opt_H){
&help;
}elsif($opt_T){
&help2;
}
sub help {
print <<ENDHELP;
usage:
dna_compress.pl [options]
-H for help
-T tutorial
-d database
-u user ensembl db user [$def_u]
-d db ensembl db
-P port port of mysql [$def_P]
-h host host of mysql [$def_h]
-p pass passwd for mysqluser
-C compress
-U uncompress
ENDHELP
exit 0;
}
sub help2 {
exec('perldoc', $0);
}
# defaults or options
$opt_u ||= $def_u;
$opt_h ||= $def_h;
$opt_P ||= $def_P unless $opt_P;
if(!$opt_d) {
print STDERR "ensembl db (-d) argument is required\n";
help();
}
if(!$opt_C && !$opt_U) {
print STDERR "either compress (-C) or uncompress (-U) is required\n";
help();
}
# db connection
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $opt_h,
-user => $opt_u,
-port => $opt_P,
-dbname => $opt_d,
-pass => $opt_p);
my $meta_container = $db->get_MetaContainer();
my $comp_seq_adaptor = $db->get_CompressedSequenceAdaptor();
my ($compressed) =
@{$meta_container->list_value_by_key('sequence.compression')};
if($opt_C) {
#compress the dna in the database
if($compressed) {
throw("The database meta table indicates this database already contains"
." sequence compression.\nIf this is not the case do the following:\n"
." mysql> delete from meta where meta_key = 'sequence.compression'\n");
}
#one more sanity check - make sure that dna table is populated and dnac table
#is not
if(get_dna_count($db) == 0) {
print STDERR "There is no uncompressed dna in this database to compress.";
exit(0);
}
if(get_dnac_count($db) != 0) {
throw("Unexpected: the dnac table already contains some rows.\n");
}
print STDERR "Compressing sequence and writing to dnac table.\n";
#get every slice that contains sequence
my $slice_adaptor = $db->get_SliceAdaptor();
foreach my $slice (@{$slice_adaptor->fetch_all('seqlevel')}) {
my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);
$comp_seq_adaptor->store($seq_region_id, $slice->seq);
print STDERR ".";
}
print STDERR "\nSetting sequence.compression meta table entry.\n";
$meta_container->delete_key('sequence.compression');
$meta_container->store_key_value('sequence.compression', 1);
print STDERR "Deleting uncompressed sequence.\n";
my $sth = $db->prepare("delete from dna");
$sth->execute();
print STDERR "All done.\n";
} else {
#uncompress the dna in the database
if(!$compressed) {
throw("The database meta table indicates that the sequence in this "
." database is already uncompressed.\nIf this is not the case do "
." the following:\n"
." mysql> delete from meta where meta_key = 'sequence.compression';\n"
." mysql> insert into meta (meta_key,meta_value) "
. "values('sequence.compression', 1);\n");
}
if(get_dnac_count($db) == 0) {
print STDERR "There is no compressed dna in this database to uncompress.";
exit(0);
}
if(get_dna_count($db) != 0) {
throw("Unexpected: the dna table already contains some rows.\n");
}
print STDERR "Setting sequence.compression meta table entry.\n";
$meta_container->delete_key('sequence.compression');
$meta_container->store_key_value('sequence.compression', 0);
print STDERR "Uncompressing sequence and writing to dna table.\n";
#get every slice that contains sequence
my $slice_adaptor = $db->get_SliceAdaptor();
my $seq_adaptor = $db->get_SequenceAdaptor();
if($seq_adaptor->isa('CompressedSequenceAdaptor')) {
throw("Tried to get SequenceAdaptor but got CompressedSequenceAdaptor.");
}
foreach my $slice (@{$slice_adaptor->fetch_all('seqlevel')}) {
my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);
my $seq = $comp_seq_adaptor->fetch_by_Slice_start_end_strand($slice);
$seq_adaptor->store($seq_region_id, $$seq);
print STDERR ".";
}
print STDERR "\nDeleting compressed sequence.\n";
my $sth = $db->prepare("delete from dnac");
$sth->execute();
print STDERR "All done.\n";
}
sub get_dna_count {
my $db = shift;
my $sth = $db->prepare("SELECT count(*) from dna");
$sth->execute();
my ($count) = $sth->fetchrow_array();
$sth->finish();
return $count;
}
sub get_dnac_count {
my $db = shift;
my $sth = $db->prepare("SELECT count(*) from dnac");
$sth->execute();
my ($count) = $sth->fetchrow_array();
$sth->finish();
return $count;
}
__END__
=pod
=head1 NAME - name
dna_compress.pl
=head1 DESCRIPTION
Converts compressed sequence in an ensembl database to uncompressed sequence
or converts uncompressed sequence in an ensembl database to compressed
sequence.
=head1 SYNOPSIS
=head1 EXAMPLES
Compress the contents of the dna table in an ensembl database:
dna_compress.pl -C -u ensadmin -p password -h host -d homo_sapiens_core_18_34
Uncompress the contents of the dnac table in an ensembl database
dna_compress.pl -U -u ensadmin -p password -h host -d homo_sapiens_core_18_34
=head1 FLAGS
=over 4
=item -h
Displays short help
=item -T
Displays this help message
=item -C
Compress the dna in the database
=item -U
Uncompress the dna in the database
=item -h
The database host
=item -p
The database password
=item -P
The database port
=item -u
The database user
=back
=head1 VERSION HISTORY
=over 4
=item 14-Jul-2003
B<th> initial release
=item 08-Oct-2003
B<mcvicker> rewrote for new schema and newly implemented sequence compression
=back
=head1 BUGS
=head1 AUTHOR
B<Tim Hubbard> Email th@sanger.ac.uk
=cut
=head1 NAME
parse_embl.pl
=head1 SYNOPSIS
parse_embl -dbname <db name> [-host <db host>] [-port <db port>]
[-user <db user>] [-pass <db pass>]
[-genetype <gene type>]
-codontable <int> -emblfile <file name>
=head1 DESCRIPTION
This is a script originally written by Keith James to load a pathogen
genome from an EMBL flat file into an Otter (modified ensembl) database.
It has been has been stripped down and modified to make it more general
purpose.
This script will peform simple loading of an annotated genome in EMBL
flat file format into an Ensembl database. The script loads a fairly
minimal set of gene information from an EMBL file, but could be
extended to do more.
Certain assumptions are made about the format of the EMBL file such
that it may require pre-processing before loading - these are listed
below. A number of non-standard tags (EMBL qualifiers) are used. I
recommend using BioPerl to write a suitable pre-processor.
=over
=item *
There may be multiple chromosomes/plasmids per EMBL file.
=item *
Each chromosome/plasmid must have a source feature defining the
chromosome/plasmid name and type. They must all have different
names. See EMBL feature table definition for 'chromosome' and
'plasmid' tags.
=item *
Protein coding genes are annotated as CDS features.
=item *
All gene synonyms are defined by multiple 'gene' and/or 'synonym' tags.
=item *
The frame of a CDS feature is defined by a 'codon_start' tag if it is
not in frame 0.
=item *
If a CDS feature has a 'pseudo' tag it is taken to be a pseudogene.
=item *
If a CDS feature has a 'partial' tag but fuzzy ends are not defined in
its location it is assumed the ends have not been found.
=item *
Non-standard translation tables are currently not handled.
=item *
Annotator remarks about genes are taken from 'note' tags of
features. Remarks about transcripts are taken from 'product', 'class'
and 'EC_number' tags of features.
=back
Redirect STDERR to a file to obtain a rough log of the loading
operations.
Some functions in the script are simply stubs which may be implemented
later. Some user assembly may be required.
=head1 AUTHOR - Keith James
Email kdj@sanger.ac.uk
=head1 CONTRIBUTORS
Stephen Searle, sjms@sanger.ac.uk
Graham McVicker
=head1 METHODS
=cut
use strict;
use Getopt::Long;
use Bio::SeqIO;
use Bio::Location::Simple;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Slice;
use Bio::EnsEMBL::CoordSystem;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Translation;
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Utils::Exception qw(throw);
###
### Script globals.
###
my $MAX_SEQUENCE_LEN = 2e6;
my $CURATED_LOGIC_NAME = 'Genome Reviews';
###
### Command line options
###
my $emblfile = undef;
my $host = '127.0.0.1';
my $user = undef;
my $pass = undef;
my $port = 3306;
my $dbname = undef;
my $gene_type = 'known';
GetOptions('emblfile:s' => \$emblfile,
'dbname:s' => \$dbname,
'user:s' => \$user,
'pass:s' => \$pass,
'port:n' => \$port,
'host:s' => \$host,
'genetype:s' => \$gene_type);
throw("-dbname argument required") if(!defined($dbname));
throw("-emblfile argument required") if(!defined($emblfile));
###
### Get Ensembl DB adaptor
###
my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $host,
-user => $user,
-pass => $pass,
-port => $port,
-dbname => $dbname);
###
### Create new analysis object indicating a curated origin
###
my $analysis = Bio::EnsEMBL::Analysis->new(-logic_name => $CURATED_LOGIC_NAME);
###
### Create sequence object and load it
###
my $seqi = Bio::SeqIO->new(-file => $emblfile,
-format => 'embl');
###
### Main loading loop
###
while (my $seq = $seqi->next_seq) {
# Split sequence and return a slice
my $slice = load_sequence($dba, $seq);
foreach my $f ($seq->get_SeqFeatures) {
if ($f->primary_tag =~ /CDS/) {
store_gene($dba, $f, $slice);
}
if ($f->primary_tag =~ /repeat/) {
# store_repeat($f, $slice);
}
}
}
=head2 store_gene
Title : store_gene
Function : Stores a gene feature (both protein- and RNA-coding)
Returns : void
Argument : Bio::SeqFeatureI, Bio::EnsEMBL::DBSQL::SliceAdaptor
=cut
sub store_gene {
my ($db, $f, $slice) = @_;
my $gene_stable_id = get_gene_stable_id($f);
my $gene = Bio::EnsEMBL::Gene->new();
$gene->analysis($analysis);
$gene->type($gene_type);
$gene->stable_id($gene_stable_id);
$gene->version(1);
$gene->slice($slice);
print STDERR sprintf("Found CDS with ID %s\n", $gene_stable_id);
my $tcount = 0;
my $transcript_id = sprintf("%s.%d", $gene_stable_id, $tcount++);
my $transcript = Bio::EnsEMBL::Transcript->new;
$transcript->stable_id($transcript_id);
$transcript->version(1);
$transcript->slice($slice);
$gene->add_Transcript($transcript);
# Add the exons
my @exons = create_exons($f);
my $ecount = 0;
foreach my $exon (@exons) {
$exon->stable_id(sprintf("%s.%d", $gene_stable_id, $ecount++));
$exon->version(1);
$exon->slice($slice);
$transcript->add_Exon($exon);
}
if ($f->primary_tag =~ /RNA/) {
foreach my $exon (@exons) {
$exon->phase(-1);
$exon->end_phase(-1);
}
}
# Handle protein CDS features
if ($f->primary_tag eq 'CDS') {
# Based on /codon_start EMBL qualifier
my $frame = get_initial_frame($f);
@exons = @{ $transcript->get_all_Exons };
# This code assumes no UTRs
my $translation = Bio::EnsEMBL::Translation->new;
my $rcount = 0;
$translation->stable_id(sprintf("%s.%d", $gene_stable_id, $rcount++));
$translation->version(1);
$translation->start_Exon($exons[0]);
$translation->start(1 + $frame);
$translation->end_Exon($exons[$#exons]);
$translation->end($translation->end_Exon->length);
set_exon_phases($translation, @exons);
foreach my $exon (@exons) {
print STDERR sprintf("Added exon start: %d end: %d strand: %d phase: %d\n",
$exon->start, $exon->end, $exon->strand, $exon->phase);
}
$transcript->translation($translation);
my $mrna_seq =
Bio::Seq->new(-seq => $transcript->translateable_seq,
-moltype => "dna",
-alphabet => 'dna',
-id => $translation->stable_id);
# Translate args: stop char, unknown aa char, frame,
# table, full CDS, throw
my $aa_seq = $transcript->translate()->seq();
print STDERR sprintf("Translation is: %s\n", $aa_seq);
if ($aa_seq =~ /\*/) {
print STDERR sprintf("Failed translating %s after phase setting\n",
$translation->stable_id);
}
}
eval {
$db->get_GeneAdaptor->store($gene);
};
throw(sprintf("Failed loading %s\n%s\n", $gene_stable_id, $@)) if($@);
}
=head2 get_gene_stable_id
Arg [1] : Bio::SeqFeatureI $f
Example : my $stable_id = get_gene_stable_id($f);
Description: Tries to get a sensible stable identifier from a bioperl feature
created from an embl flat file.
Returntype : string
Exceptions : throw if cannot determine stable identifier
Caller :
=cut
sub get_gene_stable_id {
my $f = shift;
my $stable_id;
if($f->has_tag('locus_tag')) {
my @vals = $f->get_tag_values('locus_tag');
($stable_id) = split(/\s+/,$vals[0]);
return $stable_id if($stable_id);
}
if($f->has_tag('gene')) {
my @vals = $f->get_tag_values('gene');
($stable_id) = split(/\s+/,$vals[0]);
return $stable_id if($stable_id);
}
throw("Could not determine gene identifier\n");
}
=head2 create_exons
Title : create_exons
Function : Returns a list of exons created from the location of
feature.
Returns : List of Bio::EnsEMBL::Exons
Argument : Bio::SeqFeatureI
=cut
sub create_exons {
my $f = shift;
my @exons;
foreach my $loc ($f->location->each_Location) {
push(@exons, Bio::EnsEMBL::Exon->new(-start => $loc->start,
-end => $loc->end,
-strand => $loc->strand));
print STDERR sprintf("Creating exon at %d..%d on strand %d\n",
$loc->start, $loc->end, $loc->strand);
}
if ($f->has_tag('pseudo')) {
foreach my $exon (@exons) {
$exon->phase(-1);
$exon->end_phase(-1);
}
}
else {
foreach my $exon (@exons) {
$exon->end_phase(0);
}
}
return @exons;
}
=head2 get_initial_frame
Title : get_initial_frame
Function : Returns the frame specified by the codon_start tag of
feature.
Returns : int
Argument : Bio::SeqFeatureI
=cut
sub get_initial_frame {
my $f = shift;
my $frame = 0;
if ($f->has_tag('codon_start')) {
my @vals = $f->get_tag_values('codon_start');
$frame = $vals[0] - 1;
}
return $frame;
}
=head2 set_exon_phases
Title : set_exon_phases
Function : Sets the start and end phases of exons.
Returns : void
Argument : Bio::Otter::AnnotatedTranscript, Bio::Ensembl::Exons
=cut
sub set_exon_phases {
my $translation = shift;
my @exons = @_;
my $found_start = 0;
my $found_end = 0;
my $phase = 0;
foreach my $exon (@exons) {
# Internal and end exons
if ($found_start && ! $found_end) {
$exon->phase($phase);
$exon->end_phase(($exon->length + $exon->phase) % 3);
$phase = $exon->end_phase;
}
if ($translation->start_Exon == $exon) {
$exon->phase($phase);
$exon->end_phase((($exon->length - $translation->start + 1)
+ $exon->phase) % 3);
$phase = $exon->end_phase;
$found_start = 1;
}
if ($translation->end_Exon == $exon) {
$found_end = 1;
}
}
}
=head2 get_seq_region_data
Arg [1] : Bio::SeqI
Example : my ($csname, $seq_region_name) = get_seq_region_data($seq);
Description: Gets the coordinate system name (e.g. 'chromosome', 'plasmid')
and the name of the seq_region
Returntype : pair of strings
Exceptions : none
Caller : load_sequence
=cut
sub get_seq_region_data {
my $seq = shift;
my $type;
my $name;
foreach my $f ($seq->get_SeqFeatures) {
if ($f->primary_tag eq 'source') {
if ($f->start == 1 and $f->end == $seq->length) {
if ($f->has_tag('chromosome')){
$type = 'chromosome';
my @vals = $f->get_tag_values('chromosome');
$name = $vals[0];
$name =~ s/chr(omosome)\s+//i; #strip off chromosome prefix if present
last;
}
if ($f->has_tag('plasmid')) {
$type = 'plasmid';
my @vals = $f->get_tag_values('plasmid');
$name = $vals[0];
$name =~ s/plasmid\s+//i; #strip off plasmid prefix if present
last;
}
}
}
}
return ($type, $name);
}
=head2 load_sequence
Arg [1] : Bio::EnsEMBL::DBAdaptor $db
Arg [2] : Bio::SeqI $seq
Example : load_sequence
Description: Loads the sequence for this organism into the database, creating
Coordinate system and seq_region as necessary
Returntype : none
Exceptions : none
Caller : general
=cut
sub load_sequence {
my $db = shift;
my $seq = shift;
my ($csname, $seq_region) = get_seq_region_data($seq);
my $csa = $db->get_CoordSystemAdaptor();
my $cs = $csa->fetch_by_name($csname);
if(!$cs) {
$cs = Bio::EnsEMBL::CoordSystem->new(-NAME => $csname,
-SEQUENCE_LEVEL => 1,
-RANK => 1,
-DEFAULT => 1);
$csa->store($cs);
}
my $slice = Bio::EnsEMBL::Slice->new(-SEQ_REGION_NAME => $seq_region,
-COORD_SYSTEM => $cs,
-START => 1,
-END => $seq->length(),
-SEQ_REGION_LENGTH => $seq->length());
my $slice_adaptor = $db->get_SliceAdaptor();
my $sequence = $seq->seq();
$slice_adaptor->store($slice, \$sequence);
return $slice;
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment