diff --git a/misc-scripts/alternative_splicing/Fetch_gff.pl b/misc-scripts/alternative_splicing/Fetch_gff.pl deleted file mode 100644 index bb4303d879a80ec38be8c3bb700d778c6af4027a..0000000000000000000000000000000000000000 --- a/misc-scripts/alternative_splicing/Fetch_gff.pl +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/local/ensembl/bin/perl -w - -=head1 LICENSE - - Copyright (c) 1999-2013 The European Bioinformatics Institute and - Genome Research Limited. All rights reserved. - - This software is distributed under a modified Apache license. - For license details, please see - - http://www.ensembl.org/info/about/code_licence.html - -=head1 CONTACT - - Please email comments or questions to the public Ensembl - developers list at <dev@ensembl.org>. - - Questions may also be sent to the Ensembl help desk at - <helpdesk@ensembl.org>. - -=cut - -=head1 NAME - - Fetch_gff.pl - -=head1 AUTHORS - - Gautier Koscielny (koscieln@ebi.ac.uk) - -=head1 SYNOPSIS - - Fetch_gff.pl -dbhost host -dbuser ensro -dbname homo_sapiens_core_58_37c -output_file homo_sapiens_core_58_37c_variants.gff - -=head1 DESCRIPTION - -This script generates a GFF file containing all the transcript variants -from an Ensembl core database. - -here is an example commandline - - ./Fetch_gff.pl -dbhost host -dbuser user -dbname my_db -dbpass **** -output_file transcript_variants.gff - -=head1 OPTIONS - - -dbhost host name for database (gets put as host= in locator) - -dbname what name to connect to (dbname= in locator) - -dbuser what username to connect as (dbuser= in locator) - -dbpass what password to use (dbpass= in locator) - -chr which chromosome (optional) - -output_file|-o where the GFF output is written (optional, STDOUT by default) - -help displays this documentation with PERLDOC - -=cut - -use strict; -use warnings; - -use Bio::EnsEMBL::Feature; -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::Utils::Exception qw(throw); - -use POSIX qw(ceil); - -use Getopt::Long; -use Bio::EnsEMBL::Utils::Exception qw(throw warning); - -{ # block to avoid namespace pollution - my $host = ''; - my $port = ''; - my $dbname = ''; - my $dbuser = ''; - my $dbpass = ''; - my $chr = undef; - my $output_file = undef; - my $help; - my @coord_system; - - &GetOptions( - 'dbhost:s' => \$host, - 'dbport:n' => \$port, - 'dbname:s' => \$dbname, - 'dbuser:s' => \$dbuser, - 'dbpass:s' => \$dbpass, - 'chr:s' => \$chr, - 'output_file|o=s' => \$output_file, - 'h|help' => \$help, - ) or ($help = 1); - - if(!$host || !$dbuser || !$dbname){ - print STDERR "Can't get any information without database details\n"; - print STDERR "-dbhost '$host' -dbuser '$dbuser' -dbname '$dbname' ". - " -dbpass '$dbpass'\n"; - $help = 1; - } - - if ($help) { - exec('perldoc', $0); - } - - - my $output_stream; - - if (defined($output_file)) { - - open ($output_stream, ">$output_file") || throw "Can't open '$output_file' file for writing\n"; - - } else { - - $output_stream = \*STDOUT; - print STDERR "Will write GFF stream to the standard output.\n"; - } - - my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new - (-dbname => $dbname, - -host => $host, - -user => $dbuser, - -port => $port, - -pass => $dbpass); - - - my $gene_adaptor = $db->get_GeneAdaptor(); - my @stable_gene_ids = undef; - my $size = 0; - - if (defined($chr)) { - - my $slice_adaptor = $db->get_SliceAdaptor(); - my $slice = $slice_adaptor->fetch_by_region( 'chromosome', $chr ); - @stable_gene_ids = @{ $gene_adaptor->fetch_all_by_Slice($slice) }; - $size = scalar @stable_gene_ids; - print STDERR "Number of stable genes on region $chr:\t" . $size . "\n"; - - } else { - - @stable_gene_ids = @{$gene_adaptor->list_stable_ids()}; - $size = scalar @stable_gene_ids; - print STDERR "Number of stable ids:\t" . $size . "\n"; - } - - for my $id (@stable_gene_ids) { - - my $gene = ($chr) ? $id : $gene_adaptor->fetch_by_stable_id($id); - - my $gene_id = $gene->display_id(); - my $biotype = $gene->biotype(); - my $chr = $gene->slice->seq_region_name(); - my $strand = $gene->strand(); - my $start = $gene->start(); - my $end = $gene->end(); - - my @transcripts = @{$gene->get_all_Transcripts()}; - for my $transcript (@transcripts) { - - my $transcr_id = $transcript->display_id() ; ; - - #Get the exons + print info. - my $exons = $transcript->get_all_Exons() ; - - foreach my $exon (@$exons) { - my $exon_id = $exon->display_id() ; - my $exon_start = $exon->start() ; - my $exon_end = $exon->end() ; - my $exon_std = $exon->strand() ; - my $slice = $exon->slice->seq_region_name(); - $exon_std =~ s/-1/-/ ; - $exon_std =~ s/1/+/ ; - print $output_stream "$chr\tEnsembl\texon\t$exon_start\t$exon_end\t.\t$exon_std\t.\tgene_id \"$gene_id\"; transcript_id \"$transcr_id\"; exon_id \"$exon_id\"\n" ; - } - } - } - - exit 0; - -} diff --git a/misc-scripts/alternative_splicing/README.txt b/misc-scripts/alternative_splicing/README.txt deleted file mode 100644 index 5c1730b27182f4f2abbd04645becf73c1bd58e47..0000000000000000000000000000000000000000 --- a/misc-scripts/alternative_splicing/README.txt +++ /dev/null @@ -1,85 +0,0 @@ -********************************************************* -* Alternative Splicing Events Computations * -* * -* Authors: * -* Ian Longden * -* Gautier Koscielny * -* Please contact Ensembl helpdesk for more * -* information. * -* * -* Last update: 10/10/2011 * -* * -********************************************************* - -Table of contents - 0. Preamble - 1. Dump gene models in GFF - 2. Calculate the AS events - 3. Populate database with AS events - 4. All in one: pipeline script used at Sanger Institute - -0) Preamble ------------ - -Alternative splicing (AS) events computation is a three step process. -First, the transcript structures are fetch from the core database and -stored in a text file in GFF. -Then, a binary program called altSpliceFinder will compute all the events -and generate a GFF output of the events. -Finally, the events are uploaded in the core database. - -There is now a script that does all the job for you for every release of Ensembl. -If you're interested in computing Alternative splicing events for other species, -you can follow section 1) and 2) of this document. -If you want to run the pipeline from start to end in one go, there is a script -described in section 4) of this document. - - -1) Dump gene models in GFF --------------------------- - -use ensembl/misc-scripts/alternative_splicing/Fetch_gff.pl, i.e. - -perl Fetch_gff.pl -dbhost host1 -dbuser ro -dbname ianl_homo_sapiens_core_55_37 > ianl_homo_sapiens_core_55_37.gff - - -2) Calculate the AS events --------------------------- - -2.1 prerequisite - -the AltSplicingToolkit must be installed to compute the splicing events. -Please refer to the documentation in - -ensembl/misc-scripts/alternative_splicing/AltSplicingToolkit/INSTALL - -to install this toolkit. - -2.2 altSpliceFinder - -Run the altSpliceFinder binary program from the toolkit - -altSpliceFinder -i ianl_homo_sapiens_core_55_37.gff -o ianl_homo_sapiens_core_55_37_AS_events.gff - - -3) Populate database with AS events ------------------------------------ - -cat ianl_homo_sapiens_core_55_37_AS_events.gff | \ -perl load_alt_splice_gff.pl -user admin -pass XXX -dbname ianl_homo_sapiens_core_55_37 -host host1 - -4) All in one: pipeline script at Sanger ----------------------------------------- - -Run the script as_event_computations.sh with the following parameters: - -Usage: - as_event_computations.sh -h <dbhost> [-P <dbport>] -u <dbuser> [-p <dbpass>] - -s <species> [-d <dbname>] [-o <output_dir>] - -If the species name is passed, the script will find the corresponding core database on <dbhost>. -If the database name <dbname> is passed, the script will use this database as the core database. -By default, all intermediate results will be written in the /tmp directory. -Please use the -o parameter to pass a different existing writable directory. - --- diff --git a/misc-scripts/alternative_splicing/as_event_computations.sh b/misc-scripts/alternative_splicing/as_event_computations.sh deleted file mode 100755 index 9e4dd6ecdff8f3ae9264b74ce95b7fccb9df2a45..0000000000000000000000000000000000000000 --- a/misc-scripts/alternative_splicing/as_event_computations.sh +++ /dev/null @@ -1,163 +0,0 @@ -#!/bin/bash - -################################################# -# # -# Sanger Ensembl specific script to compute and # -# store Alternative splicing events in a core # -# database. # -# Author: Gautier Koscielny # -# e-mail: ensembl-dev mailing list # -# # -################################################# - -port=3306 -host="" -password="" -user="" -species="" -db="" -core="" -output_dir="/tmp" - -print_help () { - echo "Usage:"; - echo " as_event_computations.sh -h <dbhost> [-P <dbport>] -u <dbuser> [-p <dbpass>] -s <species> [-d <dbname>] [-o <output_dir>]"; - echo ""; - echo "If the species name is passed, the script will find the corresponding core database on <dbhost>."; - echo "If the database name <dbname> is passed, the script will use this database as the core database."; - echo "By default, all intermediate results will be written in the /tmp directory."; - echo "Please use the -o parameter to pass a different existing writable directory."; -} - -echo "Parsing parameters..." - -while getopts ":h:u:p:P:s:d:o:" optname - do - #echo $optname - case "$optname" in - "h") - echo "Hostname=$OPTARG"; - host=$OPTARG; - echo $host - ;; - "o") - echo "Output directory=$OPTARG"; - output_dir=$OPTARG; - ;; - "P") - echo "Port=$OPTARG"; - port=$OPTARG; - ;; - "p") - echo "Password=$OPTARG"; - password=$OPTARG; - ;; - "u") - echo "User=$OPTARG"; - user=$OPTARG - ;; - "d") - echo "Database=$OPTARG"; - db=$OPTARG - ;; - "s") - echo "Species=$OPTARG"; - species=$OPTARG - ;; - "?") - echo "Unknown option $OPTARG"; - print_help; - ;; - ":") - echo "No argument value for option $OPTARG" - print_help; - ;; - *) - # Should not occur - echo "Unknown error while processing options" - ;; - esac - #echo "OPTIND is now $OPTIND" - done - -if [[ -n "$host" && -n "$user" ]] -then - if [ -z "$db" ] - then - if [ -n "$species" ] - then - ## Find a core database for this species on the specified server. - if [[ -n "$password" && $password != "" ]] - then - db=`mysql -h${host} -P${port} -u${user} -p${password} -s --skip-column-names -e "show databases like '${species}\_core\_%'"` - else - db=`mysql -h${host} -P${port} -u${user} -s --skip-column-names -e "show databases like '${species}\_core\_%'"` - fi - else - echo "Species or/and database name are required." ; - print_help; - exit 1; - fi - else - echo "Using ${db} as core database."; - fi -else - echo "Hostname and username are required." - print_help; - exit 1; -fi - -# count how many databases match this name - -y=0; - -for X in ${db} -do - y=$[$y+1]; -done - -if [[ ${y} -eq 0 ]] -then - echo "Check your parameters, there is no database matching species '${species}'"; - exit 1; -fi - - -if [[ ${y} -gt 1 ]] -then - echo "Check your parameters, there are ${y} databases matching species '${species}':"; - echo "${db}"; - exit 1; -fi - -echo "Using ${db} as core database."; - -# Otherwise, start the pipeline - -NOW=$(date +"%Y-%m-%d-%H-%M-%S") - -echo "Starting pipeline with timestamp '${NOW}'"; - -echo "Fetch the gene models from ${db}..." -if [ -n "$password" ] -then - perl Fetch_gff.pl -dbname ${db} -dbhost ${host} -dbport ${port} -dbuser ${user} -dbpass ${password} -o ${output_dir}/${db}_${NOW}_variants.gff; -else - perl Fetch_gff.pl -dbname ${db} -dbhost ${host} -dbport ${port} -dbuser ${user} -o ${output_dir}/${db}_${NOW}_variants.gff; -fi - -echo "Compute the Alternative Splicing events"; -altSpliceFinder -i ${output_dir}/${db}_${NOW}_variants.gff -o ${output_dir}/${db}_${NOW}_events.gff --relax --statistics - -echo "Populate ${db} with alternative splicing information"; - -if [ -n "$password" ] -then - perl load_alt_splice_gff.pl -file ${output_dir}/${db}_${NOW}_events.gff -host ${host} -user ${user} -pass ${password} -dbname ${db} -else - echo "Sorry, you did not provide any password. The script can't populate the database with the Alternative splicing information."; - echo "However, the results are available in ${output_dir}/${db}_${NOW}_events.gff"; -fi - -echo "All done."; -exit 0; diff --git a/misc-scripts/alternative_splicing/generateSplicingEventTable.pl b/misc-scripts/alternative_splicing/generateSplicingEventTable.pl deleted file mode 100644 index afbf56ec38f2cf3f72fb066ab7a71b28f30d57c9..0000000000000000000000000000000000000000 --- a/misc-scripts/alternative_splicing/generateSplicingEventTable.pl +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/local/ensembl/bin/perl -w - -=head1 LICENSE - - Copyright (c) 1999-2013 The European Bioinformatics Institute and - Genome Research Limited. All rights reserved. - - This software is distributed under a modified Apache license. - For license details, please see - - http://www.ensembl.org/info/about/code_licence.html - -=head1 CONTACT - - Please email comments or questions to the public Ensembl - developers list at <dev@ensembl.org>. - - Questions may also be sent to the Ensembl help desk at - <helpdesk@ensembl.org>. - -=cut - -=head1 NAME - - generateSplicingEventTable.pl - -=head1 AUTHORS - - Gautier Koscielny (koscieln@ebi.ac.uk) - -=head1 SYNOPSIS - - generateSplicingEventTable.pl -dbhost host -dbuser ensro -dbname homo_sapiens_core_62_37g - -=head1 DESCRIPTION - -This script generates a tabular separated table containing all the splicing events for a set of genes. - -here is an example commandline - - ./generateSplicingEventTable.pl -dbhost host -dbuser user -dbname my_db -dbpass **** - -=head1 OPTIONS - - -dbhost host name for database (gets put as host= in locator) - -dbport port (gets put as port= in locator) - -dbname what name to connect to (dbname= in locator) - -dbuser what username to connect as (dbuser= in locator) - -dbpass what password to use (dbpass= in locator) - -help displays this documentation with PERLDOC - -=cut - -use strict; -use warnings; -use Getopt::Long; -use Bio::EnsEMBL::Utils::Exception qw(throw warning); -use Bio::EnsEMBL::Feature; -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::DBSQL::SplicingEventAdaptor; -use Bio::EnsEMBL::DBSQL::GeneAdaptor; -use Bio::EnsEMBL::DBSQL::AttributeAdaptor; -use Bio::EnsEMBL::DBSQL::ExonAdaptor; -use Bio::EnsEMBL::Utils::Exception qw(throw); - - -my $host = ''; -my $port = '3306'; -my $dbname = ''; -my $dbuser = ''; -my $dbpass = ''; -my $file = undef; -my $help; -my @coord_system; - -&GetOptions( - 'dbhost:s' => \$host, - 'dbport:n' => \$port, - 'dbname:s' => \$dbname, - 'dbuser:s' => \$dbuser, - 'file:s' => \$file, - 'dbpass:s' => \$dbpass, - 'h|help' => \$help, - ) or ($help = 1); - -if(!$host || !$dbuser || !$dbname) { - print STDERR "Can't get any information without database details\n"; - print STDERR "-dbhost $host -dbuser $dbuser -dbname $dbname ". - " -dbpass $dbpass\n"; - $help = 1; -} - -if ($help) { - exec('perldoc', $0); -} - -my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new - (-dbname => $dbname, - -host => $host, - -user => $dbuser, - -port => $port, - -pass => $dbpass); - -my $gene_adaptor = $db->get_GeneAdaptor(); -my $se_adaptor = $db->get_SplicingEventAdaptor(); -my $attrib_adaptor = $db->get_AttributeAdaptor(); -my $exon_adaptor = $db->get_ExonAdaptor(); -my @stable_gene_ids = @{$gene_adaptor->list_stable_ids()}; -my $size = scalar @stable_gene_ids; -print STDERR "Number of stable ids:\t" . $size . "\n"; - -my $count = 0; - -# example genes -my @genes = ("ENSG00000000419", "ENSG00000006282"); - -# don't know how to get this information easyly from the core API - -my $attrib_names = { - II => "Intron isoform", - IR => "Intron retention", - A5SS => "Alternative 5' splice site", - A3SS => "Alternative 3' splice site", - CE => "Cassette exon", - MXE => "Mutually exclusive exons", - EI => "Exon isoform", - ALE => "Alternative last exon", - AFE => "Alternative first exon" - -}; - -print " AS name\tType\tShort desc\tlocation\tdetails\n"; -print "+--------------------------------------------------------------------------+\n"; -for my $id (@genes) { - - $count++; - - my $gene = $gene_adaptor->fetch_by_stable_id($id); - - my $gene_id = $gene->display_id(); - my $biotype = $gene->biotype(); - my $chr = $gene->slice->seq_region_name(); - my $strand = $gene->strand(); - my $start = $gene->start(); - my $end = $gene->end(); - - # get all splicing events - - my @irs = (); - - for my $se ( sort { if ($a->type() eq $b->type()) { if ($a->strand == 1) { $a->start() <=> $b->start() } else { $a->end() <=> $b->end() } } } @ { $se_adaptor->fetch_all_by_Gene($gene) }) { - - next if ($se->type() eq 'CNE'); - - my $location = ($se->strand() == 1) ? ("+\t$chr:" . $se->start() . "-" . $se->end()) : ("-\t$chr:" . $se->end() . "-" . $se->start()); - - if ($se->type() eq 'IR') { - # special case for IR - push @irs, $se; - } else { - # display of the exons depends on the event - print $se->type() . "\t" . $attrib_names->{$se->type()} . "\t$location"; - print "\t" . get_event_display($se) . "\n"; - } - - } - - # finally, IR display if any - print get_IR_display(\@irs, $chr) if (scalar(@irs) > 0); - - # last if ($count >=3); -} - - -sub get_IR_display { - my ($ref, $chr) = @_; - - my $s = ''; - my $exons = {}; - - for my $se (sort { if ($a->strand == 1) { $a->start() <=> $b->start() } else { $a->end() <=> $b->end() } } @{ $ref }) { - - my @features = @{ $se->get_all_Features() }; - my %f_map = map { $_->exon_id() => $_ } @features; - - foreach my $exon_id (sort { $f_map{$a}->feature_order() <=> $f_map{$b}->feature_order() } keys %f_map) { - - my $f = $f_map{$exon_id}; - my $exon = $exon_adaptor->fetch_by_dbID($exon_id); - my $retained = $exon->stable_id() . "(" . $f->start() . "-" . $f->end() . ")"; - - if ($f->feature_order() == 1) { - - if (!defined($exons->{$exon->stable_id()})) { - my $location = ($se->strand() == 1) ? ("+\t$chr:" . $f->start() . "-" . $f->end()) : ("-\t$chr:" . $f->end() . "-" . $f->start()); - $s .= $se->type() . "\t" . $attrib_names->{$se->type()} . "\t$location\tIntron retained in: $retained\n"; - $exons->{$exon->stable_id()} = { - text => $se->type() . "\t" . $attrib_names->{$se->type()} . "\t$location\tIntron retained in: $retained\n", - start => $f->start(), - end => $f->end() - } - } - - } - } - - } - - return $s; -} - -sub get_event_display { - my $se = shift; - - my $s = ''; - my @features = @{ $se->get_all_Features() }; - my %f_map = map { $_->exon_id() => $_ } @features; - #print join("...", keys %f_map) . "\n"; - - if ($se->type() eq 'CE') { - - # what are the exons skipped and their coordinates? - $s = (scalar(keys %f_map) > 1) ? "Skipped exons: " : "Skipped exon: "; - my $nb = 0; - my @sl = (); - foreach my $exon_id (sort { $f_map{$a}->feature_order() <=> $f_map{$b}->feature_order() } keys %f_map) { - my $f = $f_map{$exon_id}; - my $exon = $exon_adaptor->fetch_by_dbID($exon_id); - push @sl, $exon->stable_id() . "(" . $f->start() . "-" . $f->end() . ")"; - - } - $s .= join("; ", @sl); - - } elsif ($se->type() eq 'MXE' || $se->type() eq 'AFE' || $se->type() eq 'ALE' ) { - - my $nb = 0; - my @s1 = (); - my @s2 = (); - - foreach my $exon_id (sort { $f_map{$a}->feature_order() <=> $f_map{$b}->feature_order() } keys %f_map) { - my $f = $f_map{$exon_id}; - my $exon = $exon_adaptor->fetch_by_dbID($exon_id); - my $s = $exon->stable_id() . "(" . $f->start() . "-" . $f->end() . ")"; - my $v = ($se->type() eq 'MXE') ? $f->transcript_association() : $f->feature_order(); - if ($v == 1) { - push @s1, $s; - } else { - push @s2, $s; - } - - } - $s = "Alternative exons: " . join("; ", @s1) . " / " . join("; ", @s2); - - } elsif ($se->type() eq 'II') { - - my $nb = 0; - my @s1 = (); - my @s2 = (); - - foreach my $exon_id (sort { $f_map{$a}->feature_order() <=> $f_map{$b}->feature_order() } keys %f_map) { - my $f = $f_map{$exon_id}; - my $exon = $exon_adaptor->fetch_by_dbID($exon_id); - my $s = $exon->stable_id() . "(" . $f->start() . "-" . $f->end() . ")"; - if ($f->transcript_association() == 1) { - push @s1, $s; - } else { - push @s2, $s; - } - - } - - $s = "Flanking exons: " . join("; ", @s1) . " / " . join("; ", @s2); - - } elsif ($se->type() eq 'EI') { - - my $nb = 0; - my @s1 = (); - my @s2 = (); - - foreach my $exon_id (sort { $f_map{$a}->feature_order() <=> $f_map{$b}->feature_order() } keys %f_map) { - my $f = $f_map{$exon_id}; - my $exon = $exon_adaptor->fetch_by_dbID($exon_id); - my $s = $exon->stable_id() . "(" . $f->start() . "-" . $f->end() . ")"; - if ($f->feature_order() == 1) { - push @s1, $s; - } else { - push @s2, $s; - } - - } - - $s = "Alternative exons: " . join("; ", @s1) . " / " . join("; ", @s2); - - } elsif ($se->type() eq 'A5SS' || $se->type() eq 'A3SS') { - - my $nb = 0; - my @s1 = (); - my @s2 = (); - - foreach my $exon_id (sort { $f_map{$a}->feature_order() <=> $f_map{$b}->feature_order() } keys %f_map) { - - my $f = $f_map{$exon_id}; - my $exon = $exon_adaptor->fetch_by_dbID($exon_id); - my $s = $exon->stable_id() . "(" . $f->start() . "-" . $f->end() . ")"; - - if ($f->feature_order() == 1) { - push @s1, $s; - } else { - push @s2, $s; - } - - } - - $s = "Alternative flanking exons: " . join("; ", @s1) . " / " . join("; ", @s2); - - } - - return $s; - -} - - -exit 0; diff --git a/misc-scripts/alternative_splicing/load_alt_splice_gff.pl b/misc-scripts/alternative_splicing/load_alt_splice_gff.pl deleted file mode 100644 index 24c3cd445c54335cf3cdf1b3314e7aff3516ceb2..0000000000000000000000000000000000000000 --- a/misc-scripts/alternative_splicing/load_alt_splice_gff.pl +++ /dev/null @@ -1,395 +0,0 @@ -use strict; -use warnings; - -use DBI; -use Getopt::Long; - -my $file; # if not specified use stdin; -my $host; -my $user; -my $pass; -my $dbname; -my $port = 3306; - - - -GetOptions ('file=s' => \$file, - 'host=s' => \$host, - 'user=s' => \$user, - 'pass=s' => \$pass, - 'port=s' => \$port, - 'dbname=s' => \$dbname); - - - -# -# NOTE: if no file is given then STDIN is used :-) -# - -if(defined($file)){ - open(GFF,"<$file") || die "Could not open $file for reading"; -} -else{ - open(GFF,"-") || die "Could not open STDIN for reading";; -} - - -my $dbi = dbi(); - -# First get the attrib_type_ids for the splicing event types - -my %ase_type; -my $ase_type_sql = 'select attrib_type_id, code, name from attrib_type where attrib_type_id between 300 and 311 order by attrib_type_id'; -my $ase_type_sth = $dbi->prepare($ase_type_sql); -$ase_type_sth->execute(); -my ($attrib_type_id, $ase_code, $ase_name); -$ase_type_sth->bind_columns(\$attrib_type_id, \$ase_code, \$ase_name); -while($ase_type_sth->fetch()){ - $ase_type{$ase_code} = $attrib_type_id; -} -$ase_type_sth->finish; - -my $name_sql = 'select s.seq_region_id, s.name from seq_region s, seq_region_attrib sra, attrib_type at where at.attrib_type_id = sra.attrib_type_id and at.name like "top_level" and s.seq_region_id = sra.seq_region_id'; - -my $name_sth = $dbi->prepare($name_sql); -$name_sth->execute; -my ($seq_region_id, $name); -my %name_to_seq_region_id; -$name_sth->bind_columns(\$seq_region_id, \$name); -while($name_sth->fetch()){ - $name_to_seq_region_id{$name}= $seq_region_id; -} -$name_sth->finish; - - -my %stable_id_to_dbid; # use for gene,transcripts and exons - - -foreach my $type (qw(gene transcript exon)){ - my $sth = $dbi->prepare("SELECT ".$type."_id, stable_id FROM ".$type); - $sth->execute; - my ($stable_id, $dbid); - $sth->bind_columns(\$dbid, \$stable_id); - while($sth->fetch){ - $stable_id_to_dbid{$stable_id} = $dbid; - } - $sth->finish; -} -my %exon_to_transcripts; - -my $et_sth = $dbi->prepare("select exon_id, transcript_id from exon_transcript"); -$et_sth->execute(); -my ($e_id, $t_id); -$et_sth->bind_columns(\$e_id, \$t_id); -while($et_sth->fetch){ - if(!defined($e_id)){ - print "no exon_id?? $t_id\n"; - next; - } - if(defined($exon_to_transcripts{$e_id})){ - push @{$exon_to_transcripts{$e_id}}, $t_id; - } - else{ - $exon_to_transcripts{$e_id} = [$t_id]; - } -} -$et_sth->finish; - - - -# Remove the previous data - -my $sth = $dbi->prepare("delete from splicing_event"); -$sth->execute || die "Could not delete entries from table splicing_event"; - -$sth = $dbi->prepare("delete from splicing_event_feature"); -$sth->execute || die "Could not delete entries from table splicing_event_feature"; - -$sth = $dbi->prepare("delete from splicing_transcript_pair"); -$sth->execute || die "Could not delete entries from table splicing_transcript_pair"; - - - -my $ins_SE_sql = "INSERT INTO splicing_event (splicing_event_id, name, gene_id, seq_region_id, seq_region_start, seq_region_end, seq_region_strand, attrib_type_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"; -my $ins_SE_sth = $dbi->prepare($ins_SE_sql); - -my $ins_SEF_sql = "INSERT INTO splicing_event_feature (splicing_event_feature_id, splicing_event_id, exon_id, transcript_id, feature_order, transcript_association, type, start, end) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"; -my $ins_SEF_sth = $dbi->prepare($ins_SEF_sql); - -my $ins_STP_sql = "INSERT INTO splicing_transcript_pair (splicing_transcript_pair_id, splicing_event_id, transcript_id_1, transcript_id_2) VALUES (?, ?, ?, ?)"; -my $ins_STP_sth = $dbi->prepare($ins_STP_sql); - - - -my %type_hash = ('CNE' => 0, 'CE' => 0, 'AFE'=> 0 ,"A5SS" => 0, "A3SS" => 0, "MXE" => 0, "EI" => 0, "II"=> 0, "IR" => 0, 'ALE' => 0, ); -my %missing_type; - - -my $sef_id = 1; -my $stp_id = 1; - - - - - -my $i = 0; -my $at_count = 0; -while( my $line = <GFF>){ - chomp $line; - $i++; - my ($chrom, $junk, $type, $start, $end, $junk2, $strand, $junk3, $feat) = split (/\t/, $line); - my $ase_attrib_type_id; - - if(!defined($type_hash{$type})){ - $missing_type{$type}++; - next; - } - else{ - $type_hash{$type}++; - $ase_attrib_type_id = $ase_type{$type}; - } - - if(!defined($feat)){ - next; - } - my (@array) = split (/;/,$feat); - my %feat_hash; - foreach my $f (@array){ - my ($key, $value) = split(/=/,$f); - $key =~ s/^ *//g; - if(defined($feat_hash{$key})){ - $feat_hash{$key} .= "-".$value; - } - else{ - $feat_hash{$key} = $value; - } - } - - - - my $gene_id = undef; - if(defined($feat_hash{"Derives_from"})){ - $gene_id = $stable_id_to_dbid{$feat_hash{"Derives_from"}}; - } - else{ - die "No Derives from for line\n$line\n"; - } - my $name = undef; - if(defined($feat_hash{"ID"})){ - $name = $feat_hash{"ID"}; - } - - my $seq_region_id = $name_to_seq_region_id{$chrom}; - - # transcript_event_id INT(10) UNSIGNED NOT NULL AUTO_INCREMENT, - # name VARCHAR(40), - # gene_id INT(10) UNSIGNED NOT NULL, - # seq_region_id INT(10) UNSIGNED NOT NULL, - # seq_region_start INT(10) UNSIGNED NOT NULL, - # seq_region_end INT(10) UNSIGNED NOT NULL, - # seq_region_strand INT(10) UNSIGNED NOT NULL, - # type ENUM('CNE','CE','AFE','A5SS','A3SS','MXE','IR','II','EI', 'AT'), - - my $strand_id = 0; - if($strand eq "-"){ - $strand_id = -1; - } - elsif($strand eq "+"){ - $strand_id =1; - } - else{ - print STDERR "No strand info?? $_\n"; - } - my $okay = 1; - if(!defined($gene_id)){ - $okay = 0; - print "ERROR: Could not get gene_id for ".$feat_hash{"Derives_from"}." from line $_\n"; - } - if(!defined($seq_region_id)){ - $okay = 0; - print "ERROR: Could not get seq_region_id for $chrom from line $_\n"; - } - $ins_SE_sth->execute($i,$name,$gene_id,$seq_region_id,$start,$end,$strand_id,$ase_attrib_type_id) if($okay); - # print SE "$i\t$name\t$gene_id\t$seq_region_id\t$start\t$end\t$strand_id\t$type($ase_attrib_type_id)\n" if($okay); - - if(defined($feat_hash{"Pair"})){ - my (@pairs) = split(/-/, $feat_hash{"Pair"}); - foreach my $pair (@pairs){ - my ($t1, $t2) = split(/,/,$pair); - $ins_STP_sth->execute($stp_id, $i, $stable_id_to_dbid{$t1}, $stable_id_to_dbid{$t2}); -# print STP "$stp_id\t$i\t".$stable_id_to_dbid{$t1}."\t".$stable_id_to_dbid{$t2}."\n"; - $stp_id++; - } - } - - if($type eq "CNE"){ - if(defined($feat_hash{"ConstitutiveExons"})){ - my (@exons) = split(/,/,$feat_hash{"ConstitutiveExons"}); - foreach my $exon (@exons){ - # splicing_event_id INT(10) UNSIGNED NOT NULL, - # splicing_event_feature_id - # exon_id INT(10) UNSIGNED NOT NULL, - # transcript_id - # feature_order INT(10) UNSIGNED NOT NULL, - # transcript_association INT(10) UNSIGNED NOT NULL, - # type ENUM('constitutive exon','exon','flanking_exon'), - # start INT(10) UNSIGNED NOT NULL, - # end INT(10) UNSIGNED NOT NULL, print TEF "$i\t - - my $exon_id = $stable_id_to_dbid{$exon}; - my $ta = 1; - foreach my $transcript_id (@{$exon_to_transcripts{$exon_id}}){ - $ins_SEF_sth->execute($sef_id, $i, $exon_id, $transcript_id, 1, $ta, "constitutive_exon", $start, $end); - # print SEF "$sef_id\t$i\t$exon_id\t$transcript_id\t1\t$ta\tconstitutive_exon\t$start\t$end\n"; - $ta++; - } - } - $sef_id++; - } - } - elsif($type eq "CE"){ - ##FeaturesA=ENSE00001541085[ENST00000374404:ENST00000399974],ENSE00001463431[ENST00000374404:ENST00000399974]; - my @features = split(/,/, $feat_hash{"FeaturesA"}); - my @start_end = split(/,/, $feat_hash{"SitesA"}); - my $index = 1; - while(defined($features[$index-1])){ - my $index_2 = index($features[$index-1], "\["); - my $exon_id = $stable_id_to_dbid{substr($features[$index-1], 0, $index_2)}; - my (@trans) = split( ":", substr($features[$index-1],$index_2+1, -1)); - my $ta = 1; - foreach my $t (@trans){ - my $transcript_id = $stable_id_to_dbid{$t}; - $start_end[$index-1] =~ /e\((\d+)-(\d+)/; - $ins_SEF_sth->execute($sef_id, $i, $exon_id, $transcript_id, $index, $ta, "exon", $1, $2); -# print SEF "$sef_id\t$i\t$exon_id\t$transcript_id\t$index\t$ta\texon\t$1\t$2\n"; - $ta++; - } - $sef_id++; - $index++; - } - - - } - elsif($type eq "IR"){ - my $index=1; - ### FeaturesA=ENSE00000673410[ENST00000373020]; - ### FeaturesB=ENSE00001731724[ENST00000431386],ENSE00001805604[ENST00000431386]; - ### SitesA=e(99890555-99890743); - ### SitesB=e(99890720-99890743),e(99890555-99890587); - - - - ### FeaturesA=ENSE00001299015[ENST00000341376:ENST00000353205]; - ### FeaturesB=ENSE00001550702[ENST00000407855],ENSE00001554966[ENST00000407855]; - ### SitesA=e(41065096-41070145); - ### SitesB=e(41068761-41068917),e(41069749-41069853) - foreach my $letter (qw(A B)){ - my $start_end = $feat_hash{"Sites".$letter}; - my @start; - my @end; - if($letter eq "B"){ - $start_end =~ /[e]\((\d+)-(\d+)\),[e]\((\d+)-(\d+)/; - $start[0] = $1; - $end[0] = $2; - $start[1] = $3; - $end[1] = $4; - } - else{ - $start_end = $feat_hash{"Sites".$letter}; - $start_end =~ /[ie]\((\d+)-(\d+)/; - $start[0] = $1; - $end[0] = $2; - $start[1] = $1; - $end[1] = $2; - } - - my @exons = split(/,/,$feat_hash{"Features".$letter}); - my $ec = 0; - foreach my $exon (@exons){ - my $index_2 = index($exon, "\["); - my $exon_id = $stable_id_to_dbid{substr($exon, 0, $index_2)}; - my (@trans) = split( ":", substr($exon,$index_2+1, -1)); - my $ta = 1; - foreach my $t (@trans){ - my $transcript_id = $stable_id_to_dbid{$t}; - $ins_SEF_sth->execute($sef_id, $i, $exon_id, $transcript_id, $index, $ta, "exon", $start[$ec], $end[$ec]); - $ta++; - } - } - $sef_id++; - $index++; - } - } - elsif($type eq "AFE" - or $type eq "ALE" - or $type eq "A5SS" - or $type eq "A3SS" - or $type eq "MXE" - or $type eq "EI" - or $type eq "IR" - or $type eq "II"){ - my $index=1; - ##FeaturesA=ENSE00001411023[ENST00000337248:ENST00000374404:ENST00000003583:ENST00000399974]; - ##FeaturesB=ENSE00001407060[ENST00000374409]; - ##alt FeaturesB=ENSE00001558073[ENST00000407906],ENSE00001410909[ENST00000404556]; - foreach my $letter (qw(A B)){ - my $start_end = $feat_hash{"Sites".$letter}; - $start_end =~ /[ie]\((\d+)-(\d+)/; - my $start = $1; - my $end = $2; - my @exons = split(/,/,$feat_hash{"Features".$letter}); - foreach my $exon (@exons){ - my $index_2 = index($exon, "\["); - my $exon_id = $stable_id_to_dbid{substr($exon, 0, $index_2)}; - my (@trans) = split( ":", substr($exon,$index_2+1, -1)); - my $ta = 1; - foreach my $t (@trans){ - my $transcript_id = $stable_id_to_dbid{$t}; - $ins_SEF_sth->execute($sef_id, $i, $exon_id, $transcript_id, $index, $ta, "exon", $start, $end); -# print SEF "$sef_id\t$i\t$exon_id\t$transcript_id\t$index\t$ta\texon\t$start\t$end\n"; - $ta++; - } - } - $sef_id++; - $index++; - } - } - - else{ - $missing_type{$type}++; -# print STDERR "AHH do not know type $type\n"; - } - -} - -close GFF; - -foreach my $type (keys %missing_type){ - print STDERR "Skipping ".$missing_type{$type}." entries of type $type as do not know what to do with them\n"; -} -print STDERR "\n\n"; -foreach my $type (keys %type_hash){ - print STDERR "Added ".$type_hash{$type}." entries of type $type\n"; -} - - - -sub dbi -{ - my $self = shift; - - if ( !defined $dbi || !$dbi->ping() ) { - my $connect_string = - sprintf( "dbi:mysql:host=%s;port=%s;database=%s", - $host, $port, $dbname ); - - $dbi = - DBI->connect( $connect_string, $user, $pass, - { 'RaiseError' => 1 } ) - or croak( "Can't connect to database: " . $DBI::errstr ); - $dbi->{'mysql_auto_reconnect'} = 1; # Reconnect on timeout - } - - return $dbi; -} -