Skip to content
Snippets Groups Projects
Commit 46edfe0b authored by Graham McVicker's avatar Graham McVicker
Browse files

removed old perl ID mapping (v. 2) scripts

parent cb4f919a
No related branches found
No related tags found
No related merge requests found
# here we expect a list of exons from old and new database
# each exon contains exon_stable to mark it as mapped
# each exon contains the internal gene_id of the gene it is
# part of. This module tries to map the gene_stable ids over to
# the new exons.
package MapGenes;
use strict;
# starting with one exon, build_exon_group tries to build an
# 'envelope' around it. That is a set of genes and exons where neither
# of them misses parts. You call build_exon_group as often as needed
# until it doesnt produce new exons any more
my ( %newGeneHash, %oldGeneHash,
%oldExonHash, %newExonHash,
@oldPruneExon, @newPruneExon );
my $mappedGene;
# go through old Exons
# skip if gene is mapped
# build envelope if not
sub map_genes {
my ( $oldExonInfo, $newExonInfo ) = @_;
my %oldGenesMapped;
print STDERR ( "Start gene mapping ",scalar( localtime() ),"\n");
init_lookup_tables( $oldExonInfo, $newExonInfo );
for my $oExon ( @oldPruneExon ) {
if( ! exists $oExon->{'gene_mapped'} ) {
my @map_exons = build_envelope( $oExon, \@oldPruneExon, \@newPruneExon );
map_envelope( \@map_exons );
}
}
print STDERR ( "End gene mapping ",scalar( localtime() ),"\n");
print STDERR ( "Mapped Genes ",$mappedGene,"\n" );
}
sub map_envelope {
my $exon_stables = shift;
my %gene_map_hash;
my $common_exons = 0;
for my $exon_stable_id ( @$exon_stables ) {
if( exists $oldExonHash{ $exon_stable_id } &&
exists $newExonHash{ $exon_stable_id } ) {
my ( $oEx, $nEx, $hashkey );
$oEx = $oldExonHash{ $exon_stable_id };
$nEx = $newExonHash{ $exon_stable_id };
$common_exons++;
$hashkey = $oEx->{'gene_id'}."-".$nEx->{'gene_id'};
if( exists $gene_map_hash{$hashkey} ) {
$gene_map_hash{$hashkey}->[2] +=
$nEx->{'exon_end'}-$nEx->{'exon_start'}+1;
} else {
$gene_map_hash{$hashkey} = [ $oEx->{'gene_id'}, $nEx->{'gene_id'},
$nEx->{'exon_end'}-$nEx->{'exon_start'}+1,
$oEx->{'gene_stable'}];
}
}
}
my @sortedMappings = sort { $b->[2] <=> $a->[2] } values %gene_map_hash;
my ( %oldGenesMapped, %newGenesMapped );
for my $mapRecord ( @sortedMappings ) {
if( ! exists $oldGenesMapped{$mapRecord->[0]} &&
! exists $newGenesMapped{$mapRecord->[1]} ) {
$oldGenesMapped{$mapRecord->[0]} = $mapRecord->[1];
$newGenesMapped{$mapRecord->[1]} = $mapRecord->[0];
print ( "Gene mapped:\t",$mapRecord->[0], "\t",$mapRecord->[1],
"\t",$mapRecord->[3],".\n" );
$mappedGene++;
}
}
for my $exon_stable_id ( @$exon_stables ) {
if( exists $oldExonHash{ $exon_stable_id }) {
$oldExonHash{ $exon_stable_id }{'gene_mapped'} = 1;
}
if( exists $newExonHash{ $exon_stable_id }) {
$newExonHash{ $exon_stable_id }{'gene_mapped'} = 1;
}
}
}
sub init_lookup_tables {
my ( $oldExonInfo, $newExonInfo ) = @_;
# init the lookup tables
for my $exon ( @$newExonInfo ) {
if( exists $exon->{'exon_stable'} ) {
push( @{$newGeneHash{$exon->{'gene_id'}}}, $exon );
$newExonHash{ $exon->{'exon_stable'} } = $exon;
push( @newPruneExon, $exon );
}
}
for my $exon ( @$oldExonInfo ) {
# should exist for all of them ...
if( exists $newExonHash{ $exon->{'exon_stable'}} ) {
push( @{$oldGeneHash{$exon->{'gene_id'}}}, $exon );
$oldExonHash{ $exon->{'exon_stable'} } = $exon;
push( @oldPruneExon, $exon );
my $newExon = $newExonHash{ $exon->{'exon_stable'}};
# print "vvvv Mapping vvvv\n";
# ::print_exon( $exon );
# ::print_exon( $newExon );
# print "^^^^ Mapping ^^^^\n";
}
}
# print "Lookup initialized: ",scalar( localtime() ),"\n";
# print ( "old exons:",scalar( @oldPruneExon ),"\n" );
# print ( "new exons:",scalar( @newPruneExon ),"\n" );
}
# given one old exon, build envelope around it
# return set of stable ids which cover the
# given exon.
# That means: All old and new genes of these exons have all
# their stable_id exons in this set as well.
sub build_envelope {
my ( $exon, $oExonInfo, $nExonInfo ) = @_;
my $oGeneGroup = {};
my $nGeneGroup = {};
my $exonSet = {};
my $newExons = [];
push( @$newExons, $exon->{'exon_stable'} );
$newExons = build_exon_group( $oGeneGroup, $exonSet, $newExons,
\%oldGeneHash, \%oldExonHash );
while( scalar( @$newExons )) {
$newExons = build_exon_group( $nGeneGroup, $exonSet, $newExons,
\%newGeneHash, \%newExonHash );
if( scalar( @$newExons )) {
$newExons = build_exon_group( $oGeneGroup, $exonSet, $newExons,
\%oldGeneHash, \%oldExonHash );
}
}
# print ( "Exon envelope: ", join( " ", keys %$exonSet),"\n" );
# print ( "Old Genes:", join( " ", keys %$oGeneGroup ) ,"\n" );
# print ( "New Genes:", join( " ", keys %$nGeneGroup ) ,"\n\n" );
return keys %$exonSet;
}
# expand newExonSet that it covers all exons of its genes
# wiuth respect to one exon/gene set.
sub build_exon_group {
my ( $geneGroup, $exonSet, $newExonSet,
$geneLookup, $exonLookup ) = @_;
# print STDERR ( "Build group ", join( " ", @$newExonSet ),"\n" );
my (%newGenes, @newExons, $exon);
# get Genes from new Exons
# get Exons from this genes
for my $exon_stable ( @$newExonSet ) {
$exon = $exonLookup->{$exon_stable};
my $geneId;
$geneId = $exon->{'gene_id'};
if( ! defined $geneId ) {
print STDERR "undefined ";
&::print_exon( $exon );
}
if( !exists $geneGroup->{$geneId} ) {
$newGenes{$geneId} = 1;
$geneGroup->{$geneId} = 1;
}
}
for my $geneId ( keys %newGenes ) {
my $exons = $geneLookup->{$geneId};
for my $exon ( @$exons ) {
if( ! exists( $exonSet->{$exon->{'exon_stable'}} ) ) {
$exonSet->{$exon->{'exon_stable'}} = 1;
push( @newExons, $exon->{'exon_stable'} );
# print STDERR "Pushed Exon to Envelope\n";
}
}
}
# print STDERR ( "Exon envelope: ", join( " ", @newExons),"\n\n" );
return \@newExons;
}
1;
# here we expect a list of exons from old and new database
# each exon contains exon_stable to mark it as mapped
# each exon contains the internal transcript_id of the transcript it is
# part of. This module tries to map the transcript_stable ids over to
# the new exons.
package MapTranscripts;
use strict;
# starting with one exon, build_exon_group tries to build an
# 'envelope' around it. That is a set of transcripts and exons where neither
# of them misses parts. You call build_exon_group as often as needed
# until it doesnt produce new exons any more
my ( %newTranscriptHash, %oldTranscriptHash,
%oldExonHash, %newExonHash );
my $mappedTranscripts;
# go through old Exons
# skip if Transcript is mapped
# build envelope if not
sub map_transcripts {
my ( $oldExonInfo, $newExonInfo ) = @_;
my %oldGenesMapped;
print STDERR ( "Start transcript mapping ",scalar( localtime() ),"\n");
init_lookup_tables( $oldExonInfo, $newExonInfo );
for my $oExon ( @$oldExonInfo ) {
if( ! exists $oExon->{'transcript_mapped'} ) {
my @map_exons = build_envelope( $oExon, $oldExonInfo, $newExonInfo );
map_envelope( \@map_exons );
}
}
print STDERR ( "End transcript mapping ",scalar( localtime() ),"\n");
print STDERR ( "Mapped Transcripts ",$mappedTranscripts,"\n" );
}
sub map_envelope {
my $exon_stables = shift;
my %transcript_map_hash;
my $common_exons = 0;
my ( @oldExList, @newExList );
for my $exon_stable_id ( @$exon_stables ) {
if( exists $oldExonHash{ $exon_stable_id } &&
exists $newExonHash{ $exon_stable_id } ) {
my ( $oEx, $nEx, $hashkey );
@oldExList = @{$oldExonHash{ $exon_stable_id }};
@newExList = @{$newExonHash{ $exon_stable_id }};
for my $oEx ( @oldExList ) {
for my $nEx ( @newExList ) {
$hashkey = $oEx->{'transcript_id'}."-".
$nEx->{'transcript_id'};
if( exists $transcript_map_hash{$hashkey} ) {
$transcript_map_hash{$hashkey}->[2] +=
$nEx->{'exon_end'}-$nEx->{'exon_start'}+1;
} else {
$transcript_map_hash{$hashkey} =
[ $oEx->{'transcript_id'}, $nEx->{'transcript_id'},
$nEx->{'exon_end'}-$nEx->{'exon_start'}+1,
$oEx->{'transcript_stable'}];
}
}
}
}
}
my @sortedMappings = sort { $b->[2] <=> $a->[2] } values %transcript_map_hash;
my ( %oldTranscriptsMapped, %newTranscriptsMapped );
for my $mapRecord ( @sortedMappings ) {
if( ! exists $oldTranscriptsMapped{$mapRecord->[0]} &&
! exists $newTranscriptsMapped{$mapRecord->[1]} ) {
$oldTranscriptsMapped{$mapRecord->[0]} = $mapRecord->[1];
$newTranscriptsMapped{$mapRecord->[1]} = $mapRecord->[0];
print ( "Transcript mapped:\t",$mapRecord->[0], "\t",$mapRecord->[1],
"\t",$mapRecord->[3],"\n" );
$mappedTranscripts++;
}
}
for my $exon_stable_id ( @$exon_stables ) {
if( exists $oldExonHash{ $exon_stable_id }) {
for my $exon ( @{$oldExonHash{ $exon_stable_id }} ) {
$exon->{'transcript_mapped'} = 1;
}
}
if( exists $newExonHash{ $exon_stable_id }) {
for my $exon ( @{$newExonHash{ $exon_stable_id }} ) {
$exon->{'transcript_mapped'} = 1;
}
}
}
}
sub init_lookup_tables {
my ( $oldExonInfo, $newExonInfo ) = @_;
# problem: not all exons which are mapped ar emarked as such
# that happened if they are in more than one transcript
my %exHash= ();
for my $exon ( @$newExonInfo ) {
if( exists $exon->{'mapped'} ) {
$exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." mapped"} =
$exon->{'mapped'};
$exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." stable"} =
$exon->{'exon_stable'};
}
}
for my $exon ( @$newExonInfo ) {
if( ! exists $exon->{'mapped'} &&
( exists $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." mapped"} )) {
$exon->{'mapped'}= $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." mapped"};
$exon->{'exon_stable'}= $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." stable"};
}
push( @{$newTranscriptHash{$exon->{'transcript_id'}}}, $exon );
push( @{$newExonHash{ $exon->{'exon_stable'} }}, $exon );
}
%exHash = ();
for my $exon ( @$oldExonInfo ) {
if( exists $exon->{'mapped'} ) {
$exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}} =
$exon->{'mapped'};
}
}
for my $exon ( @$oldExonInfo ) {
if( ! exists $exon->{'mapped'} &&
( exists $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}} )) {
$exon->{'mapped'}= $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}};
}
if( exists $exon->{'mapped'} ) {
push( @{$oldTranscriptHash{$exon->{'transcript_id'}}}, $exon );
push( @{$oldExonHash{ $exon->{'exon_stable'} }}, $exon );
}
}
# remove all exons from $oldExonInfo which dont map
my $iter = 0;
while( $iter < scalar @$oldExonInfo ) {
my $exon = $oldExonInfo->[$iter];
if( ! exists $exon->{'mapped'} ) {
splice( @$oldExonInfo, $iter ,1 );
} else {
$iter++;
}
}
}
# given one old exon, build envelope around it
# return set of stable ids which cover the
# given exon.
# That means: All old and new genes of these exons have all
# their stable_id exons in this set as well.
sub build_envelope {
my ( $exon, $oExonInfo, $nExonInfo ) = @_;
my $oTranscriptGroup = {};
my $nTranscriptGroup = {};
my $exonSet = {};
my $newExons = [];
push( @$newExons, $exon->{'exon_stable'} );
$newExons = build_exon_group( $oTranscriptGroup, $exonSet, $newExons,
\%oldTranscriptHash, \%oldExonHash );
while( scalar( @$newExons )) {
$newExons = build_exon_group( $nTranscriptGroup, $exonSet, $newExons,
\%newTranscriptHash, \%newExonHash );
if( scalar( @$newExons )) {
$newExons = build_exon_group( $oTranscriptGroup, $exonSet, $newExons,
\%oldTranscriptHash, \%oldExonHash );
}
}
# print ( "Exon envelope: ", join( " ", keys %$exonSet),"\n" );
# print ( "Old Transcripts:", join( " ", keys %$oTranscriptGroup ) ,"\n" );
# print ( "New Transcripts:", join( " ", keys %$nTranscriptGroup ) ,"\n\n" );
return keys %$exonSet;
}
# expand newExonSet that it covers all exons of its transcripts
# with respect to one exon/transcript set.
sub build_exon_group {
my ( $transcriptGroup, $exonSet, $newExonSet,
$transcriptLookup, $exonLookup ) = @_;
# print STDERR ( "Build group ", join( " ", @$newExonSet ),"\n" );
my (%newTranscripts, @newExons, $exon);
# get Transcrips from new Exons
# get Exons from this Transcripts
for my $exon_stable ( @$newExonSet ) {
for my $exon ( @{$exonLookup->{$exon_stable}} ) {
my $transcriptId;
$transcriptId = $exon->{'transcript_id'};
if( ! defined $transcriptId ) {
print STDERR "undefined ";
&::print_exon( $exon );
}
if( !exists $transcriptGroup->{$transcriptId} ) {
$newTranscripts{$transcriptId} = 1;
$transcriptGroup->{$transcriptId} = 1;
}
}
}
for my $transcriptId ( keys %newTranscripts ) {
my $exons = $transcriptLookup->{$transcriptId};
for my $exon ( @$exons ) {
if( ! exists( $exonSet->{$exon->{'exon_stable'}} ) ) {
$exonSet->{$exon->{'exon_stable'}} = 1;
push( @newExons, $exon->{'exon_stable'} );
# print STDERR "Pushed Exon to Envelope\n";
}
}
}
# print STDERR ( "Exon envelope: ", join( " ", @newExons),"\n\n" );
return \@newExons;
}
1;
This document shell describe how id mapping in EnsEMBL is done.
All idmapping will be based on mapping exons from an old release of our
database to the next. It is assumed that the new and the old release
are available in the schema of the new release and the code of the new
release is used throughout. ( There will not be a DBOLD directory
involved with the idmapping code. )
1. Exon mapping
The basic proceeding is, to try to map all old exons over to the new
golden path.
1.1 Easy exon mapping
The first exons ids mapped to the new database are the ones which keep
their position on the same contig. (No contig version update).
Criteria: any overlap and sensitive handling of splits and merges.
(easier: 50% min overlap gets it )
1.2 New contig version
Some Exons might sit on contigs which are still on the golden path in
the new release, but in a different version. Does any of the new Exons
on this contigs look like the old Exon (exact string match or crossmatch)
Criteria: Crossmatch and 95% sequence identity.
Problem: more than one exon on the same clone is almost identical.
In this case use best match on clone..
1.25 Redundant clone information
Map exons on redundant clones to the new clone covering the same
area.
1.3 Sister Exons
Do we have unmapped Exons in old Genes which have sister Exons which
are mapped? Search for Exons in similar position on the new golden
path.
Remark on sequence comparison: Use a window around exon sequence to map
intronic sequence as well. Window 50bp around the exons.
1.6 So far unmapped.
Can we find a >95% sequence match somewhere in the genome?
If not consider this exon not to be mappable.
2. Transcript, Gene mapping
This mapping happens on the basis of the mapped exons. We screen
through the old gene, transcript objects and look if the majority of
their exons were mapped to a corresponding new object. Then the id is
transferred to that object.
In case of a merger, split or general mix of old objects, we transfer
ids for the biggest number of exons translated from one old to one new
first, then for the second biggest and so on.
# This module encapsulates the SQL needed for the idmapping
# Author: Arne Stabenau 3.10.2001
# ( why not use EnsEMBL here? Currently to slow?? This piece of software is
# going to be replaced very soon by a java version )
use DBI;
package SQL;
# takes a database handle and produces all old exon information needed
sub old_orig_exon_information {
my $dbh = shift;
my %res;
my $sth = $dbh->prepare( "
SELECT exon_stable, transcript_stable,
gene_stable, exon_id, gene_id, transcript_id,
clone_id, clone_version, sticky_rank,
contig_offset, contig_length,
exon_start, exon_end, exon_strand,
chr_name, chr_start, chr_end,
raw_start, raw_end, raw_ori
FROM exon_temp
ORDER by clone_id
" );
# LIMIT 5000" );
$sth->execute();
my $arrRef = _store_in_arrayref( $sth );
for my $exon ( @$arrRef ) {
calc_chr_coord( $exon );
}
return $arrRef;;
}
sub old_target_exon_information {
my $dbh = shift;
my $sth = $dbh->prepare( "
SELECT exon_id, transcript_id, gene_id,
clone_id, clone_version, sticky_rank,
contig_offset, contig_length,
exon_start, exon_end, exon_strand,
chr_name, chr_start, chr_end,
raw_start, raw_end, raw_ori
FROM exon_temp
ORDER by clone_id
" );
# LIMIT 5000");
$sth->execute();
my $arrRef = _store_in_arrayref( $sth );
for my $exon ( @$arrRef ) {
calc_chr_coord( $exon );
}
return $arrRef;
}
sub orig_exon_information {
my $dbh = shift;
my %res;
# $dbh->do( "drop table exon_temp" );
# CREATE TABLE exon_temp
my $sth = $dbh->prepare( "
SELECT esi.stable_id as exon_stable, tsi.stable_id as transcript_stable,
gsi.stable_id as gene_stable, e.exon_id as exon_id, t.gene_id as gene_id,
t.transcript_id as transcript_id,
cl.embl_id as clone_id, cl.embl_version as clone_version,
c.offset as contig_offset, c.length as contig_length,
e.seq_start as exon_start, e.seq_end as exon_end, e.strand as exon_strand,
e.sticky_rank as sticky_rank,
sgp.chr_name as chr_name, sgp.chr_start as chr_start, sgp.chr_end as chr_end,
sgp.raw_start as raw_start, sgp.raw_end as raw_end, sgp.raw_ori as raw_ori
FROM exon e, exon_transcript et, exon_stable_id esi, contig c, clone cl,
static_golden_path sgp, transcript t, gene_stable_id gsi,
transcript_stable_id tsi
WHERE e.exon_id = et.exon_id
AND esi.exon_id = e.exon_id
AND et.transcript_id = t.transcript_id
AND tsi.transcript_id = et.transcript_id
AND t.gene_id = gsi.gene_id
AND c.internal_id = e.contig_id
AND cl.internal_id = c.clone
AND sgp.raw_id = c.internal_id
AND e.sticky_rank = 1
ORDER by clone_id, clone_version,contig_offset, exon_strand
" );
$sth->execute();
# ORDER by gene_stable, transcript_stable, exon_stable, e.sticky_rank
return _store_in_arrayref( $sth );
}
sub target_exon_information {
my $dbh = shift;
# $dbh->do( "drop table exon_temp" );
# CREATE TABLE exon_temp
my $sth = $dbh->prepare( "
SELECT e.exon_id, t.transcript_id, t.gene_id,
cl.embl_id as clone_id, cl.embl_version as clone_version,
c.offset as contig_offset, c.length as contig_length,
e.seq_start as exon_start, e.seq_end as exon_end, e.strand as exon_strand,
e.sticky_rank as sticky_rank,
sgp.chr_name as chr_name, sgp.chr_start as chr_start, sgp.chr_end as chr_end,
sgp.raw_start as raw_start, sgp.raw_end as raw_end, sgp.raw_ori as raw_ori
FROM exon e, exon_transcript et, contig c, clone cl,
static_golden_path sgp, transcript t
WHERE e.exon_id = et.exon_id
AND et.transcript_id = t.transcript_id
AND c.internal_id = e.contig_id
AND cl.internal_id = c.clone
AND sgp.raw_id = c.internal_id
AND e.sticky_rank = 1
ORDER by clone_id, clone_version,contig_offset, exon_strand
" );
$sth->execute();
return _store_in_arrayref( $sth );
}
sub _store_in_arrayref {
my $sth = shift;
my $res = [];
my $count = 0;
while( my $hr = $sth->fetchrow_hashref() ) {
my %tempHash = %$hr;
push( @$res, \%tempHash );
}
return $res;
}
sub exon_sequence {
my $dbh = shift;
my $exon_id = shift;
my $exonSeq;
my $sth = $dbh->prepare( "
SELECT e.exon_id, substring( d.sequence, e.seq_start, e.seq_end-e.seq_start+1 ),
e.seq_start as start, e.seq_end as end,
e.strand as strand, e.sticky_rank as rank
FROM exon e, contig c, dna d
WHERE e.contig_id = c.internal_id
AND c.dna = d.id
AND e.exon_id = ?
ORDER by rank
" );
$sth->execute( $exon_id );
while( my $arrRef = $sth->fetchrow_arrayref ) {
my $part = $arrRef->[1];
if( $arrRef->[4] != 1 ) {
$part = reverse( $part );
$part =~ tr/ACGTacgt/TGCATGCA/;
}
$exonSeq .= $part;
}
return $exonSeq;
}
sub calc_chr_coord {
my $ex = shift;
my ( $chr_start, $chr_end, $chr_strand );
$chr_strand = $ex->{'exon_strand'} * $ex->{'raw_ori'};
if( $ex->{'raw_ori'} == 1 ) {
$chr_start = $ex->{'exon_start'} + $ex->{'chr_start'} - $ex->{'raw_start'};
$chr_end = $ex->{'exon_end'} + $ex->{'chr_start'} - $ex->{'raw_start'};
} else {
$chr_start = $ex->{'chr_start'} + $ex->{'raw_end'} - $ex->{'exon_end'};
$chr_end = $ex->{'chr_start'} + $ex->{'raw_end'} - $ex->{'exon_start'};
}
$ex->{'chr_start'} = $chr_start;
$ex->{'chr_end' } = $chr_end;
$ex->{'chr_strand' } = $chr_strand;
}
1;
# read the mapping out put file
# have the first free id in the variables
# go through new exons.
# mapped, make a exon_stable_id row
# not mapped, generate a new stable_id
# go through genes with same technique
# go through transcript
# if mapped, retrieve the translation_stable_id
use strict;
use DBI;
my $today = "2002-02-20";
my $sdbh = DBI->connect( "DBI:mysql:host=ecs1d;database=homo_sapiens_core_130", "ensro", "");
my $tdbh = DBI->connect( "DBI:mysql:host=ecs1e;database=ens_NCBI_28", "ensro", "" );
my $starttime = scalar( localtime() );
my $sth;
my $resultfile = "map_130_28.txt";
my %stable_id;
for my $table ( "gene_stable_id", "exon_stable_id",
"transcript_stable_id", "translation_stable_id" ) {
$sth = $sdbh->prepare( "select max( stable_id ) from $table" );
$sth->execute();
( $stable_id{$table} ) = $sth->fetchrow_array();
}
# print join( " ", %stable_id ),"\n";
# exit;
exon_stables();
gene_stables();
trans_stables();
sub exon_stables {
my %exonTranslate;
open( RES, $resultfile );
while( <RES> ) {
/Exon mapped:/ || next;
my @cols;
chomp;
@cols = split( "\t",$_ );
$exonTranslate{$cols[2]} = $cols[3];
}
close( RES );
$sth = $tdbh->prepare( "select distinct(exon_id) from exon" );
$sth->execute;
open( EXON, ">exon_stable_id.txt" ) or die;
while( my ( $exonId ) = $sth->fetchrow_array() ) {
if( ! exists $exonTranslate{$exonId} ) {
$stable_id{'exon_stable_id'}++;
print EXON ( join( "\t",($exonId, $stable_id{'exon_stable_id'},"1\t$today\t$today")),"\n" );
} else {
print EXON ( join( "\t",($exonId, $exonTranslate{$exonId},"1\t$today\t$today")),"\n" );
}
}
close( EXON );
}
sub gene_stables {
my %geneTranslate;
open( RES, $resultfile );
while( <RES> ) {
/Gene mapped:/ || next;
my @cols;
chomp;
@cols = split( "\t",$_ );
$geneTranslate{$cols[2]} = $cols[3];
}
close( RES );
$sth = $tdbh->prepare( "select gene_id from gene" );
$sth->execute;
open( GEN, ">gene_stable_id.txt" ) or die;
while( my ( $geneId ) = $sth->fetchrow_array() ) {
if( ! exists $geneTranslate{$geneId} ) {
$stable_id{'gene_stable_id'}++;
print GEN ( join( "\t",($geneId, $stable_id{'gene_stable_id'},"1\t$today\t$today")),"\n");
} else {
print GEN ( join( "\t",($geneId, $geneTranslate{$geneId},"1\t$today\t$today")),"\n" );
}
}
close ( GEN );
}
sub trans_stables {
my %transcriptHash;
my %transcriptTranslation;
open( RES, $resultfile );
while( <RES> ) {
/Transcript mapped:/ || next;
my @cols;
chomp;
@cols = split( "\t",$_ );
$transcriptHash{$cols[2]} = $cols[3];
}
close( RES );
$sth = $sdbh->prepare( "select tsi.stable_id, tli.stable_id
from transcript_stable_id tsi,
transcript t, translation_stable_id tli
where t.transcript_id = tsi.transcript_id
and t.translation_id = tli.translation_id" );
$sth->execute;
while( my $arrref = $sth->fetchrow_arrayref() ) {
$transcriptTranslation{$arrref->[0]} = $arrref->[1];
}
$sth = $tdbh->prepare( "select transcript_id, translation_id from transcript" );
$sth->execute();
open( TSC, ">transcript_stable_id.txt" ) or die;
open( TSL, ">translation_stable_id.txt" ) or die;
while( my ( $tsc, $tsl ) = $sth->fetchrow_array() ) {
if( ! exists $transcriptHash{$tsc} ) {
$stable_id{'transcript_stable_id'}++;
$stable_id{'translation_stable_id'}++;
print TSC ( join( "\t",($tsc, $stable_id{'transcript_stable_id'},"1")),"\n");
print TSL ( join( "\t",($tsl, $stable_id{'translation_stable_id'},"1")),"\n");
} else {
my $stable_id = $transcriptHash{$tsc};
print TSC ( join( "\t",($tsc, $stable_id,"1")),"\n");
print TSL ( join( "\t",($tsl, $transcriptTranslation{$stable_id},"1")),"\n");
}
}
close ( TCS );
close ( TSL );
}
# this script removes exon duplicates from the ensembl database
# An Exon duplicate is an exon whose start end strand contig phase information
# is the same to another one.
# step one: gather all exon information
# step two build mappings from internal ids to internal ids
# step three change exon_id in exon_transcript
# remove superfluous exon records
# remove exon_stable_id records and record which stable_ids were
# included from which others.
use DBI;
use strict;
my ( %exonHash, %stickies );
my ( $rowsRead, $nrExons );
my %exonTranslate;
my $dbh = DBI->connect( "DBI:mysql:host=ecs1d;database=ens_UCSC_0801", "ensadmin", "ensembl");
my $sth = $dbh->prepare( "select exon_id, contig_id, seq_start, seq_end, strand, phase, end_phase, sticky_rank from exon order by exon_id, sticky_rank desc" );
$sth->execute();
while( my $hr = $sth->fetchrow_hashref ) {
$rowsRead++;
my $hashkey = join( "\t", ( $hr->{'contig_id'}, $hr->{'seq_start'}, $hr->{'seq_end'},
$hr->{'strand'}, $hr->{'phase'}, $hr->{'end_phase'} ));
push( @{$exonHash{$hashkey}},[ $hr->{'exon_id'}, $hr->{'sticky_rank'} ] );
if( $hr->{'sticky_rank'} > 1 ) {
$stickies{$hr->{'exon_id'}} = 1;
}
}
$nrExons = scalar( keys %exonHash );
print ( "Exons in table: ", $rowsRead,"\n" );
print ( "Nonredundant: ",$nrExons,"\n" );
# build equivalence classes
my @equClass;
my %allExons;
my $i = 0;
for my $hashkey ( keys %exonHash ) {
$equClass[$i] = { 'position' => $hashkey,
'exon_id' => undef };
for my $exon ( @{$exonHash{$hashkey}} ) {
$allExons{$exon->[0]}->[$exon->[1]-1] = $i;
}
$i++;
}
# make new equclasses by combining old ones
my %combinedEqu;
for my $exonid ( keys %allExons ) {
if( scalar( @{$allExons{$exonid}} ) > 1 ) {
my $newEqKey = join( "-",@{$allExons{$exonid}} );
my $newEq;
# print STDERR "New equclass made $newEqKey\n";
if( ! exists $combinedEqu{$newEqKey} ) {
$equClass[$i] = { 'position' => $newEqKey,
'exon_id' => undef };
$combinedEqu{$newEqKey} = $i;
$newEq = $i;
$i++;
} else {
$newEq = $combinedEqu{$newEqKey};
}
$allExons{$exonid} = [$newEq];
}
}
# now give away exon_ids for equClasses
for my $exonid ( keys %allExons ) {
my $equNo = $allExons{$exonid}->[0];
if( defined $equClass[$equNo]->{'exon_id'} ) {
$exonTranslate{$exonid} = $equClass[$equNo]->{'exon_id'}
} else {
$equClass[$equNo]->{'exon_id'} = $exonid;
}
}
print "Translation entries: ",scalar( keys %exonTranslate ),"\n";
for my $exon ( keys %exonTranslate ) {
print "Map:\t",$exon,"\t",$exonTranslate{$exon},"\n";
}
# dumping_data
$dbh->disconnect;
exit;
sub dumping_data {
# dump and filter exon, exon_transcript, exon_stable_id
# should do supporting feature I guess ...
# this is ONE OFF,
$sth = $dbh->prepare( "select * from exon" );
$sth->execute();
open ( EXON, ">exon.txt" ) or die;
while( my $arr = $sth->fetchrow_arrayref() ) {
if( ! defined $exonTranslate{$arr->[0]} ) {
print EXON join( "\t", @$arr ),"\n";
}
}
close( EXON );
open ( EXONT, ">exon_transcript.txt" ) or die;
$sth = $dbh->prepare( "select * from exon_transcript" );
$sth->execute();
while( my $arr = $sth->fetchrow_arrayref() ) {
if( defined $exonTranslate{$arr->[0]} ) {
$arr->[0] = $exonTranslate{$arr->[0]};
}
print EXONT join( "\t", @$arr ),"\n";
}
close( EXONT );
# open ( EXONS, ">exon_stable_id.txt" ) or die;
# $sth = $dbh->prepare( "select * from exon_stable_id" );
# $sth->execute();
# while( my $arr = $sth->fetchrow_arrayref() ) {
# if( ! defined $exonTranslate{$arr->[0]} ) {
# print EXONS join( "\t", @$arr ),"\n";
# }
# }
# close( EXONS );
open ( SUPF, ">supporting_feature.txt" ) or die;
$sth = $dbh->prepare( "select * from supporting_feature" );
$sth->execute();
while( my $arr = $sth->fetchrow_arrayref() ) {
if( defined $exonTranslate{$arr->[0]} ) {
$arr->[1] = $exonTranslate{$arr->[1]};
}
print SUPF join( "\t", @$arr ),"\n";
}
close( SUPF );
}
# this script produces idmapping for genes, transcripts, translations and exons
# It needs Database orig with stable_ids set
# it needs Database target where the stable ids are to be set
# it produces log output
# it uses direct sql for speed. All queries are in SQL.pm
# it uses LSF for some jobs I guess ...
use DBI;
use SQL;
use Bio::PrimarySeq;
use strict;
use Bio::EnsEMBL::Pipeline::Runnable::CrossMatch;
use MapGenes;
use MapTranscripts;
my $direct_maps = 0;
my $crossmatches = 0;
my $cloneGroups = 0;
my $seqMatches = 0;
my $cmmaps = 0;
my $lasttime;
my $sdbh = DBI->connect( "DBI:mysql:host=ecs1d;database=homo_sapiens_core_130", "ensro", "");
my $tdbh = DBI->connect( "DBI:mysql:host=ecs1e;database=ens_NCBI_28", "ensro", "");
my $starttime = scalar( localtime() );
print STDERR "Start: ",scalar(localtime()),"\n";
my $alloExonInfo = &SQL::orig_exon_information( $sdbh );
my $allnExonInfo = &SQL::target_exon_information( $tdbh );
# remove exon doublettes caused by multiple transcripts
my ( %exonHash, %geneHash );
my ( $oExonInfo, $nExonInfo, @exonInfo );
my ( $oGenes, $nGenes );
for my $exon ( @$alloExonInfo ) {
if( ! exists $exonHash{ $exon->{'exon_id'}."-".$exon->{'sticky_rank'} } ) {
push( @{$oExonInfo}, $exon );
$exonHash{$exon->{'exon_id'}."-".$exon->{'sticky_rank'} } = 1;
}
if( ! exists $geneHash{$exon->{'gene_id'}} ) {
$geneHash{$exon->{'gene_id'}} = 1;
}
}
# filter out stickies
# @{$oExonInfo} = ( grep { ! defined $exonHash{$_->{'exon_id'}."-2"} } @exonInfo );
$oGenes = scalar( keys %geneHash);
# @exonInfo= ();
%geneHash = ();
%exonHash = ();
for my $exon ( @$allnExonInfo ) {
if( ! exists $exonHash{ $exon->{'exon_id'}."-".$exon->{'sticky_rank'} } ) {
push( @{$nExonInfo},$exon );
$exonHash{$exon->{'exon_id'}."-".$exon->{'sticky_rank'} } = 1;
}
if( ! exists $geneHash{$exon->{'gene_id'}} ) {
$geneHash{$exon->{'gene_id'}} = 1;
}
}
$nGenes = scalar( keys %geneHash);
# @{$nExonInfo} = ( grep { ! defined $exonHash{$_->{'exon_id'}."-2"} } @exonInfo );
print STDERR "Finish: ",scalar(localtime()),"\n";
print STDERR "OldExons: ",scalar( @$oExonInfo ),"\n";
print STDERR "NewExons: ",scalar( @$nExonInfo ),"\n";
print STDERR "OldGenes: $oGenes\n";
print STDERR "NewGenes: $nGenes\n";
direct_mapping( $oExonInfo, $nExonInfo );
contig_version_update( $oExonInfo, $nExonInfo );
MapGenes::map_genes( $oExonInfo, $nExonInfo );
MapTranscripts::map_transcripts( $alloExonInfo, $allnExonInfo );
$sdbh->disconnect();
$tdbh->disconnect();
print STDERR "Start: $starttime\n";
print STDERR "End : ",scalar( localtime()),"\n";
print STDERR "Old Exon Count: ", scalar( @$nExonInfo ), "\n";
print STDERR "New Exon Count: ", scalar( @$nExonInfo ), "\n";
print STDERR "Direct Mappings: ", $direct_maps,"\n";
print STDERR "Crossmatches: $crossmatches\n";
print STDERR "CloneGroups: $cloneGroups\n";
print STDERR "SeqMatches: $seqMatches\n";
print STDERR "Mappings on Crossmatch: $cmmaps\n";
exit;
open( D1, ">dump1.txt" );
open( D2, ">dump2.txt" );
my ( @dumpOld, @dumpNew );
print STDERR "Dumping ...\n";
@dumpOld = sort { ( $a->{'clone_id'} cmp $b->{'clone_id'} ) ||
( $a->{'exon_start'} <=> $b->{'exon_start'}) } @$oExonInfo;
@dumpNew = sort { ( $a->{'clone_id'} cmp $b->{'clone_id'} ) ||
( $a->{'exon_start'} <=> $b->{'exon_start'}) } @$nExonInfo;
*STDOUT = *D1;
for my $ex ( @dumpOld ) {
print_exon( $ex );
}
close D1;
*STDOUT = *D2;
for my $ex ( @dumpNew ) {
print_exon( $ex );
}
close D2;
exit;
# extract information (lot of data)
# direct mappings
# contig version update
# similar as direct mapping
# checks if there is an old exon, and a new exon on same clone but
# updated version. Exclude clones with same version.
sub contig_version_update {
my ( $old, $new ) = @_;
$lasttime = time();
my ( $pruneOld, $pruneNew, $sortOld, $sortNew );
my ( $old_iter, $new_iter, $skip_clone );
@{$pruneOld} = grep { ! exists $_->{'mapped'} } @{$old};
@{$pruneNew} = grep { ! exists $_->{'mapped'} } @{$new};
# prune short exons (<10) would be appropriat
print STDERR "Old Exons left to map: ",scalar( @$pruneOld ),"\n";
print STDERR "New Exons left to map: ",scalar( @$pruneNew ),"\n";
@{$sortOld} = sort { $a->{'clone_id'} cmp $b->{'clone_id'} } @$pruneOld;
@{$sortNew} = sort { $a->{'clone_id'} cmp $b->{'clone_id'} } @$pruneNew;
$new_iter = 0;
$old_iter = 0;
while( 1 ) {
my $old_exon = $sortOld->[$old_iter];
my $new_exon = $sortNew->[$new_iter];
my $cmp = ( $old_exon->{'clone_id'} cmp $new_exon->{'clone_id'} );
if( $cmp == 0 ) {
my $clone_id = $old_exon->{'clone_id'};
# skip clones with no updates
if( $old_exon->{'clone_version'} == $new_exon->{'clone_version'} ) {
$skip_clone = 1;
} else {
$skip_clone = 0;
}
my ( $newCloneExons, $oldCloneExons );
while( defined $old_exon &&
$old_exon->{'clone_id'} eq $clone_id ) {
push( @{$oldCloneExons}, $old_exon );
$old_iter++;
$old_exon = $sortOld->[$old_iter];
}
while( defined $new_exon &&
$new_exon->{'clone_id'} eq $clone_id ) {
push( @{$newCloneExons}, $new_exon );
$new_iter++;
$new_exon = $sortNew->[$new_iter];
}
if( !$skip_clone ) {
clone_map( $oldCloneExons, $newCloneExons );
}
} elsif ( $cmp < 0 ) {
$old_iter++;
} else {
$new_iter++;
}
if( $new_iter >= scalar( @$sortNew ) ||
$old_iter >= scalar( @$sortOld )) {
last;
}
if( time() - $lasttime > 3600 ) {
print STDERR ( "Finished $old_iter exons ",scalar( localtime() ),"\n" );
$lasttime = time();
}
}
}
# map two sets of exons in the same clone from old to new
# by using crossmatch module ...
sub clone_map {
my ( $oldCloneExons, $newCloneExons ) = @_;
my ( $oldExHash, $newExHash );
local *HERR, *HOUT;
open ( NULL, ">/dev/null" );
$cloneGroups++;
for my $exon ( @$oldCloneExons ) {
$oldExHash->{$exon->{'exon_id'}} = SQL::exon_sequence( $sdbh, $exon->{'exon_id'} );
# $exon->{'seq'} = $oldExHash->{$exon->{'exon_id'}};
}
for my $exon ( @$newCloneExons ) {
$newExHash->{$exon->{'exon_id'}} = SQL::exon_sequence( $tdbh, $exon->{'exon_id'} );
# $exon->{'seq'} = $newExHash->{$exon->{'exon_id'}};
}
clone_seq_match( $oldCloneExons, $newCloneExons,
$oldExHash, $newExHash );
# no crossmatch mappings
return;
my ( @oldPrune, @newPrune, @scorelist );
@oldPrune = grep { ! exists $_->{'mapped'} } @$oldCloneExons;
@newPrune = grep { ! exists $_->{'mapped'} } @$newCloneExons;
for my $oEx ( @oldPrune ) {
for my $nEx ( @newPrune ) {
my $crossMatcher = new Bio::EnsEMBL::Pipeline::Runnable::CrossMatch
( -seq1 => new Bio::PrimarySeq( -seq => $oldExHash->{$oEx->{'exon_id'}},
-id => $oEx->{'exon_id'} ),
-seq2 => new Bio::PrimarySeq( -seq => $newExHash->{$nEx->{'exon_id'}},
-id => $nEx->{'exon_id'} ),
-score => 200
);
*HERR = *STDERR;
*HOUT = *STDOUT;
*STDOUT = *NULL;
*STDERR = *NULL;
$crossMatcher->run();
*STDOUT = *HOUT;
*STDERR = *HERR;
$crossmatches++;
my $score = 0;
my $matches = 0;
for my $fp ( $crossMatcher->output() ) {
$score += $fp->score();
$matches = 1;
}
if( $matches ) {
push( @scorelist, { score => $score,
oldEx => $oEx,
newEx => $nEx } );
}
}
}
# now map exons which have crossmatches
my @sortedScores;
@sortedScores = sort { $b->{'score'} <=> $a->{'score'} } @scorelist;
for my $scoreRecord ( @sortedScores ) {
if( ! exists $scoreRecord->{'oldEx'}{'mapped'} &&
! exists $scoreRecord->{'newEx'}{'mapped'} ) {
$scoreRecord->{'oldEx'}{'mapped'} = 'crossmatch Clone Seq';
$scoreRecord->{'newEx'}{'mapped'} = 'crossmatch Clone Seq';;
$scoreRecord->{'newEx'}{'exon_stable'} =
$scoreRecord->{'oldEx'}{'exon_stable'};
print ( "Exon mapped:\t", $scoreRecord->{'oldEx'}{'exon_id'},"\t",
$scoreRecord->{'newEx'}{'exon_id'},"\t",
$scoreRecord->{'oldEx'}{'exon_stable'},"\n" );
$cmmaps++;
# print STDERR "---- CROSSMATCH MAP ----",$scoreRecord->{'score'},"\n";
}
}
}
# direct sequence mapping inside a clone
sub clone_seq_match {
my ( $oldCloneExons, $newCloneExons,
$oldSeqHash, $newSeqHash ) = @_;
my ( @sold, @snew );
@sold = sort { $a->{'chr_start'} <=> $b->{'chr_start'} } @$oldCloneExons;
@snew = sort { $a->{'chr_start'} <=> $b->{'chr_start'} } @$newCloneExons;
my %hashOnSeq;
for my $exon ( @$oldCloneExons ) {
my $seq = $oldSeqHash->{$exon->{'exon_id'}};
if( ! exists $hashOnSeq{ $seq } ) {
$hashOnSeq{$seq} = [[ $exon ],[]];
} else {
push( @{$hashOnSeq{ $seq }->[0]}, $exon );
}
}
for my $exon ( @$newCloneExons ) {
my $seq = $newSeqHash->{$exon->{'exon_id'}};
if( ! exists $hashOnSeq{ $seq } ) {
next;
} else {
push( @{$hashOnSeq{ $seq }->[1]}, $exon );
}
}
for my $seq ( keys %hashOnSeq ) {
my $seqEqExons = $hashOnSeq{$seq};
my $seqO = $seqEqExons->[0];
my $seqN = $seqEqExons->[1];
if(( ! @$seqO ) || ( ! @$seqN )) {
next;
}
if( scalar( @$seqO ) == scalar( @$seqN )) {
if( scalar( @$seqO ) > 1 ) {
print STDERR ( "Matching exon sequences.\n" );
}
for( my $i=0; $i<=$#$seqO; $i++ ) {
my $oEx = $seqO->[$i];
my $nEx = $seqN->[$i];
if( ! exists $oEx->{'mapped'} &&
! exists $nEx->{'mapped'} ) {
$oEx->{'mapped'} = 'same Clone Seq';
$nEx->{'mapped'} = 'same Clone Seq';
$nEx->{'exon_stable'} = $oEx->{'exon_stable'};
print ( "Exon mapped:\t", $oEx->{'exon_id'},"\t",
$nEx->{'exon_id'},"\t", $oEx->{'exon_stable'},"\n" );
$seqMatches++;
}
}
} else {
print STDERR ( "Unequal number of matching exon sequences.\n" );
}
}
}
# takes two exon lists and marks old exons with exact new exons matching
# marks new exons as matched as well
# does the direct approach. Exons are equal, when same position on unchanged contig
sub direct_mapping {
my ( $old, $new ) = @_;
my ( $sold, $snew );
print STDERR "Direct Mapping started: ",scalar( localtime() ),"\n";
@{$sold} = sort { exon_direct_compare( $a, $b ) } @$old;
@{$snew} = sort { exon_direct_compare( $a, $b ) } @$new;
print STDERR "Sorted: ",scalar(localtime()),"\n";
my ( $new_iter, $old_iter );
my ( $old_exon, $new_exon );
$new_iter = 0;
$old_iter = 0;
while( 1 ) {
$old_exon = $sold->[$old_iter];
$new_exon = $snew->[$new_iter];
my $exon_compare_result;
if(( $old_exon->{'clone_id'} cmp $new_exon->{'clone_id'}) == 0 &&
$old_exon->{'clone_version'} == $new_exon->{'clone_version'} &&
$old_exon->{'exon_strand'} == $new_exon->{'exon_strand'} &&
$old_exon->{'contig_offset'} == $new_exon->{'contig_offset'} ) {
$exon_compare_result = 0;
} else {
$exon_compare_result = exon_direct_compare( $old_exon, $new_exon );
}
if( $exon_compare_result == 0 ) {
# direct map possible
# find overlap between the two exons. Need >50% of new exon
# covered
my ( $overlap ) =
sort { $a <=> $b } ( $old_exon->{'exon_end'}-$old_exon->{'exon_start'}+1,
$new_exon->{'exon_end'}-$new_exon->{'exon_start'}+1,
$old_exon->{'exon_end'}-$new_exon->{'exon_start'}+1,
$new_exon->{'exon_end'}-$old_exon->{'exon_start'}+1 );
# print "---- MAPPING ----\n";
# print_exon( $old_exon );
# print_exon( $new_exon );
# print "OVERLAP: $overlap\n";
if(( $overlap /
( $new_exon->{'exon_end'} - $new_exon->{'exon_start'} + 1 )) > 0.5 ) {
# print "MAPPED\n";
$old_exon->{'mapped'} = 'direct';
$new_exon->{'mapped'} = 'direct';
$new_exon->{'exon_stable'} = $old_exon->{'exon_stable'};
print ( "Exon mapped:\t", $old_exon->{'exon_id'},"\t",
$new_exon->{'exon_id'},"\t", $old_exon->{'exon_stable'},"\n" );
$old_iter++;
$new_iter++;
if( $old_iter >= scalar( @$old ) ||
$new_iter >= scalar( @$new ) ) {
last;
}
$direct_maps++;
next;
}
}
# print "NOT MAPPED\n";
$exon_compare_result = exon_direct_compare( $old_exon, $new_exon );
if( $exon_compare_result == -1 ) {
$old_iter++;
if( $old_iter > scalar( @$sold ) ) {
last;
}
next;
} elsif( $exon_compare_result == 1 ) {
$new_iter++;
if( $new_iter > scalar( @$snew ) ) {
last;
}
next;
} else {
$old_iter++;
$new_iter++;
if( $old_iter > scalar( @$old ) ||
$new_iter > scalar( @$new ) ) {
last;
}
}
}
print STDERR "Direct Mapping finished: ",scalar( localtime() ),"\n";
return;
}
sub exon_direct_compare {
my ( $a, $b ) = @_;
return $a->{'clone_id'} cmp $b->{'clone_id'} ||
$a->{'clone_version'} <=> $b->{'clone_version'} ||
$a->{'contig_offset'} <=> $b->{'contig_offset'} ||
$a->{'exon_start'} <=> $b->{'exon_start'} ||
$a->{'exon_strand'} <=> $b->{'exon_strand'} ;
}
sub print_exon {
my $exon = shift;
for my $key ( sort keys %{$exon} ) {
print $key,"\t",$exon->{$key},"\n";
}
print "\n";
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment