From 46edfe0b26044ce45e99787f0c8546d4989997f6 Mon Sep 17 00:00:00 2001 From: Graham McVicker <mcvicker@sanger.ac.uk> Date: Fri, 9 Jan 2004 17:16:42 +0000 Subject: [PATCH] removed old perl ID mapping (v. 2) scripts --- misc-scripts/idmap2/MapGenes.pm | 208 ----------- misc-scripts/idmap2/MapTranscripts.pm | 247 ------------ misc-scripts/idmap2/README | 81 ---- misc-scripts/idmap2/SQL.pm | 193 ---------- misc-scripts/idmap2/apply_mappings.pl | 149 -------- misc-scripts/idmap2/double_exon_fix.pl | 157 -------- misc-scripts/idmap2/mapping.pl | 497 ------------------------- 7 files changed, 1532 deletions(-) delete mode 100644 misc-scripts/idmap2/MapGenes.pm delete mode 100644 misc-scripts/idmap2/MapTranscripts.pm delete mode 100644 misc-scripts/idmap2/README delete mode 100644 misc-scripts/idmap2/SQL.pm delete mode 100644 misc-scripts/idmap2/apply_mappings.pl delete mode 100644 misc-scripts/idmap2/double_exon_fix.pl delete mode 100644 misc-scripts/idmap2/mapping.pl diff --git a/misc-scripts/idmap2/MapGenes.pm b/misc-scripts/idmap2/MapGenes.pm deleted file mode 100644 index 10cf0b08b4..0000000000 --- a/misc-scripts/idmap2/MapGenes.pm +++ /dev/null @@ -1,208 +0,0 @@ -# here we expect a list of exons from old and new database -# each exon contains exon_stable to mark it as mapped - -# each exon contains the internal gene_id of the gene it is -# part of. This module tries to map the gene_stable ids over to -# the new exons. - - -package MapGenes; -use strict; - - -# starting with one exon, build_exon_group tries to build an -# 'envelope' around it. That is a set of genes and exons where neither -# of them misses parts. You call build_exon_group as often as needed -# until it doesnt produce new exons any more - -my ( %newGeneHash, %oldGeneHash, - %oldExonHash, %newExonHash, - @oldPruneExon, @newPruneExon ); -my $mappedGene; - -# go through old Exons -# skip if gene is mapped -# build envelope if not -sub map_genes { - my ( $oldExonInfo, $newExonInfo ) = @_; - - my %oldGenesMapped; - print STDERR ( "Start gene mapping ",scalar( localtime() ),"\n"); - init_lookup_tables( $oldExonInfo, $newExonInfo ); - - for my $oExon ( @oldPruneExon ) { - if( ! exists $oExon->{'gene_mapped'} ) { - my @map_exons = build_envelope( $oExon, \@oldPruneExon, \@newPruneExon ); - map_envelope( \@map_exons ); - } - } - print STDERR ( "End gene mapping ",scalar( localtime() ),"\n"); - print STDERR ( "Mapped Genes ",$mappedGene,"\n" ); -} - - - -sub map_envelope { - my $exon_stables = shift; - - my %gene_map_hash; - my $common_exons = 0; - - for my $exon_stable_id ( @$exon_stables ) { - if( exists $oldExonHash{ $exon_stable_id } && - exists $newExonHash{ $exon_stable_id } ) { - my ( $oEx, $nEx, $hashkey ); - $oEx = $oldExonHash{ $exon_stable_id }; - $nEx = $newExonHash{ $exon_stable_id }; - $common_exons++; - - $hashkey = $oEx->{'gene_id'}."-".$nEx->{'gene_id'}; - if( exists $gene_map_hash{$hashkey} ) { - $gene_map_hash{$hashkey}->[2] += - $nEx->{'exon_end'}-$nEx->{'exon_start'}+1; - } else { - $gene_map_hash{$hashkey} = [ $oEx->{'gene_id'}, $nEx->{'gene_id'}, - $nEx->{'exon_end'}-$nEx->{'exon_start'}+1, - $oEx->{'gene_stable'}]; - - } - } - } - - my @sortedMappings = sort { $b->[2] <=> $a->[2] } values %gene_map_hash; - my ( %oldGenesMapped, %newGenesMapped ); - - for my $mapRecord ( @sortedMappings ) { - if( ! exists $oldGenesMapped{$mapRecord->[0]} && - ! exists $newGenesMapped{$mapRecord->[1]} ) { - $oldGenesMapped{$mapRecord->[0]} = $mapRecord->[1]; - $newGenesMapped{$mapRecord->[1]} = $mapRecord->[0]; - print ( "Gene mapped:\t",$mapRecord->[0], "\t",$mapRecord->[1], - "\t",$mapRecord->[3],".\n" ); - $mappedGene++; - } - } - - for my $exon_stable_id ( @$exon_stables ) { - if( exists $oldExonHash{ $exon_stable_id }) { - $oldExonHash{ $exon_stable_id }{'gene_mapped'} = 1; - } - if( exists $newExonHash{ $exon_stable_id }) { - $newExonHash{ $exon_stable_id }{'gene_mapped'} = 1; - } - } - -} - - -sub init_lookup_tables { - my ( $oldExonInfo, $newExonInfo ) = @_; - - # init the lookup tables - for my $exon ( @$newExonInfo ) { - if( exists $exon->{'exon_stable'} ) { - push( @{$newGeneHash{$exon->{'gene_id'}}}, $exon ); - $newExonHash{ $exon->{'exon_stable'} } = $exon; - push( @newPruneExon, $exon ); - } - } - - for my $exon ( @$oldExonInfo ) { - # should exist for all of them ... - if( exists $newExonHash{ $exon->{'exon_stable'}} ) { - push( @{$oldGeneHash{$exon->{'gene_id'}}}, $exon ); - $oldExonHash{ $exon->{'exon_stable'} } = $exon; - push( @oldPruneExon, $exon ); - my $newExon = $newExonHash{ $exon->{'exon_stable'}}; - # print "vvvv Mapping vvvv\n"; - # ::print_exon( $exon ); - # ::print_exon( $newExon ); - # print "^^^^ Mapping ^^^^\n"; - } - } - - -# print "Lookup initialized: ",scalar( localtime() ),"\n"; -# print ( "old exons:",scalar( @oldPruneExon ),"\n" ); -# print ( "new exons:",scalar( @newPruneExon ),"\n" ); -} - - -# given one old exon, build envelope around it -# return set of stable ids which cover the -# given exon. - -# That means: All old and new genes of these exons have all -# their stable_id exons in this set as well. -sub build_envelope { - my ( $exon, $oExonInfo, $nExonInfo ) = @_; - - my $oGeneGroup = {}; - my $nGeneGroup = {}; - my $exonSet = {}; - my $newExons = []; - push( @$newExons, $exon->{'exon_stable'} ); - - $newExons = build_exon_group( $oGeneGroup, $exonSet, $newExons, - \%oldGeneHash, \%oldExonHash ); - - while( scalar( @$newExons )) { - $newExons = build_exon_group( $nGeneGroup, $exonSet, $newExons, - \%newGeneHash, \%newExonHash ); - if( scalar( @$newExons )) { - $newExons = build_exon_group( $oGeneGroup, $exonSet, $newExons, - \%oldGeneHash, \%oldExonHash ); - } - } - - # print ( "Exon envelope: ", join( " ", keys %$exonSet),"\n" ); - # print ( "Old Genes:", join( " ", keys %$oGeneGroup ) ,"\n" ); - # print ( "New Genes:", join( " ", keys %$nGeneGroup ) ,"\n\n" ); - return keys %$exonSet; -} - - -# expand newExonSet that it covers all exons of its genes -# wiuth respect to one exon/gene set. -sub build_exon_group { - my ( $geneGroup, $exonSet, $newExonSet, - $geneLookup, $exonLookup ) = @_; - # print STDERR ( "Build group ", join( " ", @$newExonSet ),"\n" ); - my (%newGenes, @newExons, $exon); - - # get Genes from new Exons - # get Exons from this genes - - for my $exon_stable ( @$newExonSet ) { - $exon = $exonLookup->{$exon_stable}; - my $geneId; - - $geneId = $exon->{'gene_id'}; - if( ! defined $geneId ) { - print STDERR "undefined "; - &::print_exon( $exon ); - } - if( !exists $geneGroup->{$geneId} ) { - $newGenes{$geneId} = 1; - $geneGroup->{$geneId} = 1; - } - } - - - for my $geneId ( keys %newGenes ) { - my $exons = $geneLookup->{$geneId}; - for my $exon ( @$exons ) { - if( ! exists( $exonSet->{$exon->{'exon_stable'}} ) ) { - $exonSet->{$exon->{'exon_stable'}} = 1; - push( @newExons, $exon->{'exon_stable'} ); -# print STDERR "Pushed Exon to Envelope\n"; - } - } - } - # print STDERR ( "Exon envelope: ", join( " ", @newExons),"\n\n" ); - - return \@newExons; -} - - -1; diff --git a/misc-scripts/idmap2/MapTranscripts.pm b/misc-scripts/idmap2/MapTranscripts.pm deleted file mode 100644 index 5d33d70709..0000000000 --- a/misc-scripts/idmap2/MapTranscripts.pm +++ /dev/null @@ -1,247 +0,0 @@ -# here we expect a list of exons from old and new database -# each exon contains exon_stable to mark it as mapped - -# each exon contains the internal transcript_id of the transcript it is -# part of. This module tries to map the transcript_stable ids over to -# the new exons. - - -package MapTranscripts; -use strict; - - -# starting with one exon, build_exon_group tries to build an -# 'envelope' around it. That is a set of transcripts and exons where neither -# of them misses parts. You call build_exon_group as often as needed -# until it doesnt produce new exons any more - -my ( %newTranscriptHash, %oldTranscriptHash, - %oldExonHash, %newExonHash ); - -my $mappedTranscripts; - - -# go through old Exons -# skip if Transcript is mapped -# build envelope if not -sub map_transcripts { - my ( $oldExonInfo, $newExonInfo ) = @_; - - my %oldGenesMapped; - print STDERR ( "Start transcript mapping ",scalar( localtime() ),"\n"); - init_lookup_tables( $oldExonInfo, $newExonInfo ); - - for my $oExon ( @$oldExonInfo ) { - if( ! exists $oExon->{'transcript_mapped'} ) { - my @map_exons = build_envelope( $oExon, $oldExonInfo, $newExonInfo ); - map_envelope( \@map_exons ); - } - } - print STDERR ( "End transcript mapping ",scalar( localtime() ),"\n"); - print STDERR ( "Mapped Transcripts ",$mappedTranscripts,"\n" ); -} - - - -sub map_envelope { - my $exon_stables = shift; - - my %transcript_map_hash; - my $common_exons = 0; - my ( @oldExList, @newExList ); - - for my $exon_stable_id ( @$exon_stables ) { - if( exists $oldExonHash{ $exon_stable_id } && - exists $newExonHash{ $exon_stable_id } ) { - my ( $oEx, $nEx, $hashkey ); - @oldExList = @{$oldExonHash{ $exon_stable_id }}; - @newExList = @{$newExonHash{ $exon_stable_id }}; - for my $oEx ( @oldExList ) { - for my $nEx ( @newExList ) { - - $hashkey = $oEx->{'transcript_id'}."-". - $nEx->{'transcript_id'}; - if( exists $transcript_map_hash{$hashkey} ) { - $transcript_map_hash{$hashkey}->[2] += - $nEx->{'exon_end'}-$nEx->{'exon_start'}+1; - } else { - $transcript_map_hash{$hashkey} = - [ $oEx->{'transcript_id'}, $nEx->{'transcript_id'}, - $nEx->{'exon_end'}-$nEx->{'exon_start'}+1, - $oEx->{'transcript_stable'}]; - - } - } - } - } - } - - my @sortedMappings = sort { $b->[2] <=> $a->[2] } values %transcript_map_hash; - my ( %oldTranscriptsMapped, %newTranscriptsMapped ); - - for my $mapRecord ( @sortedMappings ) { - if( ! exists $oldTranscriptsMapped{$mapRecord->[0]} && - ! exists $newTranscriptsMapped{$mapRecord->[1]} ) { - $oldTranscriptsMapped{$mapRecord->[0]} = $mapRecord->[1]; - $newTranscriptsMapped{$mapRecord->[1]} = $mapRecord->[0]; - print ( "Transcript mapped:\t",$mapRecord->[0], "\t",$mapRecord->[1], - "\t",$mapRecord->[3],"\n" ); - $mappedTranscripts++; - } - } - - for my $exon_stable_id ( @$exon_stables ) { - if( exists $oldExonHash{ $exon_stable_id }) { - for my $exon ( @{$oldExonHash{ $exon_stable_id }} ) { - $exon->{'transcript_mapped'} = 1; - } - } - if( exists $newExonHash{ $exon_stable_id }) { - for my $exon ( @{$newExonHash{ $exon_stable_id }} ) { - $exon->{'transcript_mapped'} = 1; - } - } - } - -} - - -sub init_lookup_tables { - my ( $oldExonInfo, $newExonInfo ) = @_; - - # problem: not all exons which are mapped ar emarked as such - # that happened if they are in more than one transcript - my %exHash= (); - - - for my $exon ( @$newExonInfo ) { - if( exists $exon->{'mapped'} ) { - $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." mapped"} = - $exon->{'mapped'}; - $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." stable"} = - $exon->{'exon_stable'}; - } - } - - for my $exon ( @$newExonInfo ) { - if( ! exists $exon->{'mapped'} && - ( exists $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." mapped"} )) { - $exon->{'mapped'}= $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." mapped"}; - $exon->{'exon_stable'}= $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}." stable"}; - } - push( @{$newTranscriptHash{$exon->{'transcript_id'}}}, $exon ); - push( @{$newExonHash{ $exon->{'exon_stable'} }}, $exon ); - } - - %exHash = (); - - for my $exon ( @$oldExonInfo ) { - if( exists $exon->{'mapped'} ) { - $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}} = - $exon->{'mapped'}; - } - } - - for my $exon ( @$oldExonInfo ) { - if( ! exists $exon->{'mapped'} && - ( exists $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}} )) { - $exon->{'mapped'}= $exHash{$exon->{'exon_id'}." ".$exon->{'sticky_rank'}}; - } - if( exists $exon->{'mapped'} ) { - push( @{$oldTranscriptHash{$exon->{'transcript_id'}}}, $exon ); - push( @{$oldExonHash{ $exon->{'exon_stable'} }}, $exon ); - } - } - # remove all exons from $oldExonInfo which dont map - my $iter = 0; - while( $iter < scalar @$oldExonInfo ) { - my $exon = $oldExonInfo->[$iter]; - if( ! exists $exon->{'mapped'} ) { - splice( @$oldExonInfo, $iter ,1 ); - } else { - $iter++; - } - } -} - - -# given one old exon, build envelope around it -# return set of stable ids which cover the -# given exon. - -# That means: All old and new genes of these exons have all -# their stable_id exons in this set as well. -sub build_envelope { - my ( $exon, $oExonInfo, $nExonInfo ) = @_; - - my $oTranscriptGroup = {}; - my $nTranscriptGroup = {}; - my $exonSet = {}; - my $newExons = []; - - push( @$newExons, $exon->{'exon_stable'} ); - - $newExons = build_exon_group( $oTranscriptGroup, $exonSet, $newExons, - \%oldTranscriptHash, \%oldExonHash ); - - while( scalar( @$newExons )) { - $newExons = build_exon_group( $nTranscriptGroup, $exonSet, $newExons, - \%newTranscriptHash, \%newExonHash ); - if( scalar( @$newExons )) { - $newExons = build_exon_group( $oTranscriptGroup, $exonSet, $newExons, - \%oldTranscriptHash, \%oldExonHash ); - } - } - -# print ( "Exon envelope: ", join( " ", keys %$exonSet),"\n" ); -# print ( "Old Transcripts:", join( " ", keys %$oTranscriptGroup ) ,"\n" ); -# print ( "New Transcripts:", join( " ", keys %$nTranscriptGroup ) ,"\n\n" ); - return keys %$exonSet; -} - - -# expand newExonSet that it covers all exons of its transcripts -# with respect to one exon/transcript set. -sub build_exon_group { - my ( $transcriptGroup, $exonSet, $newExonSet, - $transcriptLookup, $exonLookup ) = @_; - # print STDERR ( "Build group ", join( " ", @$newExonSet ),"\n" ); - my (%newTranscripts, @newExons, $exon); - - # get Transcrips from new Exons - # get Exons from this Transcripts - - for my $exon_stable ( @$newExonSet ) { - for my $exon ( @{$exonLookup->{$exon_stable}} ) { - my $transcriptId; - - $transcriptId = $exon->{'transcript_id'}; - if( ! defined $transcriptId ) { - print STDERR "undefined "; - &::print_exon( $exon ); - } - if( !exists $transcriptGroup->{$transcriptId} ) { - $newTranscripts{$transcriptId} = 1; - $transcriptGroup->{$transcriptId} = 1; - } - } - } - - - for my $transcriptId ( keys %newTranscripts ) { - my $exons = $transcriptLookup->{$transcriptId}; - for my $exon ( @$exons ) { - if( ! exists( $exonSet->{$exon->{'exon_stable'}} ) ) { - $exonSet->{$exon->{'exon_stable'}} = 1; - push( @newExons, $exon->{'exon_stable'} ); -# print STDERR "Pushed Exon to Envelope\n"; - } - } - } - # print STDERR ( "Exon envelope: ", join( " ", @newExons),"\n\n" ); - - return \@newExons; -} - - -1; diff --git a/misc-scripts/idmap2/README b/misc-scripts/idmap2/README deleted file mode 100644 index 6dd7b1eddf..0000000000 --- a/misc-scripts/idmap2/README +++ /dev/null @@ -1,81 +0,0 @@ -This document shell describe how id mapping in EnsEMBL is done. - - -All idmapping will be based on mapping exons from an old release of our -database to the next. It is assumed that the new and the old release -are available in the schema of the new release and the code of the new -release is used throughout. ( There will not be a DBOLD directory -involved with the idmapping code. ) - -1. Exon mapping - -The basic proceeding is, to try to map all old exons over to the new -golden path. - - -1.1 Easy exon mapping - -The first exons ids mapped to the new database are the ones which keep -their position on the same contig. (No contig version update). - -Criteria: any overlap and sensitive handling of splits and merges. -(easier: 50% min overlap gets it ) - -1.2 New contig version - -Some Exons might sit on contigs which are still on the golden path in -the new release, but in a different version. Does any of the new Exons -on this contigs look like the old Exon (exact string match or crossmatch) - -Criteria: Crossmatch and 95% sequence identity. -Problem: more than one exon on the same clone is almost identical. -In this case use best match on clone.. - - -1.25 Redundant clone information - -Map exons on redundant clones to the new clone covering the same -area. - -1.3 Sister Exons - -Do we have unmapped Exons in old Genes which have sister Exons which -are mapped? Search for Exons in similar position on the new golden -path. - -Remark on sequence comparison: Use a window around exon sequence to map -intronic sequence as well. Window 50bp around the exons. - - -1.6 So far unmapped. - -Can we find a >95% sequence match somewhere in the genome? -If not consider this exon not to be mappable. - - - -2. Transcript, Gene mapping - -This mapping happens on the basis of the mapped exons. We screen -through the old gene, transcript objects and look if the majority of -their exons were mapped to a corresponding new object. Then the id is -transferred to that object. - -In case of a merger, split or general mix of old objects, we transfer -ids for the biggest number of exons translated from one old to one new -first, then for the second biggest and so on. - - - - - - - - - - - - - - - diff --git a/misc-scripts/idmap2/SQL.pm b/misc-scripts/idmap2/SQL.pm deleted file mode 100644 index 6488ee936f..0000000000 --- a/misc-scripts/idmap2/SQL.pm +++ /dev/null @@ -1,193 +0,0 @@ -# This module encapsulates the SQL needed for the idmapping -# Author: Arne Stabenau 3.10.2001 - -# ( why not use EnsEMBL here? Currently to slow?? This piece of software is -# going to be replaced very soon by a java version ) - -use DBI; - -package SQL; - - - -# takes a database handle and produces all old exon information needed - -sub old_orig_exon_information { - my $dbh = shift; - my %res; - - my $sth = $dbh->prepare( " - SELECT exon_stable, transcript_stable, - gene_stable, exon_id, gene_id, transcript_id, - clone_id, clone_version, sticky_rank, - contig_offset, contig_length, - exon_start, exon_end, exon_strand, - chr_name, chr_start, chr_end, - raw_start, raw_end, raw_ori - FROM exon_temp - ORDER by clone_id - " ); -# LIMIT 5000" ); - $sth->execute(); - - my $arrRef = _store_in_arrayref( $sth ); - for my $exon ( @$arrRef ) { - calc_chr_coord( $exon ); - } - - return $arrRef;; -} - -sub old_target_exon_information { - my $dbh = shift; - - my $sth = $dbh->prepare( " - SELECT exon_id, transcript_id, gene_id, - clone_id, clone_version, sticky_rank, - contig_offset, contig_length, - exon_start, exon_end, exon_strand, - chr_name, chr_start, chr_end, - raw_start, raw_end, raw_ori - FROM exon_temp - ORDER by clone_id - " ); -# LIMIT 5000"); - $sth->execute(); - my $arrRef = _store_in_arrayref( $sth ); - for my $exon ( @$arrRef ) { - calc_chr_coord( $exon ); - } - return $arrRef; -} - - - -sub orig_exon_information { - my $dbh = shift; - my %res; -# $dbh->do( "drop table exon_temp" ); - -# CREATE TABLE exon_temp - my $sth = $dbh->prepare( " - SELECT esi.stable_id as exon_stable, tsi.stable_id as transcript_stable, - gsi.stable_id as gene_stable, e.exon_id as exon_id, t.gene_id as gene_id, - t.transcript_id as transcript_id, - cl.embl_id as clone_id, cl.embl_version as clone_version, - c.offset as contig_offset, c.length as contig_length, - e.seq_start as exon_start, e.seq_end as exon_end, e.strand as exon_strand, - e.sticky_rank as sticky_rank, - sgp.chr_name as chr_name, sgp.chr_start as chr_start, sgp.chr_end as chr_end, - sgp.raw_start as raw_start, sgp.raw_end as raw_end, sgp.raw_ori as raw_ori - FROM exon e, exon_transcript et, exon_stable_id esi, contig c, clone cl, - static_golden_path sgp, transcript t, gene_stable_id gsi, - transcript_stable_id tsi - WHERE e.exon_id = et.exon_id - AND esi.exon_id = e.exon_id - AND et.transcript_id = t.transcript_id - AND tsi.transcript_id = et.transcript_id - AND t.gene_id = gsi.gene_id - AND c.internal_id = e.contig_id - AND cl.internal_id = c.clone - AND sgp.raw_id = c.internal_id - AND e.sticky_rank = 1 - ORDER by clone_id, clone_version,contig_offset, exon_strand -" ); - $sth->execute(); -# ORDER by gene_stable, transcript_stable, exon_stable, e.sticky_rank - - return _store_in_arrayref( $sth ); -} - -sub target_exon_information { - my $dbh = shift; -# $dbh->do( "drop table exon_temp" ); -# CREATE TABLE exon_temp - my $sth = $dbh->prepare( " - SELECT e.exon_id, t.transcript_id, t.gene_id, - cl.embl_id as clone_id, cl.embl_version as clone_version, - c.offset as contig_offset, c.length as contig_length, - e.seq_start as exon_start, e.seq_end as exon_end, e.strand as exon_strand, - e.sticky_rank as sticky_rank, - sgp.chr_name as chr_name, sgp.chr_start as chr_start, sgp.chr_end as chr_end, - sgp.raw_start as raw_start, sgp.raw_end as raw_end, sgp.raw_ori as raw_ori - FROM exon e, exon_transcript et, contig c, clone cl, - static_golden_path sgp, transcript t - WHERE e.exon_id = et.exon_id - AND et.transcript_id = t.transcript_id - AND c.internal_id = e.contig_id - AND cl.internal_id = c.clone - AND sgp.raw_id = c.internal_id - AND e.sticky_rank = 1 - ORDER by clone_id, clone_version,contig_offset, exon_strand -" ); - - $sth->execute(); - - return _store_in_arrayref( $sth ); -} - - -sub _store_in_arrayref { - my $sth = shift; - my $res = []; - - my $count = 0; - - while( my $hr = $sth->fetchrow_hashref() ) { - my %tempHash = %$hr; - push( @$res, \%tempHash ); - } - return $res; -} - -sub exon_sequence { - my $dbh = shift; - my $exon_id = shift; - my $exonSeq; - - my $sth = $dbh->prepare( " - SELECT e.exon_id, substring( d.sequence, e.seq_start, e.seq_end-e.seq_start+1 ), - e.seq_start as start, e.seq_end as end, - e.strand as strand, e.sticky_rank as rank - FROM exon e, contig c, dna d - WHERE e.contig_id = c.internal_id - AND c.dna = d.id - AND e.exon_id = ? - ORDER by rank - " ); - $sth->execute( $exon_id ); - - while( my $arrRef = $sth->fetchrow_arrayref ) { - my $part = $arrRef->[1]; - if( $arrRef->[4] != 1 ) { - $part = reverse( $part ); - $part =~ tr/ACGTacgt/TGCATGCA/; - } - $exonSeq .= $part; - } - - return $exonSeq; -} - - -sub calc_chr_coord { - my $ex = shift; - - my ( $chr_start, $chr_end, $chr_strand ); - - $chr_strand = $ex->{'exon_strand'} * $ex->{'raw_ori'}; - if( $ex->{'raw_ori'} == 1 ) { - $chr_start = $ex->{'exon_start'} + $ex->{'chr_start'} - $ex->{'raw_start'}; - $chr_end = $ex->{'exon_end'} + $ex->{'chr_start'} - $ex->{'raw_start'}; - } else { - $chr_start = $ex->{'chr_start'} + $ex->{'raw_end'} - $ex->{'exon_end'}; - $chr_end = $ex->{'chr_start'} + $ex->{'raw_end'} - $ex->{'exon_start'}; - } - - $ex->{'chr_start'} = $chr_start; - $ex->{'chr_end' } = $chr_end; - $ex->{'chr_strand' } = $chr_strand; -} - - -1; diff --git a/misc-scripts/idmap2/apply_mappings.pl b/misc-scripts/idmap2/apply_mappings.pl deleted file mode 100644 index 39d798cfc9..0000000000 --- a/misc-scripts/idmap2/apply_mappings.pl +++ /dev/null @@ -1,149 +0,0 @@ -# read the mapping out put file -# have the first free id in the variables - -# go through new exons. -# mapped, make a exon_stable_id row -# not mapped, generate a new stable_id - -# go through genes with same technique - -# go through transcript -# if mapped, retrieve the translation_stable_id - -use strict; -use DBI; - -my $today = "2002-02-20"; - -my $sdbh = DBI->connect( "DBI:mysql:host=ecs1d;database=homo_sapiens_core_130", "ensro", ""); -my $tdbh = DBI->connect( "DBI:mysql:host=ecs1e;database=ens_NCBI_28", "ensro", "" ); - -my $starttime = scalar( localtime() ); - -my $sth; -my $resultfile = "map_130_28.txt"; -my %stable_id; - -for my $table ( "gene_stable_id", "exon_stable_id", - "transcript_stable_id", "translation_stable_id" ) { - $sth = $sdbh->prepare( "select max( stable_id ) from $table" ); - $sth->execute(); - ( $stable_id{$table} ) = $sth->fetchrow_array(); -} - -# print join( " ", %stable_id ),"\n"; -# exit; - - -exon_stables(); -gene_stables(); -trans_stables(); - - -sub exon_stables { - my %exonTranslate; - - open( RES, $resultfile ); - while( <RES> ) { - /Exon mapped:/ || next; - my @cols; - chomp; - @cols = split( "\t",$_ ); - $exonTranslate{$cols[2]} = $cols[3]; - } - - close( RES ); - - $sth = $tdbh->prepare( "select distinct(exon_id) from exon" ); - $sth->execute; - open( EXON, ">exon_stable_id.txt" ) or die; - - while( my ( $exonId ) = $sth->fetchrow_array() ) { - if( ! exists $exonTranslate{$exonId} ) { - $stable_id{'exon_stable_id'}++; - print EXON ( join( "\t",($exonId, $stable_id{'exon_stable_id'},"1\t$today\t$today")),"\n" ); - } else { - print EXON ( join( "\t",($exonId, $exonTranslate{$exonId},"1\t$today\t$today")),"\n" ); - } - } - close( EXON ); -} - -sub gene_stables { - my %geneTranslate; - - open( RES, $resultfile ); - while( <RES> ) { - /Gene mapped:/ || next; - my @cols; - chomp; - @cols = split( "\t",$_ ); - $geneTranslate{$cols[2]} = $cols[3]; - } - - close( RES ); - - $sth = $tdbh->prepare( "select gene_id from gene" ); - $sth->execute; - - open( GEN, ">gene_stable_id.txt" ) or die; - - while( my ( $geneId ) = $sth->fetchrow_array() ) { - if( ! exists $geneTranslate{$geneId} ) { - $stable_id{'gene_stable_id'}++; - print GEN ( join( "\t",($geneId, $stable_id{'gene_stable_id'},"1\t$today\t$today")),"\n"); - } else { - print GEN ( join( "\t",($geneId, $geneTranslate{$geneId},"1\t$today\t$today")),"\n" ); - } - } - close ( GEN ); -} - -sub trans_stables { - my %transcriptHash; - my %transcriptTranslation; - - open( RES, $resultfile ); - while( <RES> ) { - /Transcript mapped:/ || next; - my @cols; - chomp; - @cols = split( "\t",$_ ); - $transcriptHash{$cols[2]} = $cols[3]; - } - - close( RES ); - - $sth = $sdbh->prepare( "select tsi.stable_id, tli.stable_id - from transcript_stable_id tsi, - transcript t, translation_stable_id tli - where t.transcript_id = tsi.transcript_id - and t.translation_id = tli.translation_id" ); - $sth->execute; - - while( my $arrref = $sth->fetchrow_arrayref() ) { - $transcriptTranslation{$arrref->[0]} = $arrref->[1]; - } - - $sth = $tdbh->prepare( "select transcript_id, translation_id from transcript" ); - $sth->execute(); - - open( TSC, ">transcript_stable_id.txt" ) or die; - open( TSL, ">translation_stable_id.txt" ) or die; - - while( my ( $tsc, $tsl ) = $sth->fetchrow_array() ) { - if( ! exists $transcriptHash{$tsc} ) { - $stable_id{'transcript_stable_id'}++; - $stable_id{'translation_stable_id'}++; - - print TSC ( join( "\t",($tsc, $stable_id{'transcript_stable_id'},"1")),"\n"); - print TSL ( join( "\t",($tsl, $stable_id{'translation_stable_id'},"1")),"\n"); - } else { - my $stable_id = $transcriptHash{$tsc}; - print TSC ( join( "\t",($tsc, $stable_id,"1")),"\n"); - print TSL ( join( "\t",($tsl, $transcriptTranslation{$stable_id},"1")),"\n"); - } - } - close ( TCS ); - close ( TSL ); -} diff --git a/misc-scripts/idmap2/double_exon_fix.pl b/misc-scripts/idmap2/double_exon_fix.pl deleted file mode 100644 index 732a523cf2..0000000000 --- a/misc-scripts/idmap2/double_exon_fix.pl +++ /dev/null @@ -1,157 +0,0 @@ -# this script removes exon duplicates from the ensembl database -# An Exon duplicate is an exon whose start end strand contig phase information -# is the same to another one. - -# step one: gather all exon information -# step two build mappings from internal ids to internal ids -# step three change exon_id in exon_transcript -# remove superfluous exon records -# remove exon_stable_id records and record which stable_ids were -# included from which others. - -use DBI; -use strict; - -my ( %exonHash, %stickies ); -my ( $rowsRead, $nrExons ); -my %exonTranslate; - -my $dbh = DBI->connect( "DBI:mysql:host=ecs1d;database=ens_UCSC_0801", "ensadmin", "ensembl"); - -my $sth = $dbh->prepare( "select exon_id, contig_id, seq_start, seq_end, strand, phase, end_phase, sticky_rank from exon order by exon_id, sticky_rank desc" ); -$sth->execute(); - - - - - -while( my $hr = $sth->fetchrow_hashref ) { - $rowsRead++; - my $hashkey = join( "\t", ( $hr->{'contig_id'}, $hr->{'seq_start'}, $hr->{'seq_end'}, - $hr->{'strand'}, $hr->{'phase'}, $hr->{'end_phase'} )); - push( @{$exonHash{$hashkey}},[ $hr->{'exon_id'}, $hr->{'sticky_rank'} ] ); - if( $hr->{'sticky_rank'} > 1 ) { - $stickies{$hr->{'exon_id'}} = 1; - } -} - -$nrExons = scalar( keys %exonHash ); - -print ( "Exons in table: ", $rowsRead,"\n" ); -print ( "Nonredundant: ",$nrExons,"\n" ); - - -# build equivalence classes -my @equClass; -my %allExons; - -my $i = 0; -for my $hashkey ( keys %exonHash ) { - $equClass[$i] = { 'position' => $hashkey, - 'exon_id' => undef }; - - for my $exon ( @{$exonHash{$hashkey}} ) { - $allExons{$exon->[0]}->[$exon->[1]-1] = $i; - } - $i++; -} - -# make new equclasses by combining old ones -my %combinedEqu; - -for my $exonid ( keys %allExons ) { - if( scalar( @{$allExons{$exonid}} ) > 1 ) { - my $newEqKey = join( "-",@{$allExons{$exonid}} ); - my $newEq; - # print STDERR "New equclass made $newEqKey\n"; - - if( ! exists $combinedEqu{$newEqKey} ) { - $equClass[$i] = { 'position' => $newEqKey, - 'exon_id' => undef }; - $combinedEqu{$newEqKey} = $i; - $newEq = $i; - $i++; - } else { - $newEq = $combinedEqu{$newEqKey}; - } - - $allExons{$exonid} = [$newEq]; - } -} - - -# now give away exon_ids for equClasses -for my $exonid ( keys %allExons ) { - my $equNo = $allExons{$exonid}->[0]; - if( defined $equClass[$equNo]->{'exon_id'} ) { - $exonTranslate{$exonid} = $equClass[$equNo]->{'exon_id'} - } else { - $equClass[$equNo]->{'exon_id'} = $exonid; - } -} - -print "Translation entries: ",scalar( keys %exonTranslate ),"\n"; - -for my $exon ( keys %exonTranslate ) { - print "Map:\t",$exon,"\t",$exonTranslate{$exon},"\n"; -} - -# dumping_data - -$dbh->disconnect; - - -exit; - -sub dumping_data { - # dump and filter exon, exon_transcript, exon_stable_id - # should do supporting feature I guess ... - # this is ONE OFF, - $sth = $dbh->prepare( "select * from exon" ); - $sth->execute(); - - open ( EXON, ">exon.txt" ) or die; - - while( my $arr = $sth->fetchrow_arrayref() ) { - if( ! defined $exonTranslate{$arr->[0]} ) { - print EXON join( "\t", @$arr ),"\n"; - } - } - close( EXON ); - - - open ( EXONT, ">exon_transcript.txt" ) or die; - $sth = $dbh->prepare( "select * from exon_transcript" ); - $sth->execute(); - while( my $arr = $sth->fetchrow_arrayref() ) { - if( defined $exonTranslate{$arr->[0]} ) { - $arr->[0] = $exonTranslate{$arr->[0]}; - } - print EXONT join( "\t", @$arr ),"\n"; - } - close( EXONT ); - - -# open ( EXONS, ">exon_stable_id.txt" ) or die; -# $sth = $dbh->prepare( "select * from exon_stable_id" ); -# $sth->execute(); -# while( my $arr = $sth->fetchrow_arrayref() ) { -# if( ! defined $exonTranslate{$arr->[0]} ) { -# print EXONS join( "\t", @$arr ),"\n"; -# } -# } -# close( EXONS ); - - open ( SUPF, ">supporting_feature.txt" ) or die; - $sth = $dbh->prepare( "select * from supporting_feature" ); - $sth->execute(); - while( my $arr = $sth->fetchrow_arrayref() ) { - if( defined $exonTranslate{$arr->[0]} ) { - $arr->[1] = $exonTranslate{$arr->[1]}; - } - print SUPF join( "\t", @$arr ),"\n"; - } - close( SUPF ); -} - - diff --git a/misc-scripts/idmap2/mapping.pl b/misc-scripts/idmap2/mapping.pl deleted file mode 100644 index 6aa54d870f..0000000000 --- a/misc-scripts/idmap2/mapping.pl +++ /dev/null @@ -1,497 +0,0 @@ -# this script produces idmapping for genes, transcripts, translations and exons -# It needs Database orig with stable_ids set -# it needs Database target where the stable ids are to be set -# it produces log output - -# it uses direct sql for speed. All queries are in SQL.pm -# it uses LSF for some jobs I guess ... - -use DBI; -use SQL; -use Bio::PrimarySeq; - -use strict; -use Bio::EnsEMBL::Pipeline::Runnable::CrossMatch; -use MapGenes; -use MapTranscripts; - -my $direct_maps = 0; -my $crossmatches = 0; -my $cloneGroups = 0; -my $seqMatches = 0; -my $cmmaps = 0; -my $lasttime; - -my $sdbh = DBI->connect( "DBI:mysql:host=ecs1d;database=homo_sapiens_core_130", "ensro", ""); -my $tdbh = DBI->connect( "DBI:mysql:host=ecs1e;database=ens_NCBI_28", "ensro", ""); - -my $starttime = scalar( localtime() ); - -print STDERR "Start: ",scalar(localtime()),"\n"; -my $alloExonInfo = &SQL::orig_exon_information( $sdbh ); -my $allnExonInfo = &SQL::target_exon_information( $tdbh ); - -# remove exon doublettes caused by multiple transcripts -my ( %exonHash, %geneHash ); -my ( $oExonInfo, $nExonInfo, @exonInfo ); -my ( $oGenes, $nGenes ); - -for my $exon ( @$alloExonInfo ) { - if( ! exists $exonHash{ $exon->{'exon_id'}."-".$exon->{'sticky_rank'} } ) { - push( @{$oExonInfo}, $exon ); - $exonHash{$exon->{'exon_id'}."-".$exon->{'sticky_rank'} } = 1; - } - if( ! exists $geneHash{$exon->{'gene_id'}} ) { - $geneHash{$exon->{'gene_id'}} = 1; - } -} - -# filter out stickies -# @{$oExonInfo} = ( grep { ! defined $exonHash{$_->{'exon_id'}."-2"} } @exonInfo ); - -$oGenes = scalar( keys %geneHash); - -# @exonInfo= (); -%geneHash = (); -%exonHash = (); - -for my $exon ( @$allnExonInfo ) { - if( ! exists $exonHash{ $exon->{'exon_id'}."-".$exon->{'sticky_rank'} } ) { - push( @{$nExonInfo},$exon ); - $exonHash{$exon->{'exon_id'}."-".$exon->{'sticky_rank'} } = 1; - } - if( ! exists $geneHash{$exon->{'gene_id'}} ) { - $geneHash{$exon->{'gene_id'}} = 1; - } -} -$nGenes = scalar( keys %geneHash); -# @{$nExonInfo} = ( grep { ! defined $exonHash{$_->{'exon_id'}."-2"} } @exonInfo ); - -print STDERR "Finish: ",scalar(localtime()),"\n"; - -print STDERR "OldExons: ",scalar( @$oExonInfo ),"\n"; -print STDERR "NewExons: ",scalar( @$nExonInfo ),"\n"; -print STDERR "OldGenes: $oGenes\n"; -print STDERR "NewGenes: $nGenes\n"; - - -direct_mapping( $oExonInfo, $nExonInfo ); -contig_version_update( $oExonInfo, $nExonInfo ); - -MapGenes::map_genes( $oExonInfo, $nExonInfo ); -MapTranscripts::map_transcripts( $alloExonInfo, $allnExonInfo ); - -$sdbh->disconnect(); - -$tdbh->disconnect(); - -print STDERR "Start: $starttime\n"; -print STDERR "End : ",scalar( localtime()),"\n"; - -print STDERR "Old Exon Count: ", scalar( @$nExonInfo ), "\n"; -print STDERR "New Exon Count: ", scalar( @$nExonInfo ), "\n"; -print STDERR "Direct Mappings: ", $direct_maps,"\n"; -print STDERR "Crossmatches: $crossmatches\n"; -print STDERR "CloneGroups: $cloneGroups\n"; -print STDERR "SeqMatches: $seqMatches\n"; -print STDERR "Mappings on Crossmatch: $cmmaps\n"; - -exit; - - -open( D1, ">dump1.txt" ); -open( D2, ">dump2.txt" ); - - -my ( @dumpOld, @dumpNew ); - -print STDERR "Dumping ...\n"; - -@dumpOld = sort { ( $a->{'clone_id'} cmp $b->{'clone_id'} ) || - ( $a->{'exon_start'} <=> $b->{'exon_start'}) } @$oExonInfo; -@dumpNew = sort { ( $a->{'clone_id'} cmp $b->{'clone_id'} ) || - ( $a->{'exon_start'} <=> $b->{'exon_start'}) } @$nExonInfo; - - -*STDOUT = *D1; -for my $ex ( @dumpOld ) { - print_exon( $ex ); -} - -close D1; - -*STDOUT = *D2; -for my $ex ( @dumpNew ) { - print_exon( $ex ); -} - -close D2; - - -exit; - - -# extract information (lot of data) -# direct mappings -# contig version update - - - -# similar as direct mapping -# checks if there is an old exon, and a new exon on same clone but -# updated version. Exclude clones with same version. - -sub contig_version_update { - my ( $old, $new ) = @_; - $lasttime = time(); - - my ( $pruneOld, $pruneNew, $sortOld, $sortNew ); - my ( $old_iter, $new_iter, $skip_clone ); - - @{$pruneOld} = grep { ! exists $_->{'mapped'} } @{$old}; - @{$pruneNew} = grep { ! exists $_->{'mapped'} } @{$new}; - - # prune short exons (<10) would be appropriat - print STDERR "Old Exons left to map: ",scalar( @$pruneOld ),"\n"; - print STDERR "New Exons left to map: ",scalar( @$pruneNew ),"\n"; - - @{$sortOld} = sort { $a->{'clone_id'} cmp $b->{'clone_id'} } @$pruneOld; - @{$sortNew} = sort { $a->{'clone_id'} cmp $b->{'clone_id'} } @$pruneNew; - - - $new_iter = 0; - $old_iter = 0; - - while( 1 ) { - - my $old_exon = $sortOld->[$old_iter]; - my $new_exon = $sortNew->[$new_iter]; - - my $cmp = ( $old_exon->{'clone_id'} cmp $new_exon->{'clone_id'} ); - if( $cmp == 0 ) { - my $clone_id = $old_exon->{'clone_id'}; - - # skip clones with no updates - if( $old_exon->{'clone_version'} == $new_exon->{'clone_version'} ) { - $skip_clone = 1; - } else { - $skip_clone = 0; - } - - my ( $newCloneExons, $oldCloneExons ); - while( defined $old_exon && - $old_exon->{'clone_id'} eq $clone_id ) { - push( @{$oldCloneExons}, $old_exon ); - $old_iter++; - $old_exon = $sortOld->[$old_iter]; - } - - while( defined $new_exon && - $new_exon->{'clone_id'} eq $clone_id ) { - push( @{$newCloneExons}, $new_exon ); - $new_iter++; - $new_exon = $sortNew->[$new_iter]; - } - if( !$skip_clone ) { - clone_map( $oldCloneExons, $newCloneExons ); - } - - } elsif ( $cmp < 0 ) { - $old_iter++; - } else { - $new_iter++; - } - if( $new_iter >= scalar( @$sortNew ) || - $old_iter >= scalar( @$sortOld )) { - last; - } - if( time() - $lasttime > 3600 ) { - print STDERR ( "Finished $old_iter exons ",scalar( localtime() ),"\n" ); - $lasttime = time(); - } - } - -} - - - -# map two sets of exons in the same clone from old to new -# by using crossmatch module ... -sub clone_map { - my ( $oldCloneExons, $newCloneExons ) = @_; - my ( $oldExHash, $newExHash ); - local *HERR, *HOUT; - - open ( NULL, ">/dev/null" ); - - $cloneGroups++; - - for my $exon ( @$oldCloneExons ) { - $oldExHash->{$exon->{'exon_id'}} = SQL::exon_sequence( $sdbh, $exon->{'exon_id'} ); - # $exon->{'seq'} = $oldExHash->{$exon->{'exon_id'}}; - } - - for my $exon ( @$newCloneExons ) { - $newExHash->{$exon->{'exon_id'}} = SQL::exon_sequence( $tdbh, $exon->{'exon_id'} ); - # $exon->{'seq'} = $newExHash->{$exon->{'exon_id'}}; - } - - clone_seq_match( $oldCloneExons, $newCloneExons, - $oldExHash, $newExHash ); - - # no crossmatch mappings - return; - - my ( @oldPrune, @newPrune, @scorelist ); - @oldPrune = grep { ! exists $_->{'mapped'} } @$oldCloneExons; - @newPrune = grep { ! exists $_->{'mapped'} } @$newCloneExons; - - for my $oEx ( @oldPrune ) { - for my $nEx ( @newPrune ) { - my $crossMatcher = new Bio::EnsEMBL::Pipeline::Runnable::CrossMatch - ( -seq1 => new Bio::PrimarySeq( -seq => $oldExHash->{$oEx->{'exon_id'}}, - -id => $oEx->{'exon_id'} ), - -seq2 => new Bio::PrimarySeq( -seq => $newExHash->{$nEx->{'exon_id'}}, - -id => $nEx->{'exon_id'} ), - -score => 200 - ); - - *HERR = *STDERR; - *HOUT = *STDOUT; - - *STDOUT = *NULL; - *STDERR = *NULL; - - $crossMatcher->run(); - - *STDOUT = *HOUT; - *STDERR = *HERR; - - $crossmatches++; - my $score = 0; - my $matches = 0; - for my $fp ( $crossMatcher->output() ) { - $score += $fp->score(); - $matches = 1; - } - if( $matches ) { - push( @scorelist, { score => $score, - oldEx => $oEx, - newEx => $nEx } ); - } - - } - } - - # now map exons which have crossmatches - my @sortedScores; - @sortedScores = sort { $b->{'score'} <=> $a->{'score'} } @scorelist; - - for my $scoreRecord ( @sortedScores ) { - if( ! exists $scoreRecord->{'oldEx'}{'mapped'} && - ! exists $scoreRecord->{'newEx'}{'mapped'} ) { - $scoreRecord->{'oldEx'}{'mapped'} = 'crossmatch Clone Seq'; - $scoreRecord->{'newEx'}{'mapped'} = 'crossmatch Clone Seq';; - $scoreRecord->{'newEx'}{'exon_stable'} = - $scoreRecord->{'oldEx'}{'exon_stable'}; - print ( "Exon mapped:\t", $scoreRecord->{'oldEx'}{'exon_id'},"\t", - $scoreRecord->{'newEx'}{'exon_id'},"\t", - $scoreRecord->{'oldEx'}{'exon_stable'},"\n" ); - - $cmmaps++; -# print STDERR "---- CROSSMATCH MAP ----",$scoreRecord->{'score'},"\n"; - } - } -} - - -# direct sequence mapping inside a clone -sub clone_seq_match { - my ( $oldCloneExons, $newCloneExons, - $oldSeqHash, $newSeqHash ) = @_; - - - my ( @sold, @snew ); - @sold = sort { $a->{'chr_start'} <=> $b->{'chr_start'} } @$oldCloneExons; - @snew = sort { $a->{'chr_start'} <=> $b->{'chr_start'} } @$newCloneExons; - - my %hashOnSeq; - - for my $exon ( @$oldCloneExons ) { - my $seq = $oldSeqHash->{$exon->{'exon_id'}}; - if( ! exists $hashOnSeq{ $seq } ) { - $hashOnSeq{$seq} = [[ $exon ],[]]; - } else { - push( @{$hashOnSeq{ $seq }->[0]}, $exon ); - } - } - - for my $exon ( @$newCloneExons ) { - my $seq = $newSeqHash->{$exon->{'exon_id'}}; - if( ! exists $hashOnSeq{ $seq } ) { - next; - } else { - push( @{$hashOnSeq{ $seq }->[1]}, $exon ); - } - } - - for my $seq ( keys %hashOnSeq ) { - my $seqEqExons = $hashOnSeq{$seq}; - my $seqO = $seqEqExons->[0]; - my $seqN = $seqEqExons->[1]; - - if(( ! @$seqO ) || ( ! @$seqN )) { - next; - } - - if( scalar( @$seqO ) == scalar( @$seqN )) { - if( scalar( @$seqO ) > 1 ) { - print STDERR ( "Matching exon sequences.\n" ); - } - for( my $i=0; $i<=$#$seqO; $i++ ) { - my $oEx = $seqO->[$i]; - my $nEx = $seqN->[$i]; - if( ! exists $oEx->{'mapped'} && - ! exists $nEx->{'mapped'} ) { - $oEx->{'mapped'} = 'same Clone Seq'; - $nEx->{'mapped'} = 'same Clone Seq'; - $nEx->{'exon_stable'} = $oEx->{'exon_stable'}; - print ( "Exon mapped:\t", $oEx->{'exon_id'},"\t", - $nEx->{'exon_id'},"\t", $oEx->{'exon_stable'},"\n" ); - $seqMatches++; - } - } - } else { - print STDERR ( "Unequal number of matching exon sequences.\n" ); - } - } -} - - - -# takes two exon lists and marks old exons with exact new exons matching -# marks new exons as matched as well -# does the direct approach. Exons are equal, when same position on unchanged contig - -sub direct_mapping { - my ( $old, $new ) = @_; - - my ( $sold, $snew ); - - print STDERR "Direct Mapping started: ",scalar( localtime() ),"\n"; - - @{$sold} = sort { exon_direct_compare( $a, $b ) } @$old; - @{$snew} = sort { exon_direct_compare( $a, $b ) } @$new; - - print STDERR "Sorted: ",scalar(localtime()),"\n"; - - my ( $new_iter, $old_iter ); - my ( $old_exon, $new_exon ); - - $new_iter = 0; - $old_iter = 0; - - while( 1 ) { - - $old_exon = $sold->[$old_iter]; - $new_exon = $snew->[$new_iter]; - my $exon_compare_result; - if(( $old_exon->{'clone_id'} cmp $new_exon->{'clone_id'}) == 0 && - $old_exon->{'clone_version'} == $new_exon->{'clone_version'} && - $old_exon->{'exon_strand'} == $new_exon->{'exon_strand'} && - $old_exon->{'contig_offset'} == $new_exon->{'contig_offset'} ) { - $exon_compare_result = 0; - } else { - $exon_compare_result = exon_direct_compare( $old_exon, $new_exon ); - } - - if( $exon_compare_result == 0 ) { - # direct map possible - - # find overlap between the two exons. Need >50% of new exon - # covered - - my ( $overlap ) = - sort { $a <=> $b } ( $old_exon->{'exon_end'}-$old_exon->{'exon_start'}+1, - $new_exon->{'exon_end'}-$new_exon->{'exon_start'}+1, - $old_exon->{'exon_end'}-$new_exon->{'exon_start'}+1, - $new_exon->{'exon_end'}-$old_exon->{'exon_start'}+1 ); -# print "---- MAPPING ----\n"; - -# print_exon( $old_exon ); -# print_exon( $new_exon ); -# print "OVERLAP: $overlap\n"; - - if(( $overlap / - ( $new_exon->{'exon_end'} - $new_exon->{'exon_start'} + 1 )) > 0.5 ) { - -# print "MAPPED\n"; - $old_exon->{'mapped'} = 'direct'; - $new_exon->{'mapped'} = 'direct'; - - $new_exon->{'exon_stable'} = $old_exon->{'exon_stable'}; - print ( "Exon mapped:\t", $old_exon->{'exon_id'},"\t", - $new_exon->{'exon_id'},"\t", $old_exon->{'exon_stable'},"\n" ); - $old_iter++; - $new_iter++; - - if( $old_iter >= scalar( @$old ) || - $new_iter >= scalar( @$new ) ) { - last; - } - $direct_maps++; - next; - } - } -# print "NOT MAPPED\n"; - - $exon_compare_result = exon_direct_compare( $old_exon, $new_exon ); - - if( $exon_compare_result == -1 ) { - $old_iter++; - if( $old_iter > scalar( @$sold ) ) { - last; - } - next; - - } elsif( $exon_compare_result == 1 ) { - $new_iter++; - if( $new_iter > scalar( @$snew ) ) { - last; - } - next; - - } else { - $old_iter++; - $new_iter++; - if( $old_iter > scalar( @$old ) || - $new_iter > scalar( @$new ) ) { - last; - } - - } - } - - print STDERR "Direct Mapping finished: ",scalar( localtime() ),"\n"; - return; -} - -sub exon_direct_compare { - my ( $a, $b ) = @_; - return $a->{'clone_id'} cmp $b->{'clone_id'} || - $a->{'clone_version'} <=> $b->{'clone_version'} || - $a->{'contig_offset'} <=> $b->{'contig_offset'} || - $a->{'exon_start'} <=> $b->{'exon_start'} || - $a->{'exon_strand'} <=> $b->{'exon_strand'} ; -} - - - - -sub print_exon { - my $exon = shift; - for my $key ( sort keys %{$exon} ) { - print $key,"\t",$exon->{$key},"\n"; - } - print "\n"; -} -- GitLab