diff --git a/scripts/igi/README b/scripts/igi/README deleted file mode 100644 index 121439d467684b912770f5207e55ba160546a236..0000000000000000000000000000000000000000 --- a/scripts/igi/README +++ /dev/null @@ -1,31 +0,0 @@ -# $Id$ -Just a little documentation on how this is supposed to work. Most of this is -written in scripted from in all-igi.sh; this may not be entirely up-to-date, -but you'll get the idea. - -- go to some igi working directory -- get all the data files -- make subdirs for each data source, containing their data file(s) -- run all-collates.sh. This will (or should) prepare one big gtf file foreach - data source in a new subdirectory out/collated/*.all - The data is now ready to be merged. -- run all-merges.sh. This will (or should) run all the pairwise, triple, (and - higher) merges on all the single-source gtf files, and dump the resulting - merged files into sub-directory out/merged/ -- run all-stats.sh. This will (or should) run all the statistics on each of - the merged files. The output goes into sub-directory out/stats/. It will - also produce file that map igi's to native ids and vice versa, in directory - out/mapping. It will output so-called summary files into out/summary, which - contain 'gene' features that extend from the start of the left most exon - found in any igi up to the end of the rightmost exon in any igi. The igi's - included are those given by the -cluster_n parameter (currently 2), which - says how many sources must 'report' an igi (i.e., predict something that - falls in this igi) before it's included in this set. The summary files are - then used to filter the merged/ files so that only the right cluster_n - igi's are listed. They are called 'final', and this bit is done by - gtfsummary2gtf.pl script. -- lastly, the peptides corresponding to the 'final' gft file are to be - dumped, using the original peptides produced by the different sources. -- run pep-collate to prepare the original peptide files for dumping. -- then run gtfsummary2pep.pl. -- gzip and ftp the stuff to the ftp site. diff --git a/scripts/igi/affymetrix.collate.sh b/scripts/igi/affymetrix.collate.sh deleted file mode 100755 index 99a692f32dd3b19796e82bd6010aea04fbb3880e..0000000000000000000000000000000000000000 --- a/scripts/igi/affymetrix.collate.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -x -# -*- mode: sh; -*- - -for chr in "$@"; do - find $chr -name '*.affymetrix.gtf' -exec \ - nawk -F\t '$3=="exon" || $3 ~ "_codon" ' {} \; -done -# done | sort -k1,1 -k7,7 -k4,4n : not needed, sort will take place later on -# anyway - diff --git a/scripts/igi/all-collates.sh b/scripts/igi/all-collates.sh deleted file mode 100755 index a15c9f168b8d71020ac780635146a830e282db45..0000000000000000000000000000000000000000 --- a/scripts/igi/all-collates.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/sh -x -# -*- mode: sh; -*- -# $Id$ - -# script to prepare all the raw data files from all the different sources -# (ensembl, affymetrix, fgeneh etc.) that will go in to the merge. It will -# typically call a source-specific collate.sh script in the source-specific -# subdirectories. - -# If the only thing that changes across releases are file names or directory -# names, then having the right symlinks in place will make it general enough -# See comments on symlinks below each of the data sources - -igihome=$HOME/proj/igi # change to needs -resultdir=$igihome/out -outdir=$resultdir/collated - -[ -d $resultdir ] && echo "found $resultdir, not overwriting it" >&2 && exit 1 -mkdir $resultdir -mkdir $outdir - -# all=all.gtf # where local collated data goes to -# log=collate.log # where local log goes to -rename=remap-sources.sed # to translate the source field - -### EnsEMBL: -source=ensembl -indir=$igihome/$source - -[ ! -d $indir ] && echo "$indir: not found" >&2 && exit 1 -ensembl.collate.sh < $indir/ensembl.gtf 2> $outdir/$source.log | $rename > $outdir/$source.all -### end Ensembl - - -### Affymetrix -## It has everything in chromosome directories that don't seem to -## change across releases, but warn about this: -# chroms="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y" -chroms="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y UL NA" -echo "make sure that these are the right chromosomes:" | tee /dev/tty >&2 -echo $chroms >&2 -echo "" > /dev/tty -source=affymetrix -indir=$igihome/$source -[ ! -d $indir ] && echo "$indir: not found" >&2 && exit 1 -cd $indir -affymetrix.collate.sh $chroms 2> $outdir/$source.log | $rename > $outdir/$source.all -### end Affymetrix - -### Softberry/Fgenesh -source=fgenesh -indir=$igihome/$source # can be symlink -# outdir=$resultdir/$source -[ ! -d $indir ] && echo "$indir: not found" >&2 && exit 1 -cd $indir - -# fgenesh.collate.sh chr_gff/*.sgp.gff 2>$source.log | $rename > $source.all -fgenesh.collate.sh chr_gff/*.sgp.gff chr_r/*.sgp.gff 2>$outdir/$source.log | $rename > $outdir/$source.all - -# fgenesh chromo+chromo_coords -> fpcctg+fpc_coords mapping may be funny, run -# statistics on this: -missingdir=$outdir/fgenesh.missing -mkdir $missingdir -cd $missingdir -fgenesh.missing-stats.sh $outdir/$source.log # puts stuff into missing-stats/* - -### end Softberry/Fgenesh - - -# add more sources here if needed - -# done diff --git a/scripts/igi/all-igi.sh b/scripts/igi/all-igi.sh deleted file mode 100755 index ce7d3b9210e848b11742b81f9548f71e6b18823e..0000000000000000000000000000000000000000 --- a/scripts/igi/all-igi.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/sh -x -# -*- mode: sh; -*- -# $Id$ -# -# run the complete IGI show. May not be all that useful (since often I have -# to repair things by hand ... in fact, a Makefile would be more convenient), -# but at least can serve as some kind of documentation of what's supposed to -# happen. See also README in this directory, of course. - -# prepare the sources into a form that can be merged; results go to -# subdir out/collated -all-collates.sh > all-collates.out 2>all-collates.log - -# do the merge; results go to subdir out/merged -all-merges.sh > all-merges.out 2>all-merges.log - -# run statistics on the merges, and also produce the summary and final gtf -# file(s). Results go to out/{stats,mapping,summary,final} -all-stats.sh > all-stats.out 2>all-stats.log - -igihome=$HOME/proj/igi # change to needs -resultdir=$igihome/out -indir=$resultdir/merged - -outdir=$resultdir/stats -mappingoutdir=$resultdir/mapping -summaryoutdir=$resultdir/summary -finaloutdir=$resultdir/final - -allmerges=*.merge -fullmergeonly=ens_affy_fgenesh.merge -indir=$resultdir/merged - -# now produce the 'final' gtf files, i.e. the ones that are predicted by two -# or more sources: -for d in $finaloutdir; do - [ -d $d ] && echo "Found dir $d, not merging" >&2 && exit 1 - mkdir $d -done - -cd $indir -for m in $fullmergeonly; do - name=`basename $m .merge` - final=$name.gtf - gtfsummary2gtf.pl $summaryoutdir/$name.summary < $m > $finaloutdir/$final - gzip < $finaloutdir/$final > $finaloutdir/$final.gz -done - - -# do the peptide business; first, collate them: -pep-collate.sh 2> pep-collate.log - -# Now pull out the longest peptide of each original igi -# peptide file, using the 'valid' igi's from the summary files (which are -# produced by all-stats.sh. -summary=out/summary/ens_affy_fgenesh.summary -peptidefiles="ensembl/ensembl.pep affymetrix/affymetrix.pep fgenesh/fgenesh.pep" -outdir=out/pep -outfile=$outdir/igi3.pep -logfile=$outdir/igi3.log -gtfsummary2pep.pl $summary $peptidefiles > $outfile 2> $logfile -gzip -c $outfile > $outfile.gz -# ftp stuff, and we're done. - diff --git a/scripts/igi/all-merges.sh b/scripts/igi/all-merges.sh deleted file mode 100755 index dea015c64eb8e56ec1e165a17d38a59f2f38f26f..0000000000000000000000000000000000000000 --- a/scripts/igi/all-merges.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/sh -x -# -*- mode: sh; -*- -# $Id$ - -## script to do all the merges. Change to needs, this is not something very -## general or so. Although you probably could make that using some cheeky -## symlinks - -## For each source (ensembl, affymetrix, fgeneh etc.), it needs one big -## collated file (called all.gtf). These files (must) have been created first -## by all-collate.sh. - -igihome=$HOME/proj/igi # change to needs -resultdir=$igihome/out -outdir=$resultdir/merged -indir=$resultdir/collated - -prefix=igi3 - -[ -d $outdir ] && echo "Found dir $outdir, not merging" >&2 && exit 1 -mkdir $outdir -[ ! -d $indir ] && echo "$indir: not found" >&2 && exit 1 - -ens=ensembl.all -affy=affymetrix.all -fgenesh=fgenesh.all - -cd $outdir -# gtf_merge.pl -p $prefix $indir/$ens $indir/$affy > ens_affy.merge 2> ens_affy.log -# gtf_merge.pl -p $prefix $indir/$ens $indir/$fgenesh > ens_fgenesh.merge 2> ens_fgenesh.log -# gtf_merge.pl -p $prefix $indir/$affy $indir/$fgenesh > affy_fgenesh.merge 2> affy_fgenesh.log -gtf_merge.pl -p $prefix $indir/$ens $indir/$affy $indir/$fgenesh > ens_affy_fgenesh.merge 2>ens_affy_fgenesh.log - -cd $outdir -for i in *.merge; do - gzip < $i > $i.gz -done - -### after this, run the statistics on these files using all-stats.sh diff --git a/scripts/igi/all-stats.sh b/scripts/igi/all-stats.sh deleted file mode 100755 index c7042a446faa960b92610d8d1305ae3985d18296..0000000000000000000000000000000000000000 --- a/scripts/igi/all-stats.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/sh -x -# $Id$ - -# run the vital statistics on the results of the merges. This will -# use the *.merge files that result from running all-merges.sh - -igihome=$HOME/proj/igi # change to needs -resultdir=$igihome/out -indir=$resultdir/merged - -outdir=$resultdir/stats -mappingoutdir=$resultdir/mapping -summaryoutdir=$resultdir/summary -finaloutdir=$resultdir/final - -allmerges=*.merge -fullmergeonly=ens_affy_fgenesh.merge - -[ ! -d $indir ] && echo "$indir: not found ">&2 && exit 1 - -for d in $outdir $mappingoutdir $summaryoutdir ; do - [ -d $d ] && echo "Found dir $d, not merging" >&2 && exit 1 - mkdir $d -done - -cd $indir - -# for m in $allmerges ; do -for m in $fullmergeonly ; do - name=`basename $m .merge` - stats-from-merge-files.pl < $m -stats -chaining 5 \ - -igi2native $mappingoutdir/$name-i2n \ - -native2igi $mappingoutdir/$name-n2i \ - -cluster_n 2 \ - -gtfsummary $summaryoutdir/$name.summary \ - > $outdir/$name.stats 2> $outdir/$name.log -done - - diff --git a/scripts/igi/ensembl.collate.sh b/scripts/igi/ensembl.collate.sh deleted file mode 100755 index f2451b0c06d008ecc0614575e4f1c43ac401436b..0000000000000000000000000000000000000000 --- a/scripts/igi/ensembl.collate.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh -x -# -*- mode: sh; -*- - -nawk -F\t '$3=="exon" || $3 ~ "_codon" ' -# | sort -k1,1 -k7,7 -k4,4n diff --git a/scripts/igi/example.stats b/scripts/igi/example.stats deleted file mode 100644 index 1881f17a024f3a3dadf2ba27528e5b2c83805c74..0000000000000000000000000000000000000000 --- a/scripts/igi/example.stats +++ /dev/null @@ -1,120 +0,0 @@ -// $Id$ -### $Id: stats-from-merge-files.pl,v 1.17 2001/03/27 15:59:07 lijnzaad Exp -### run on Thu Mar 22 14:24:18 GMT 2001 by lijnzaad @ ecs1d -### argument(s): ens_affy_fgenesh.merge -### -rw-rw-r-- 1 lijnzaad ensembl 104089688 Mar 16 17:21 ens_affy_fgenesh.merge -Sources: AFFY ENS FGENH // i.e., this was a merge of gene predictions from - // two sources, AFFY ENS note: if the merge is done - // on AFFY, ENS + another source, statistics will be - // different, since the other source will link more - // native features into bigger igi's -number of igi's per source: // one 'igi' is a gene merged from all different - // predictions -ALL: 42854 -AFFY: 19087 // -ENS: 22467 // the numbers don't add up, because of overlaps. The closer -FGENH: 32832 // these numbers are to the ALL numbers, the better. ----- -number of igis shared with other source; sharing multiplicity -// (sorry about any misalignments ...) - Affyme ENS FGENH >= 3 >= 2 unshared total - - ALL 19087 (44%) 22467 (52%) 32832 (76%) 11484 (26%) 20048 (46%) 22806 (53%) 42854 -// AFFY has 44% of all igi's, ENS 52%, etc. 26% of all igi's is found in -// three sources, 46% in two or three sources, 53% is unique too it's -// source. There's a total of 42854 igi's. - - AFFY -- 12387 (64%) 13216 (69%) 11484 (60%) 14119 (73%) 4968 (26%) 19087 -// 64% of AFFY igi's is shared with ENS, 69% with FGENH,etc. 60% of the AFFY -// igi's are also found in two other sources, 73% in at least one other -// source, 26% are unique to AFFY. - - ENS 12387 (55%) -- 17413 (77%) 11484 (51%) 18316 (81%) 4151 (18%) 22467 -// 55% of ENS igis is shared with AFFY, etc. - - FGENH 13216 (40%) 17413 (53%) -- 11484 (34%) 19145 (58%) 13687 (41%) 32832 -// ----- -igi stats on ALL: - minl=3, maxl=40655289, avgl=66649 - // minimum length of an igi, maximum, average - minfeats=1, maxfeats=852, avgfeats=18 - // mininum number of features of an igi, maximum, average - minexons=0, maxexons=190, avgexons=5 - // mininum number of exons of an igi, maximum, average -igi stats on AFFY: // same stats, but now per source - minl=107, maxl=16016027, avgl=50930 - minfeats=1, maxfeats=563, avgfeats=13 - minexons=0, maxexons=190, avgexons=7 -. -. -. ----- -igi stats per cluster group: - // same stats, but now on the igi's found in >=1, >=2 , >=3 sources: -those in 1 sources: - minl=3, maxl=40655289, avgl=66649 - minfeats=1, maxfeats=852, avgfeats=18 - minexons=0, maxexons=190, avgexons=5 -those in 2 sources: - minl=137, maxl=36234355, avgl=80609 - minfeats=3, maxfeats=852, avgfeats=32 - minexons=1, maxexons=190, avgexons=8 -those in 3 sources: - minl=297, maxl=21843793, avgl=92732 - minfeats=6, maxfeats=852, avgfeats=44 - minexons=1, maxexons=190, avgexons=10 ----- -gene id chaining: histogram of native gene_ids per igi, and igi statistics on this: -source ALL: - 1: 20301 // i.e., 20301 igi's correspond to exactly one native gene - minl=3, maxl=40655289, avgl=51149 // igi statistics on this as above - minfeats=1, maxfeats=75, avgfeats=5 - minexons=0, maxexons=73, avgexons=3 - 2: 6708 // // This many igi's consist of - minl=35, maxl=39446593, avgl=64369 // 2 different native - minfeats=3, maxfeats=265, avgfeats=11 // genes. The average igi gene - minexons=1, maxexons=52, avgexons=4 // length goes up the more - 3: 8807 // native genes it has (of - . // course) - . - . - 26: 1 - minl=2114986, maxl=2114986, avgl=2114986 - minfeats=113, maxfeats=113, avgfeats=113 - minexons=13, maxexons=13, avgexons=13 -source AFFY: // same thing, but now per source. - 1: 18146 - minl=102, maxl=63721433, avgl=4277470 - minfeats=1, maxfeats=762, avgfeats=26 - minexons=0, maxexons=190, avgexons=7 - 2: 787 - . - . - . ----- -igi's and native ids of igi's that chain together >= 5 native ids -source AFFY: - chaining together 5 native ids ( 4 cases) : - igi3_39917 [ ctg22fin4 3596308 3701371 - ]: - // this igi, which lies on fpcctg ctg22fin4, start 3596308, - // end 3701371, reverse strand, chains to together the following - // AFFY gene_ids: - AFFY Affy.ctg22fin4-000000.57 [ 3596308 3599716 ] - // runs from 3596308 to 3599716. This list is sorted by coordinate - AFFY Affy.ctg22fin4-000000.59 [ 3609955 3618417 ] - AFFY Affy.ctg22fin4-000000.61 [ 3636993 3643656 ] - AFFY Affy.ctg22fin4-000000.62 [ 3672539 3680629 ] - AFFY Affy.ctg22fin4-000000.63 [ 3691050 3701371 ] - // this chaining together is brought about by the following genes - // from other sources; this list is sorted by source, then coordinates - linked by: - ENS ENSG00000099970 [ 3596311 3698200 ] - ENS ENSG00000099977 [ 3610141 3613001 ] - ENS ENSG00000099987 [ 3642053 3643651 ] - FGENH S.C22000279 [ 3596307 3600526 ] - FGENH S.C22000281 [ 3612136 3616899 ] - FGENH S.C22000284 [ 3637133 3643650 ] - FGENH S.C22000288 [ 3672820 3680628 ] - FGENH S.C22000289 [ 3691049 3698199 ] - FGENH S.C22000291 [ 3699141 3700957 ] diff --git a/scripts/igi/exon-lengths.awk b/scripts/igi/exon-lengths.awk deleted file mode 100755 index 1623d2569dd00663e8328c0ea68d0395d426ac12..0000000000000000000000000000000000000000 --- a/scripts/igi/exon-lengths.awk +++ /dev/null @@ -1,14 +0,0 @@ -#! /usr/bin/nawk -f -# $Id$ -# Little script to find statistics on exon lengths from a gtf file. -BEGIN { min=100000; max=-1} -$3=="exon" { - n++ - len=($5-$4) - tot+=len; - if (len<min) min=len - if (len>max) max=len -} -END { - print "avg: ", tot/n, "min: ", min, "max: ", max -} diff --git a/scripts/igi/fgeneh/THIS-DIR-IS-OBSOLETE b/scripts/igi/fgeneh/THIS-DIR-IS-OBSOLETE deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/scripts/igi/fgenesh-chromo2fpc.pl b/scripts/igi/fgenesh-chromo2fpc.pl deleted file mode 100755 index 755ae9821fcdce94662aa21c7e69ddb62351d4ef..0000000000000000000000000000000000000000 --- a/scripts/igi/fgenesh-chromo2fpc.pl +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/local/bin/perl -# $Id$ - -# Script to translate fgenesh++ gtf files from chromosome+chr_coords to -# fpcctg_name + fpc_coords. Is a bit tricky due to our golden path -# missing (parts of) fpc_contigs. -# This is used for the IGI work. - -# cared for by Philip lijnzaad@ebi.ac.uk - -use DBI; -use strict; - -use vars qw($opt_h $opt_C); - -use Getopt::Long; -# use Bio::EnsEMBL::Utils::GTF_Merge('gtf_merge'); -use Bio::EnsEMBL::DBSQL::Obj; -use Bio::EnsEMBL::DBLoader; - -my $Usage = "Usage: $0 [ -h (help) ] < chromo-coords.gtf > fpc-coords.gtf [ >& log ]\n"; -my $opts = 'hC:'; - -my $connection = ($opt_C || 'host=ecs1c;user=ensro;database=ensembl080;'); - -#Database options -my $host = 'ecs1c'; -my $port = undef; -my $dbname = 'ensembl080'; -my $dbuser = 'ensro'; -my $dbpass = undef; -my $module = 'Bio::EnsEMBL::DBSQL::Obj'; -my $help; - -&GetOptions( - 'host:s' => \$host, - 'dbname:s' => \$dbname, - 'dbuser:s' => \$dbuser, - 'dbpass:s' => \$dbpass, - 'module:s' => \$module, - 'h|help' => \$help, - ); -die $Usage if $help; - -my $locator = "$module/host=$host;port=$port;dbname=$dbname;user=$dbuser;pass=$dbpass"; - -my $db = Bio::EnsEMBL::DBLoader->new($locator); - -my %bighash = undef; - -&read_mapping($db); - -# $db->static_golden_path_type('UCSC'); -# my $stadaptor = $db->get_StaticGoldenPathAdaptor(); - -while(<>) { - chomp($_); - my ($chr, $source, $tag, $start, $end, $score, $strand, $frame, @rest) - = split("\t"); -### depracated, much too slow (2 s per query) -### ($chr, $start, $end) = $stadaptor->convert_chromosome_to_fpc($chr,$start, $end); - - #reformat a bit as well: - grep( s/ ([^ ]+)/ "$1";/ , @rest); - my $rest = join(' ', @rest); - chomp $rest; - - my ($contig, $ctg_start, $ctg_end, $status) = map_coords($chr,$start, $end); - - if (! $status ) { - print join("\t", ($contig, $source, $tag, - $ctg_start, $ctg_end, $score, - $strand, $frame, $rest)), "\n"; - } elsif ($status eq 'init' ) { - warn "$chr: ignoring it, but it might lie on missing initial part of $contig: $_\n"; - } elsif ($status eq 'missing') { - warn "$chr: cannot find contig for $_; ignoring it\n"; - } elsif ($status eq 'bug') { - warn "$chr: bug with $_; ignoring it\n"; - } else { - die "Bugger"; # even bigger bug - } -} # while - -sub read_mapping { - my ($db) = shift; - my ($q) = "SELECT chr_name, MIN(chr_start), MAX(chr_end), fpcctg_name, MIN(fpcctg_start), MAX(fpcctg_end) - FROM static_golden_path - GROUP BY fpcctg_name - ORDER BY chr_name, chr_start"; - $q=$db->prepare($q) || die $db->errstr; - $q->execute() || die $db->errstr; - - while ( my ($chr, $chr_start, $chr_end, $fpcctg_name, $fpcctg_start, $fpcctg_end ) - = $q->fetchrow_array ) { - die $db->errstr if $q->err; - push @{$bighash{$chr}}, [ $chr_start, $chr_end, $fpcctg_name, $fpcctg_start ]; - # structure : hash{chr} contains sorted list of these arrays - - } - return; -} # read_mapping - -# return fpc_ctg_name, start and end, status, given the chromosome+start/end -sub map_coords { - my ($chr, $start, $end) = @_; - - my $contigs = $bighash{$chr}; - my $n = 0; - - if (! defined $contigs) { - warn "don't know $chr"; - return ('ctg_on_unknown_chromosome', -999, -999, 'bug'); - } - - foreach my $l ( @{$contigs} ) { - if ( $start < $l->[1] && $start >= $l->[0] ) { - # found it - my ( $chr_start, $chr_end, $fpcctg_name, $fpcctg_start) - = @$l; - return ($fpcctg_name, - $fpcctg_start-1+ $start - $chr_start, - $fpcctg_start-1+ $end - $chr_start, undef); - } - } - # this happens (only?) if we have an fpccontig with start != 1; - # look for the missing bit. - my $prev = undef; - foreach my $curr ( @{$contigs} ) { - if (defined $prev - && $start > $prev->[1] && $start < $curr->[0] ) { - ## see if we were correct - my ( $chr_start, $chr_end, $fpcctg_name, $fpcctg_start) - = @$curr; - if ( $fpcctg_start == 1 ) { - # this means we can't pretend that it's on the missing - # piece of this contig; so this contig is simply missing - # from our static_golden_path - return ('missing_ctg', -999, -999, 'missing'); - } - #else: - # fpccontig_start is not 1; now just pretend this feature - # is on the 'unknown' first bit of the current fpc contig, - # and pretend our fpccontig actually did start at 1: - my $status = 'init'; - $chr_start -= ($fpcctg_start -1); - $fpcctg_start =1; - my $newstart = $fpcctg_start-1+ $start - $chr_start; - $status = 'missing' if ($newstart < 1); - # (i.e., it is too far away from our fpcctg_start to still be - # able to pretend it's still on this contig, so report as missing) - - return ($fpcctg_name, - $newstart, - $fpcctg_start-1+ $end - $chr_start, $status); - } - $prev = $curr; - } - warn "really couldn find coords of $chr $start $end\n"; - return ('bug_ctg', -999, -999, 'bug'); -} # map_coords - diff --git a/scripts/igi/fgenesh.README b/scripts/igi/fgenesh.README deleted file mode 100644 index 5154756de474288af4d300aff15cc4e3a9e604ed..0000000000000000000000000000000000000000 --- a/scripts/igi/fgenesh.README +++ /dev/null @@ -1,13 +0,0 @@ -*.sgp.gff: original fgenesh files -collate.sh: used to both convert chromosomes/chromo_coords to - fpccontigs/fpc_coords. Needs ensembl/scripts/chromo2fpc.pl -all.gff: (unsorted) file with all coord-converted features -all.log: the log file - -## -missing-stats.sh: compiles vital statistics: -missing-stats: the output -assumed-on-first: some features can't be found, but they appear just before - an fpccontig for which fpc_start != 1; in this case, we assume it maps to - this missing first part of the contig -missing: these features can't be found at all, and lie in gaps diff --git a/scripts/igi/fgenesh.collate.sh b/scripts/igi/fgenesh.collate.sh deleted file mode 100755 index 2dc08d4256947bd4afedf890adaaebfd77e38d7d..0000000000000000000000000000000000000000 --- a/scripts/igi/fgenesh.collate.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh -x -# -*- mode: sh; -*- - -# mangle all files into one big happy file; have to convert chromosome to fpc -# coordinates first: -convert=fgenesh-chromo2fpc.pl # in ensembl/scripts/igi - -for f in "$@" ; do - nawk -F\t '$3=="exon" || $3 ~ "_codon" ' $f | $convert -done -# sort is not needed, done later anyway - - diff --git a/scripts/igi/fgenesh.missing-stats.sh b/scripts/igi/fgenesh.missing-stats.sh deleted file mode 100755 index 0418d510a438cf5899a6f49211a1515d012efda0..0000000000000000000000000000000000000000 --- a/scripts/igi/fgenesh.missing-stats.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/sh -x -# -*- mode: sh; -*- -# $Id$ - -# litte script to compile statistics on missing features: - -outdir=./ -# create the following files: -missing=$outdir/missing -init=$outdir/assumed-on-first -stats=$outdir/stats -tmp=$outdir/x - -Usage="Usage: $0 logfile. Reads from $logfile, writes to files $missing, $init, $stats" -if [ $# -ne 1 ] ; then - echo $Usage >&2 - exit 1; -fi - -logfile=$1 - -awk -F';' '/initial/{print $1}' $logfile | sort -u > $init -awk -F';' '/cannot/{print $1}' $logfile | sort -u > $missing - -chroms="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y" - -rm $tmp -echo "# features assumed to lie on first (missing) bit of fpc_contig" > $tmp -for c in $chroms; do - echo -n "chr$c:" >> $tmp - awk -F';' '/initial/{print $1}' $logfile | grep chr$c | sort -u | wc -l >> $tmp -done -echo -n "# total:" >> $tmp - awk -F';' '/initial/{print $1}' $logfile | sort -u | wc -l >> $tmp - -echo "# missing features" >> $tmp -chroms="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y" -for c in $chroms; do - echo -n "chr$c:" >> $tmp - awk -F';' '/cannot/{print $1}' $logfile | grep chr$c | sort -u | wc -l >> $tmp -done -echo -n "# total" >> $tmp -awk -F';' '/cannot/{print $1}' $logfile | sort -u | wc -l >> $tmp - -awk -F: '/#/ || $2 > 0' < $tmp > $stats -exit 0 diff --git a/scripts/igi/gtfsummary2gtf.pl b/scripts/igi/gtfsummary2gtf.pl deleted file mode 100755 index fcfbc086e3d986876ef6aa5824d5d58d4b2460db..0000000000000000000000000000000000000000 --- a/scripts/igi/gtfsummary2gtf.pl +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/local/bin/perl - -# $Id$ - -# This script takes a gtf summary file and a merged gft file, and from -# this, filters out the features that don't appear in the summary -# file. The result is the 'final' gtf file. - -# Written by Philip lijnzaad@ebi.ac.uk -# Copyright EnsEMBL http://www.ensembl.org - -use strict; -use Getopt::Long; -use Bio::EnsEMBL::Utils::igi_utils; - -my $usage = "$0 options < merged-file.gtf gtf.summary > gtfile\n"; -my $help; -my @argv_copy = @ARGV; - -&GetOptions( 'h|help' => \$help - ); -die $usage if $help; - -(my $summary = shift) || die $usage; - -open(SUMMARY, "< $summary") || die "$summary: $!"; - -my $wanted_transcripts - = Bio::EnsEMBL::Utils::igi_utils::read_transcript_ids_from_summary(\*SUMMARY); -# all igi's in one big hash - -my $blurp = blurp(); -print $blurp; -print <<MOREBLURP -### Merged GTF file based on summary file $summary; see the header of that -### file to know how many sources had to have predicted this gene in order -### to 'make' it into this file -MOREBLURP - ; - -GTF_LINE: -while (<>) { - # taken largely from Bio::EnsEMBL::Utils::GTF_handler: - s/#.*$//; - next GTF_LINE if /^\s*$/; - chomp; - - my @fields = split "\t", $_; - my ($seq_name, $source, $feature, - $start, $end, $score, - $strand, $phase, $group_field) = @fields; - $feature = lc $feature; - - unless ($group_field) { - warn("line $.: no group field: skipping : '$_'"); - next GTF_LINE ; - } - - # Extract the rest of the information from the 'merged' gtf file. - my ($igi, $gene_name, $native_ids, $transcript_ids, $exon_num, $exon_id) = - Bio::EnsEMBL::Utils::igi_utils::parse_group_field($group_field); - - if (@$transcript_ids == 0 ) { - die("line has no transcript_id: '$_'\n"); - } - - if (@$transcript_ids > 1 ) { # shouldn't really happen ... - warn("line has more than one transcript_id, taking first one: '$_'\n"); - } - my $transcript_id = $$transcript_ids[0]; - - # the summary file has them as "source:id": - unless( $wanted_transcripts->{"$source:$transcript_id"} ) { - next GTF_LINE ; - } - - # found a feature listed as wanted in the summary file; rearrange - # things a bit so the order is more consistent: - @fields = ($seq_name, $source, $feature, - $start, $end, $score, - $strand, $phase); - - my $native_id; - if (@$native_ids == 0 ) { - die("line has no gene_id: '$_'\n"); - } - - if (int(@$native_ids) > 1 ) { # this would be bizarre, but never - # mind - warn("Line with several gene_ids (taking first one): '$_'\n"); - } - $native_id = $ {$native_ids}[0]; - - if (int(@$transcript_ids) > 1 ) { # also bizarre - warn("Line with several transcript_ids (taking first one): '$_'\n"); - } - $transcript_id = $ {$transcript_ids}[0]; - - my $rest = "igi_id \"$igi\"; gene_id \"$native_id\"; "; - $rest .= "gene_name \"$gene_name\"; " if $gene_name; - $rest .= "transcript_id \"$transcript_id\"; "; - $rest .= "exon_id \"$exon_id\"; " if $exon_id; - $rest .= "exon_number $exon_num; " if defined($exon_num); - push @fields, $rest; - print join("\t", @fields), "\n"; - -} # while(<>) - - -### simple log message -sub blurp { - my (@stuff) = ("on", `date`, "by", `whoami`, "@", `hostname`); - foreach (@stuff) {chomp}; - my $s = '### Produced by $Id$ ' . "\n" - . "### run " . join(' ',@stuff) . "\n" - . "### for EnsEMBL (http://www.ensembl.org)\n" - . "### argument(s): ". join(' ', @argv_copy) . "\n"; - foreach (@argv_copy) { $s .= "### ", `ls -l $_`; } - $s .= "###\n"; - $s; -} # blurp - diff --git a/scripts/igi/gtfsummary2pep.pl b/scripts/igi/gtfsummary2pep.pl deleted file mode 100755 index ae673ab9b1780095a5b45aaa9595875a776a792c..0000000000000000000000000000000000000000 --- a/scripts/igi/gtfsummary2pep.pl +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/local/bin/perl - -# $Id$ - -# This script takes a gtf summary file and peptide files, and produces an -# igi peptide file. For each igi, it simply takes the longest transcript. - -# Written by Philip lijnzaad@ebi.ac.uk -# Copyright EnsEMBL http://www.ensembl.org - -use strict; -use Getopt::Long; -use Bio::EnsEMBL::Utils::igi_utils; - -my $usage = "$0 options gtf.summary peptidefiles > peptidefile\n"; -my $help; -my @argv_copy = @ARGV; - -&GetOptions( 'h|help' => \$help - ); -die $usage if $help; - -(my $summary = shift) || die $usage; - -open(SUMMARY, "< $summary") || die "Summary file $summary: $!"; - -# put all igi's in one big hash(ref) (a table of of igi to lists of -# native_ids): -my $all_igis = - Bio::EnsEMBL::Utils::igi_utils::read_igis_from_summary(\*SUMMARY); - -my $igis_of_natives = igi_of_native($all_igis); -my $all_peptides={}; - -foreach my $file ( @ARGV) { - warn "reading $file ...\n"; - read_peptide_file($all_peptides, $file, $igis_of_natives); - warn " done\n"; -} - -# my $blurp = blurp(); -# print $blurp; -# print <<MOREBLURP -# ### IPI file {based on: see arguments line) -# MOREBLURP -# ; - -print_peptides($all_igis, $all_peptides); - -# ### simple log message -# sub blurp { -# my (@stuff) = ("on", `date`, "by", `whoami`, "@", `hostname`); -# foreach (@stuff) {chomp}; -# my $s = '### Produced by $Id$ ' . "\n" -# . "### run " . join(' ',@stuff) . "\n" -# . "### for EnsEMBL (http://www.ensembl.org)\n" -# . "### argument(s): ". join(' ', @argv_copy) . "\n"; -# foreach (@argv_copy) { $s .= "### ", `ls -l $_`; } -# $s .= "###\n"; -# $s; -# } # blurp - -# invert the igis hash so we can lookup the igi given a native id: -sub igi_of_native { - my ($all_igis) = @_; - my %igis_of_natives = undef; - foreach my $igi (keys %$all_igis) { - my @natives = @{$all_igis->{$igi}}; - foreach my $nat ( @natives ) { # $nat includes the source prefix, - # e.g. FGENH:S.C16000125 - $igis_of_natives{$nat} = $igi; - } - } - return \%igis_of_natives; -} - -# add peptides all into one big ugly hash -sub read_peptide_file { - my ($peptides, $file, $igis_of_natives) = @_; - my ($kept, $ignored) = (0,0); - - open(IN, $file) || die "$file: $!"; - - local( $/ ) = "\n>"; - PEPTIDE: - while( <IN> ) { - s/>//g; - my $n =(index $_, "\n"); - my $firstline = substr($_, 0, $n); - my $sequence = substr($_, $n+1); - my $seqlen = ($sequence =~ tr /a-zA-Z//); - - my ($native_id, $rest) = ( $firstline =~ /(\S+)(.*)$/); - unless ($native_id) { - die "no native_id for $." - } - - my $igi = $igis_of_natives->{$native_id}; - - if (! defined($igi)) {; # didn't make it into the IGI set - $ignored++; - next PEPTIDE ; - } - $kept++; - push @{$peptides->{$igi}}, [$seqlen, $native_id, $rest, $sequence ]; - } - warn "From $file, kept $kept , ignored $ignored peptides\n"; - close(IN); - return; -} # read_peptide_file - - -sub print_peptides { - my ($igis,$peptides) = @_; - my ($ignored, $kept) = (0,0); - -# $a - my @igis = sort { Bio::EnsEMBL::Utils::igi_utils::by_igi_number($a,$b)} - keys %$igis; - IGI: - foreach my $igi (@igis) { - # (this sorts on the numerical part of the "igi3_" identifier) - my $peps= $peptides->{$igi}; - if (!defined $peps) { - warn "no peptides for $igi\n"; - next IGI; - } - my (@peps) = @$peps; - - my $max = -1; - my $maxpep = undef; - # get the longest (or ENS if equal;-) peptide: - my @all_natives; # - foreach my $pep (@peps) { - my $len = $ {$pep}[0]; - my $native = $ {$pep}[1]; - push(@all_natives, $native); - if ($len > $max || ( $len == $max && $native =~ /^ENS/ ) ) { - $max=$len; - $maxpep = $pep; - } - } - my ($len, $native_id, $rest, $sequence) = @$maxpep; - my @others = grep ( $_ ne $native_id, @all_natives); - my $others = join(',', @others); - my $newheader = ">$igi $native_id; length: $len; others: $others; original: $rest"; - print "$newheader\n$sequence"; - } -} # print_peptides diff --git a/scripts/igi/pep-collate.sh b/scripts/igi/pep-collate.sh deleted file mode 100755 index 16870e01173e89f64c1d93119f0be62079b587b6..0000000000000000000000000000000000000000 --- a/scripts/igi/pep-collate.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/sh -x -# -*- mode: sh; -*- - -# $Id$ - -# simple script to put all peptides files of the various sources in order. -# It collate the peptide files in case they're not one big file, and -# makes the geneid appear as the first word of a '>...' line -# (this is only for ensembl, rest already has it like that). - -# It's all done in this one script (unlike all-collates.sh, which does something -# similar for the raw gtf's, and which calls other scripts) - -### sources="ensembl affymetrix fgenesh" - -echo "not tested, check thoroughly!" >&2 - -source=ensembl -cd $source -origfile=$source.pep.orig -pepfile=$source.pep -[ ! -f $origfile ] && echo "$origfile: Not found">&2 && exit 1 -[ -f $pepfile ] && echo "found $pepfile; not overwriting">&2 && exit 1 -sed '/^>/s/>\(ENSP[0-9]*\) Gene:\(ENSG[0-9]*\)\(.*$\)/>\2 \1\3/' < $origfile \ - | sed '/^>/s/^>/>ENS:/' > $pepfile >$pepfile || exit 2 - -source=affymetrix -cd $source -pepfile=$source.pep -[ -f $pepfile ] && echo "found $pepfile; not overwriting">&2 && exit 1 - -chroms="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y UL NA" -echo "make sure that these are the right chromosomes:" | tee /dev/tty >&2 -echo $chroms >&2 -echo "" > /dev/tty -for chr in "$@"; do - find . $chr -name '*.affymetrix.aa' -exec cat {} \; -done | sed '/^>/s/>\(.*\)\.\([0-9][0-9]*\)$/>AFFY:\1 \1.\2/' > $pepfile -# the pep file has the transcript id, which is gene_id + number. Replace it -# to make it easier for gtfsummary2pep - -source=fgenesh -cd $source -pepfile=$source.pep - -[ -f $pepfile ] && echo "found $pepfile; not overwriting">&2 && exit 1 -for i in chr_pro/*.sgp.pro chr_r_pro/*.sgp.pro; do - cat $i -done | sed '/^>C/s/^>FGENH:C/>S.C/' > $pepfile -# (replaces ">C1000004 chr1 ..." with ">S.C1000004 chr1 ..." everywhere; -# the gtf files have 'S.C...', the pep files 'C....'. Easiest to change here. diff --git a/scripts/igi/remap-sources.sed b/scripts/igi/remap-sources.sed deleted file mode 100755 index 7e9adabdcc9a4e712e9355c8fdda1e50667de244..0000000000000000000000000000000000000000 --- a/scripts/igi/remap-sources.sed +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sed -f -s/Affymetrix/AFFY/ -s/merged/ENS/ -s/src/FGENH/ diff --git a/scripts/igi/stats-from-merge-files.pl b/scripts/igi/stats-from-merge-files.pl deleted file mode 100755 index 7238b3c78ecb08f3d039ce2bd81b36358e302969..0000000000000000000000000000000000000000 --- a/scripts/igi/stats-from-merge-files.pl +++ /dev/null @@ -1,592 +0,0 @@ -#!/usr/local/bin/perl -# $Id$ - -# Copyright EnsEMBL http://www.ensembl.org - -# This script is used for collecting statistics on the gtf merges done by -# gtf_merge.pl (which are typically called from the wrapper -# all-merges.sh), and it is typically called by the all-stats.sh wrapper. - -# The whole objective is to compare gene predictions of different -# 'sources', currently EnsEMBL, Affymetrix and Fgenesh (Softberry Inc), -# and arrive at a consensus 'Initial Gene Index' (igi). The igi's are -# assigned by gtf_merge, and this script just summarizes them. - -# The script has grown a bit unwieldy, but that is because it is so -# convenient to stay inside this script once every feature, gene and igi -# of each prediction is loaded into the hashes-of-hashes-of-hashes of this -# script. - -# For usage, ask author ;-) - -# Written by Philip lijnzaad@ebi.ac.uk - - -use strict; -use Getopt::Long; -use Bio::EnsEMBL::Utils::igi_utils; - - -my $usage = "$0 options < merged-file.gtf > outputfile\n"; -my $help; -my $all_stats; -my $chaining = undef; -my $i2nmapping; -my $n2imapping; -my $gtf_summary; -my $cluster_n; -&GetOptions( - 'stats' => \$all_stats, - 'chaining:s' => \$chaining, - 'gtfsummary:s' => \$gtf_summary, - 'cluster_n:s' => \$cluster_n, - 'igi2native:s' => \$i2nmapping, - 'native2igi:s' => \$n2imapping, - 'h|help' => \$help, - ); -die $usage if $help; - -die "need both cluster_n and gtf_summary or neither" - unless ( defined($gtf_summary) == defined($cluster_n)); - -die "need -cluster_n when doing igi2native or native2igi" - if ( (defined($n2imapping)||defined($i2nmapping)) > - defined($cluster_n)); - - -my @argv_copy = @ARGV; # may get gobbled up by the <> construct. - -my %igis_of_source; # $igis_of_source_{$source}{$igi} = [nfeats, start, - # end, nexons ] -my %natives_of_source; # $natives_of_source{$source}{$native_id} = - # [ nfeats, start, end, nexons ] -my %natives_of_igi; # $natives_of_igi{$source}{$igi}{$native_id} => 1 - -my %transcripts_of_native; # - # $transcripts_of_native{$source}{$native}{$transcript} - # = length of the transcript - -my $blurp = blurp(); - -### just read the complete file into the relevant hashes -GTF_LINE: -while (<>) { - # taken largely from Bio::EnsEMBL::Utils::GTF_handler: - next GTF_LINE if /^#/; - next GTF_LINE if /^\s*$/; - chomp; - - my @fields = split "\t", $_; - my ($seq_name, $source, $feature, - $start, $end, $score, - $strand, $phase, $group_field) = @fields; - $feature = lc $feature; - - unless ($group_field) { - warn("no group field: skipping : '$_'\n"); - next GTF_LINE ; - } - - # Extract the extra information from the final field of the GTF line. - my ($igi, $gene_name, $native_ids, $transcript_ids, $exon_num, $exon_id) = - Bio::EnsEMBL::Utils::igi_utils::parse_group_field($group_field); - - unless ($igi) { - warn("Skipping line with no igi_id: '$_'\n"); - next GTF_LINE; - } - - my $native_id; - if (@$native_ids == 0 ) { - warn("Skipping line with no gene_id(s): '$_'\n"); - next GTF_LINE; - } - - if (int(@$native_ids) > 1 ) { - warn("Line with several gene_ids (taking first one): '$_'\n"); - } - $native_id = $ {$native_ids}[0]; - - if (@$transcript_ids == 0) { - warn("Skipping line with no transcript_id(s): '$_'\n"); - next GTF_LINE; - } - if (int(@$transcript_ids) > 1 ) { - warn("Line with several transcript_ids (taking first one): '$_'\n"); - } - my $transcript_id = $ {$transcript_ids}[0]; - - # keep track of marginals by 'looping' over current one + - # fictional source 'ALL': - foreach my $s ($source, 'ALL') { - - # keep track of number of features, exons, start and end of this igi: - track_extents(\%igis_of_source, $s, $igi, - $seq_name, $start, $end, $strand, $exon_num); - - # as well as this native_id: - track_extents(\%natives_of_source, $s, $native_id, - $seq_name, $start, $end, $strand, $exon_num); - - # keep track of the native gene_ids of a given igi in a given - # source. As always, use a hash for faster collating (i.e. keys - # %($natives_of_igi{$s}{$igi}) gives them all; the value is - # irrelevant - $natives_of_igi{$s}{$igi}{$native_id}++; - - # pointless to keep track of exon statistics; call exon-lengths.awk - # for that. - } # foreach $s - - if ( $feature eq 'exon' ) { - $transcripts_of_native{$source}{$native_id}{$transcript_id} += - ($end - $start); - } -} # while(<>) -### done reading files - -# get rid of 'ALL' (which was added for convenience when gathering stats -# on marginals) -my @all_sources = (grep $_ ne 'ALL', sort keys %igis_of_source); - -my %sources_of_igi= undef; -my @igis_of_n_sources = undef; -&find_overlaps; - -my @native_ids_per_igi_histo = undef; -my @igis_per_chaining_group = undef; -&find_chaining; - -if ($all_stats) { - print_stats(); -} - -if ($chaining) { - print "igi's and native ids of igi's that chain together >= $chaining native ids\n"; - # print out native_id's clumped together in clusters of more than n - foreach my $source ( @all_sources ) { - print_chaining($source, $chaining); - } - print "----\n"; -} - -if ($i2nmapping) { - foreach my $s (@all_sources) { - my $file = "> $i2nmapping.$s"; - open(I2N, $file) || die "can't open $file: $!"; - print_igi_to_native(\*I2N, $s, $cluster_n); - close(I2N); - } -} - -if ($n2imapping) { - foreach my $s (@all_sources) { - my $file = "> $n2imapping.$s"; - open(N2I, $file) || die "can't open $file: $!"; - print_native_to_igi(\*N2I, $s, $cluster_n); - close(N2I); - } -} - -if ($gtf_summary) { - my $file = "$gtf_summary"; - open(OUT, "> $file" ) || die die "$file: $!"; - - my (@stuff) = ("on", `date`, "by", `whoami`, "@", `hostname`); - foreach (@stuff) {chomp}; - print OUT $blurp; - print OUT <<MOREBLURP -## -## GTF summary file in gff format. The source is 'igi3', and only gene -## features are given. Only genes predicted by $cluster_n or more gene -## predictions have been included. The original gene id's have been -## prefixed with <source-name>. The output is sorted by fpc contig id, -## strand, and start. This is not the final IGI3 file; it just summarizes -## the extent of the igi3 clusters. -MOREBLURP - ; - close(OUT); - my $sortcmd="sort -k1,1 -k7,7 -k4,4n "; - open(OUT, "| $sortcmd >> $file") || die "$file: $!"; - gtf_for_igis_predicted_by_n(\*OUT, $cluster_n); -} # if gtf_summary - -### simple log message -sub blurp { - my (@stuff) = ("on", `date`, "by", `whoami`, "@", `hostname`); - foreach (@stuff) {chomp}; - my $s = '### Produced by $Id$ ' . "\n" - . "### run " . join(' ',@stuff) . "\n" - . "### for EnsEMBL (http://www.ensembl.org)\n" - . "### argument(s): ". join(' ', @argv_copy) . "\n"; - foreach (@argv_copy) { $s .= "### ", `ls -l $_`; } - $s .= "###\n"; - $s; -} - -sub print_coord_stats { - my ($min, $max, $avg, - $minfeats, $maxfeats, $avgfeats, - $minexons, $maxexons, $avgexons) = @_; - - print "\tminl=$min, maxl=$max, avgl=$avg\n"; - print "\tminfeats=$minfeats, maxfeats=$maxfeats, avgfeats=$avgfeats\n"; - print "\tminexons=$minexons, maxexons=$maxexons, avgexons=$avgexons\n"; -} - -sub igi_stats { - my ($igi_hash) = @_; - my $huge = 1e308; - - my ( @igis) = keys %$igi_hash; - my $min = $huge; - my $max = -$huge; - my $sum = 0; - my $minfeats = $huge; - my $maxfeats = -$huge; - my $sumfeats = 0; - - my $minexons = $huge; - my $maxexons = -$huge; - my $sumexons = 0; - - my $n =0; - foreach my $igi (@igis) { - my ($nfeats, $start, $end, $nexons) = @{$igi_hash->{$igi}}; - die unless defined($nexons); - my $len = ($end - $start); - $min = $len if $len < $min ; - $max = $len if $len > $max ; - $sum += $len; - - $minfeats = $nfeats if $nfeats < $minfeats ; - $maxfeats = $nfeats if $nfeats > $maxfeats ; - $sumfeats += $nfeats; - - $minexons = $nexons if $nexons < $minexons ; - $maxexons = $nexons if $nexons > $maxexons ; - $sumexons += $nexons; - - $n++; - } - - my ($avg, $avgfeats, $avgexons) = ('none', 'none', 'none'); - if ($n>0) { - $avg = int($sum/$n); - $avgfeats= int($sumfeats/$n); - $avgexons = int($sumexons/$n) - } - - print_coord_stats( $min, $max, $avg, - $minfeats, $maxfeats, $avgfeats, - $minexons, $maxexons, $avgexons); -} - - -### keep track of start,end of a gene, by looking at the lowest start and -### higest end of any of the features. This is used both for igi's and -### native id's -sub track_extents { - my ($extents, $source, $id, - $seq_name, $start, $end, $strand, $exon_num) = @_; - my ($nfeats, $min, $max, $nexons); - - #get previous record of this igi, if any: - if (defined $ {$extents}{$source}{$id}) { - ($nfeats, $min, $max, $nexons) = - @{$ {$extents}{$source}{$id}}[0,1,2,3]; - } else { - ($nfeats, $min, $max, $nexons)= (0, $start, $end, 0); - } - - $min = $start if $start < $min; - $max = $end if $end > $max; - $nfeats++; - $nexons = $exon_num if $exon_num > $nexons; - - # add record back in: - $ {$extents}{$source}{$id} = [$nfeats, $min, $max, $nexons, - $seq_name, $strand ]; -} - -### find out how the sets overlap between each other -sub find_overlaps { - my @all_igis = keys %{$igis_of_source{'ALL'}}; - my $n_igis = int(@all_igis); - - # invert igis_of_source: - foreach my $igi (@all_igis) { - foreach my $source (@all_sources ) { # excluding 'ALL', of course - # does this source contain this igi? - if (defined $ {$igis_of_source{$source}}{$igi} ) { - $sources_of_igi{$igi}{$source}++; # meaning: seen it - } - } - } - -## do histogram: find the igis per overlap group (i.e., n sources) - foreach my $igi (@all_igis) { - my $n = int(keys %{$sources_of_igi{$igi}}); - my $loc = $ {$igis_of_source{'ALL'}}{$igi}; # start,end etc. of this igi - - # $ {$igis_of_n_sources[$n]}{$igi} = $loc - # wrong, this is for those that are in - # exactly two groups, but you want to know the cumulation: - for (my $i=$n; $i>=0; $i--) { - $ {$igis_of_n_sources[$i]}{$igi} = $loc - } - } -} # find_overlaps - -sub find_chaining { - - foreach my $source ('ALL', @all_sources) { - foreach my $igi ( keys % {$igis_of_source{$source}} ) { - my @native_ids = keys %{$natives_of_igi{$source}{$igi}}; - # these are all distinct native id's of this igi in this source; - # collate them in a histogram - my $n = int(@native_ids); - $native_ids_per_igi_histo[ $n ]{$source} ++; - - my $loc = $ {$igis_of_source{'ALL'}}{$igi}; # start,end etc. of this igi - $ {igis_per_chaining_group[ $n ]{$source}}{$igi} = $loc; - } - } -} - -sub print_stats { - print $blurp; # log comment - print "Sources: " , join( ' ', @all_sources), "\n"; - - my $n_sources = int(@all_sources); - - print "number of igi's per source:\n"; - foreach my $source ('ALL', @all_sources) { - my $igi_of_source = $igis_of_source{$source}; - my $num = int(keys %{$igi_of_source}); - print "$source: $num\n"; - } - print "----\n"; - - # compare igi's per source - print "number of igis shared with other source; sharing multiplicity\n"; - my %h = undef; - # headers: - print ' ' - # overlaps with named groups - , map(do {sprintf "%12s ", substr($_,0,6) }, (@all_sources)) - # multiplicity of groups membership: - , map(do {sprintf " >=%2d ", 2+$_ } - , reverse ( 0 .. (int(@all_sources)-2))) - , " unshared ", - , " total\n"; - -SOURCE1: - foreach my $source1 ('ALL', @all_sources) { - my @igis_of_source1 = keys %{$igis_of_source{$source1}}; - my $source2; - - printf "%6s ", $source1; - SOURCE2: - my ($overlap, $tot); - my %is_shared=undef; - foreach $source2 (@all_sources) { - ($overlap, $tot) = (0,0); - foreach my $igi (@igis_of_source1) { - if (defined $igis_of_source{$source2}{$igi}) { - $overlap++; - # keep track of how many sources have this particular igi: - $is_shared{$igi}++; # if $source1 ne $source2; - } - $tot++; - } - if ($source1 eq $source2) { - print " -- "; - } else { - printf "%6d (%2d%%) ", $overlap, 100.0*$overlap/$tot; - } - } - # print out the sharing multiplicity - my ($from, $to) = (int(@all_sources), 2); - for(my $i =$from; $i>= $to; $i--) { - my $num = int(grep( $_ >= $i, values %is_shared)); - printf "%6d (%2d%%) ", $num, 100.0*$num/$tot; - } - my $unshared = - int(grep( $_ == 1, values %is_shared)); # those seen exactly once - # are in just one source, - # so are not shared - printf "%6d (%2d%%) %6d\n", $unshared, 100.0*$unshared/$tot, $tot; - } # for source1 - - print "----\n"; - - ### igi statistics per source - foreach my $source ('ALL', @all_sources) { - print "igi stats on $source:\n"; - igi_stats($igis_of_source{$source}); - } - print "----\n"; - - ### see to what extent the clusters grow as they are - ### shared by an increasing number of sources: - print "igi stats per cluster group:\n"; - for(my $i = 1; $i<=$n_sources; $i++) { - print "those in $i sources:\n"; - igi_stats($igis_of_n_sources[$i]) ; - } - print "----\n"; - - ### output the chaining: the extent to which native gene_ids get - ### chained together by the igi clustering - print "gene id chaining: histogram of native gene_ids per igi, and igi statistics on this:\n"; - foreach my $source ('ALL', @all_sources) { - print "source $source:\n"; - for (my $i=0; $i< int(@native_ids_per_igi_histo); $i++ ) { - my $n = $native_ids_per_igi_histo[ $i ]{$source}; - if ($n > 0) { - print "\t$i: $n\n"; - igi_stats( $ {igis_per_chaining_group[ $i ]}{$source}) ; - } - } - } - print "----\n"; -} # all_stats - -# print out native_id's clumped together in clusters of more than n -sub print_chaining { - my ($source, $n)=@_; - - print "source $source:\n"; - for (my $i=$n; $i< int(@native_ids_per_igi_histo); $i++) { - my $num = $native_ids_per_igi_histo[ $i ]{$source}; - if ( $num ) { - print " chaining together $i native ids ( $num cases) :\n"; - my $igi_hash = $ {igis_per_chaining_group[ $i ]}{$source}; - foreach my $igi ( keys %$igi_hash) { - my ($start,$end, $seq, $strand) - = @{$igis_of_source{$source}{$igi}}[1,2,4,5]; - print " $igi [ $seq\t$start\t$end\t$strand ]:\n"; - print_natives($source, $igi); - print " linked by:\n"; - foreach my $s ( @all_sources ) { - next if $s eq $source; - print_natives($s, $igi, " "); - } - } - } - } -} # print_chaining - -# print out the native id's of this igi, with coordinates, and sorted by -# coordinate -sub print_natives { - my ($source, $igi, $indent) = @_; - - my (@native_coords); - foreach my $nat ( keys %{$natives_of_igi{$source}{$igi}} ) { - my ($start,$end) = @{$natives_of_source{$source}{$nat}}[1,2]; - push @native_coords, [$nat, $start, $end]; - } - my $n; $n++; - foreach my $nat (sort sort_native @native_coords) { - my ($id, $start, $end) = @{$nat}; - print " $indent$source $id [ $start\t$end ]\n"; - } -} # print natives - -# for sorting id's by start/stop coords. This is a bit hairy, to get the -# start exon's first, and end exons last ... -sub sort_native { - my ($starta, $enda) = @{$a}[1,2]; - my ($startb, $endb) = @{$b}[1,2]; - - my $n; - $n = $starta <=> $startb; - return $n if $n; - $n = $enda <=> $endb; - return $n if $n; - return $$a[0] cmp $$b[0]; # i.e., alphabetically -} - - -## print file that maps igi to native -sub print_igi_to_native { - my($OUT, $source, $cluster_n) = @_; - foreach my $igi (sort - { Bio::EnsEMBL::Utils::igi_utils::by_igi_number($a,$b)} - keys %{$igis_of_source{$source}}) { - - if (defined $ {$igis_of_n_sources [ $cluster_n ]}{$igi} ) { - print $OUT "$igi " - , join(' ', (sort keys %{$natives_of_igi{$source}{$igi}})) - , "\n"; - } else { warn "i2n: $igi: not in $cluster_n sources\n" } - } -} # print_igi_to_native - -## print file that maps native to igi: -sub print_native_to_igi { - my($OUT, $source, $cluster_n) = @_; - - my %igis_of_native = undef; # have to invert first: - - foreach my $igi (sort - { Bio::EnsEMBL::Utils::igi_utils::by_igi_number($a,$b)} - keys %{$igis_of_source{$source}}) { - - foreach my $nat (keys %{$natives_of_igi{$source}{$igi}}) { - $igis_of_native{$nat}=$igi; - } - } - - foreach my $nat (sort keys %igis_of_native) { - my $igi = $igis_of_native{$nat}; - if (defined $ {$igis_of_n_sources [ $cluster_n ]}{$igi} ) { - print $OUT "$nat $igi\n"; - } else { warn "n2i: $igi: not in $cluster_n sources\n" } - } -} # print_native_to_igi - -### print a gtf file for all igi's that are predicted by N sources. For -### now, just print start and end, nothing else. I.e., don't give any -### native exons or so: -sub gtf_for_igis_predicted_by_n { - my ($OUT, $n) = @_; - my $newsource = 'igi3'; - my $feature='gene'; - my $score = 0; - my $phase = '.'; - - foreach my $igi (sort - { Bio::EnsEMBL::Utils::igi_utils::by_igi_number($a, $b)} - keys %{$igis_of_n_sources[$n]}) { - my ($nfeats, $min, $max, $nexons, $seq_name, $strand ) - = @{$igis_of_source{'ALL'}{$igi}}; - my @fields =($seq_name, $newsource, $feature, $min,$max, $score, $strand, $phase); - - # add the ids: - my $rest ="igi_id \"$igi\"; "; - SOURCE: - foreach my $source (sort @all_sources) { - foreach my $nat_id (sort keys %{$natives_of_igi{$source}{$igi}}) { - $rest .= "gene_id \"$source:$nat_id\"; "; - - ## find id of the longest transcript: - my $max = -1; my $max_tid=undef; - my @tids = keys %{$transcripts_of_native{$source}{$nat_id}}; -# warn "tids: @tids" if @tids > 2; - foreach my $tid ( @tids ) { - my $len = $transcripts_of_native{$source}{$nat_id}{$tid}; - if ($len > $max) { - $max_tid = $tid; - $max = $len; - } - } - $rest .= "transcript_id \"$source:$max_tid\"; "; - } # foreach $nat - } # source - push(@fields, $rest); - print $OUT join("\t", @fields), "\n"; - } # foreach $igi -} # gtf_for_igis_predicted_by_n