Commit 5117319c authored by cvs2git's avatar cvs2git
Browse files

This commit was manufactured by cvs2svn to create tag 'sunday-before-chr22'.

Sprout from master 2000-06-25 18:21:32 UTC James Gilbert <jgrg@sanger.ac.uk> 'Removed line which excluded all but "ucsc" overlaps'
Delete:
    misc-scripts/ePCR_snapshot/snapshot.pl
    misc-scripts/fpc/ALLmap
    misc-scripts/fpc/fpc_parse.pl
    misc-scripts/golden_path/gp_contig_mismatch.pl
    misc-scripts/golden_path/gp_incons_test.pl
    misc-scripts/golden_path/gp_parse.pl
parent e6ccd1cf
# access timdb directly and produce a tab delimited file
# for ePCR hits
# uses timdb freeze dataset
# queries the clone directories directly
# outputs tab delimited on STDOUT
# contigname \t seq_start \t seq_end \t strand\t 1 \t
# length of the hit \t HID
# and some progress info on STDERR
# to read this into ensembl you generate a helper table
# create table map_feature ( contig_name varchar(40) DEFAULT '' NOT NULL, seq_start int(11) DEFAULT '0' NOT NULL,seq_end int(11) DEFAULT '0' NOT NULL, strand int(11), hstart int(11), hend int(11), hid varchar(40) DEFAULT '' NOT NULL, PRIMARY KEY (contig_name,hid,seq_start,seq_end));
# and do a
# load data infile '/tmp/snapshot.txt' ignore into table map_feature( contig_name, seq_start, seq_end, strand, hstart, hend, hid );
# then you have to generate a row in analysis table
# and finally insert into feature select .... from map_feature, contig ...
# to find the internal_ids to the contig names ....
use Bio::EnsEMBL::TimDB::Obj;
use Bio::EnsEMBL::Analysis::ensConf;
use NDBM_File;
# Bio::EnsEMBL::Analysis::ensConf->import;
print join( " ",%$UNFIN_DATA_ROOT_CGP),"\n";
$timdb = Bio::EnsEMBL::TimDB::Obj->new
( -freeze => 1, -nogene => 1 );
print STDERR ( "Entering get_all_Clone_id\n" );
@cloneIds = $timdb->get_all_Clone_id;
print STDERR ( "Leaving get_all_Clone_id\n" );
# print join( " ", @cloneIds ),"\n";
print STDERR ( "Found ", scalar( @cloneIds ), " clones.\n" );
*LOG = *STDOUT;
print STDERR ( "Reading NDBM files.\n" );
for $cgp ( 'EU', 'EF', 'SU', 'SF' ) {
my $clone;
$cgp_dir = $UNFIN_DATA_ROOT_CGP->{$cgp};
$contig_dbm_file = "$cgp_dir/unfinished_ana.dbm";
my %unfin_contig;
unless(tie(%unfin_contig,'NDBM_File',$contig_dbm_file,O_RDONLY,0644)){
die("Error opening contig dbm file $contig_dbm_file");
}
while (($key,$val) = each %unfin_contig) {
( $clone ) = ( $key =~ /([^.]+)./ );
push( @{$cloneContigs->{$clone}}, $key );
}
}
print STDERR ("Done reading NDBM files\n" );
for my $clone (@cloneIds) {
$clonecount++;
my @moreInfo = $timdb->get_id_acc( $clone );
$cgp = $moreInfo[2];
$clonedir = $UNFIN_DATA_ROOT_CGP->{$cgp}."/data/".$moreInfo[1];
@clonefiles = map( "$clonedir/$_.ePCR", @{$cloneContigs->{$clone}} );
# glob( "$clonedir*ePCR" );
# print join( "\n",@clonefiles),"\n";
# next;
for my $clonefile (@clonefiles) {
if ( ! open( EPCR, $clonefile )) {
print STDERR ( "Couldnt open $clonefile\n" );
next;
}
while( <EPCR> ) {
( $contigname, $range, $accnum ) = split;
if( ! ( $range =~ /^(\d+)\.\.(\d+)$/ )) {
next;
} else {
( $start, $end ) = ( $1, $2 );
$strand = 1;
if( $end < $start ) {
$strand = -1;
( $start, $end ) = ( $end, $start );
}
$length = $end-$start+1;
}
$contigname =~ s/([^.]*)\./$clone./g;
$hitcount++;
print LOG ( "$contigname\t$start\t$end\t$strand\t1\t$length\t$accnum\n" );
if(( $clonecount % 100 ) == 0 ) {
# exit;
print STDERR ( "$hitcount hits on $clonecount clones.\n" );
}
}
close( EPCR );
}
}
exit;
This diff is collapsed.
# input, the slightly edited ALLmap file from an Email
# output
# information about Fpc_Clone table entries and
# Fpc_Contig table entries
$allfile = shift;
open( FH, $allfile ) or die( "Couldnt open input file" );
open( CO, ">fpc_contig.txt" ) or
die( "Couldnt open fpc_contig.txt output file." );
open( CL, ">fpc_clone.txt" ) or
die( "Coulnt open fpc_clone.txt output file." );
$cloneHash = {};
$contigHash = {};
$currentContigNum;
%chrlength = ( "1", 263000058, "10", 143999990, "11", 144000020, "12", 142999999, "13", 113999999, "14", 108999990, "15", 106000020, "16", 98000000, "17", 92000000, "18", 85000029, "19", 66999999, "2", 254999998, "20", 72000009, "21", 50000000, "22", 50000000, "3", 214000030, "4", 203000002, "5", 194000011, "6", 183000009, "7", 171000011, "8", 155000010, "9", 144999990, "X", 164000011, "Y", 50810739 );
while( <FH> ) {
/^\#/ && next;
$clone = {};
if( /^start.*\.human\.(\S+) (\S+)/ ) {
$chrom = $1;
if( $2 eq "ORDERED" ) {
$ordered = 1;
} else {
$ordered = 0;
}
next;
}
if( /^@\s*$/ ) {
if( defined $lastContig ) {
# contig_done( $chrom, $lastContig, $ordered );
$lastContig = undef;
}
next;
}
if( /^end.*\.human\./ ) {
next;
}
if( /^@\s+(\S+)\s+(\S+)/ ) {
# contig done
$contigHash->{$1}->{start} = $2;
# contig_done( $chrom, $lastContig, $ordered, $1, $2 );
$lastContig = undef;
next;
}
chomp;
split;
if(/^COMMITTED/) {
$embl = 0;
} else {
$embl = 1;
$clone->{embl} = $_[0];
if( defined $cloneHash->{$_[0]} ) {
print("Double clone $_[0]\n" )
}
$cloneHash->{$_[0]} = 1;
}
if(!( $_[1] =~ /UNK/ || $_[1] =~ /multi/ )) {
$clone->{name} = $_[1];
}
$clone->{institute} = $_[2];
$clone->{contig} = $_[3];
$lastContig = $_[3];
$clone->{start} = $_[4]*1000;
if( $embl ) {
$clone->{length} = $_[6];
}
submit_clone( $chrom, $ordered, $clone );
$clone = undef;
}
for (keys %$contigHash ) {
if( $contigHash->{$_}->{ordered} ) {
push( @{$chrom{$contigHash->{$_}->{chrom}}}, $contigHash->{$_} );
}
}
for $chrom (keys %chrom) {
my @newlist = sort {$a->{start} <=> $b->{start}} @{$chrom{$chrom}};
$chrom{$chrom} = \@newlist;
recalc_starts_standard( $chrom, $chrom{$chrom} );
}
for (keys %$contigHash ) {
$contigRef = $contigHash->{$_};
if( ! defined $contigRef->{name} ) {
print STDERR ("Key $_ empty contig.\n" );
next;
}
print CO ( $contigRef->{num},"\t" );
print CO ( $contigRef->{name},"\t" );
if( ! defined $contigRef->{start} ) {
print CO ( "-1\t" );
} else {
print CO ( $contigRef->{start_bp},"\t" );
}
print CO ( $contigRef->{length},"\t" );
print CO ( $contigRef->{chrom},"\n" );
}
sub contig_done {
my $chrom = shift;
my $name = shift;
my $ordered = shift;
my $moreName = shift;
my $global = shift;
if( defined $moreName &&
defined $global ) {
$contigHash->{$moreName}->{start} = $global;
}
$contigHash->{$name}->{chrom} = $chrom;
if( $ordered && ! defined $contigHash->{$name}->{start} ) {
print( "$name has no start pos.\n" );
}
# print( $name, " ", $contigHash->{$name}->{length}," ",
# $contigHash->{$name}->{start},"\n" );
}
sub submit_clone {
my $chrom = shift;
my $ordered = shift;
my $clone = shift;
if( ! defined $contigHash->{$clone->{contig}} ) {
$contigNum = $currentContigNum++;
$contigHash->{$clone->{contig}}->{num} = $contigNum;
$contigHash->{$clone->{contig}}->{name} = $clone->{contig};
$contigHash->{$clone->{contig}}->{chrom} = $chrom;
$contigHash->{$clone->{contig}}->{ordered} = $ordered;
}
my $contiglen = $clone->{start} + $clone->{length};
if( !defined $contigHash->{$clone->{contig}}->{length} ) {
$contigHash->{$clone->{contig}}->{length} = $contiglen;
} else {
if( $contiglen > $contigHash->{$clone->{contig}}->{length} ) {
$contigHash->{$clone->{contig}}->{length} = $contiglen;
}
}
print CL ( $clone->{embl},"\t" );
print CL ( $clone->{name},"\t" );
print CL ( $clone->{institute},"\t" );
print CL ( $contigHash->{$clone->{contig}}->{num},"\t" );
print CL ( $clone->{length},"\t" );
print CL ( $clone->{start},"\n" );
}
sub recalc_starts_standard {
my $chrom = shift;
my $contigListRef = shift;
my @contigList = @$contigListRef;
my $maxlen = $chrlength{"$chrom"};
print "Chromosome $chrom $maxlen\n";
my $lastContig = $contigList[ $#contigList ];
my $factor =( $maxlen - $lastContig->{length} ) / $lastContig->{start};
for my $contigRef ( @contigList ) {
$contigRef->{start_bp} = int( $contigRef->{start}*$factor );
# print( $contigRef->{start_bp}," ",
# ($contigRef->{start_bp} + $contigRef->{length}), "\n" );
}
}
# check innconsistencies between contig offsets and golden path
# when a fragment of the golden path overlaps ends of contigs in
# a way that its not contained completely by the sequenced contig.
# takes three arguments.first is contig file
# this has rows containing tab delimited
# contig_name \t offset in embl \t length of the contig \t dna_id in ensembl
# second argument is the directory where the .agp files live.
# A find for all files ending in .agp is run over it.
use File::Find;
$contig_file = shift;
$agp_dir =shift;
open( CF, $contig_file ) or die( "Couldnt open contigfile." );
print STDERR ( "Start reading contig.txt.\n" );
while( <CF> ) {
my ( $contig, $offset, $length ) = split;
( $clone ) = $contig =~ /^([^\.]+)\./;
$clone_offset_hash{$clone}->{$contig} = [ $offset, $length ];
}
close CF;
print STDERR ( "Contig.txt read.\n" );
sub append_agp {
/\.agp$/ &&
push( @filelist, $File::Find::name );
}
sub read_agp {
$filename = shift;
local *FH;
open( FH, $filename ) or die( "Cant open file $filename." );
while( <FH> ) {
/^#/ && next;
@list = split( /\t/, $_ );
if( $list[4] =~ /[^N]/ ) {
$readLines++;
( $clone ) = $list[5] =~ /^([^\.]+)\./;
$start = $list[6];
$end = $list[7];
if( defined( $clone_offset_hash{$clone} )) {
$known_clone++;
check_line( $clone, $start, $end, $_ );
} else {
$unknown_clone++;
}
}
}
close FH;
}
sub check_line {
my ( $clone, $start, $end, $line ) = @_;
my $clonehashref = $clone_offset_hash{$clone};
my ( $offset, $length );
for my $contig ( keys %{$clonehashref} ) {
( $offset, $length ) = @{$clonehashref->{$contig}};
# match check
if( $start >= $offset && ($end <= ( $offset+$length-1))) {
if( defined $contigUsage{$contig} ) {
push( @{$contigUsage{$contig}}, $line );
$fragmented{$clone}->{$contig} = 1;
$double_used_contigs++;
} else {
push( @{$contigUsage{$contig}}, $line );
}
}
# partly match == mismatch
if((( $start < $offset ) && ( $end >= $offset )) ||
(( $start <= ($offset+$length-1)) && ( $end > ($offset+$length-1)))) {
push( @{$mismatched{$clone}}, $line );
$mismatched++;
return;
}
}
}
sub clone_hash_string {
my $clonename = shift;
my $hashref = $clone_offset_hash{$clonename};
my $result;
my @keylist_sorted;
$result .= "EnsEMBL Freeze Clone $clonename\n";
@keylist_sorted = sort { $hashref->{$a}[0] <=> $hashref->{$b}[0] } ( keys %{$hashref} );
for my $contig ( @keylist_sorted ) {
$result .= "$contig ";
my ( $offset, $length );
( $offset, $length ) = @{$hashref->{$contig}};
$result .= "$offset ".($offset+$length-1)."\n";
}
return $result;
}
find( \&append_agp, $agp_dir );
print STDERR ( "Reading ", $#filelist+1, " files.\n" );
$count = 0;
for $filename (@filelist) {
read_agp( $filename );
$count++;
if( $count % 100 == 0 ) {
# last;
print STDERR ( "$count read.\n" );
}
}
# fragmented
print( "The folllowing lines fragmented or mismatched contigs:\n\n" );
for my $clone ( keys %fragmented ) {
for my $contig ( keys %{$fragmented{$clone}} ) {
for my $line ( @{$contigUsage{$contig}} ) {
$bogusLines++;
print $line;
}
}
if( defined $mismatched{$clone} ) {
for my $line ( @{$mismatched{$clone}} ) {
print $line;
$bogusLines++;
}
}
print ( "-----------\n" );
print clone_hash_string( $clone );;
print ( "-----------\n\n" );
}
for my $clone ( keys %mismatched ) {
if( ! defined $fragmented{$clone} ) {
for my $line ( @{$mismatched{$clone}} ) {
print $line;
$bogusLines++;
}
print ( "-----------\n" );
print clone_hash_string( $clone );;
print ( "-----------\n\n" );
}
}
print qq
(From $readLines read lines, $known_clone lines were in Freeze.
$bogusLines of them were bogus.
$mismatched lines were mismatches.
$unknown_clone lines had unknown clones.
);
exit;
# find inconsistencies in .agp files.
# report when two fragments in agp overlap the same embl sequence.
# argument: the agp subdirectory. A find is done over it.
use File::Find;
$agp_dir =shift;
sub append_agp {
/\.agp$/ &&
push( @filelist, $File::Find::name );
}
sub read_agp {
$filename = shift;
local *FH;
open( FH, $filename ) or die( "Cant open file $filename." );
my %clonestore;
my $clone;
while( <FH> ) {
/^#/ && next;
@list = split( /\t/, $_ );
if( $list[4] =~ /[^N]/ ) {
$readLines++;
( $clone ) = $list[5] =~ /^([^\.]+)\./;
$start = $list[6];
$end = $list[7];
if( $list[1] > $list[2] ) {
print $_;
$reverse++;
}
next;
push( @{$clonestore{$clone}}, [ $start, $end, $_] );
}
}
close FH;
return;
my ( $s1, $s2, $e1, $e2, $l1, $l2 );
# check overlaps in %clonestore
for my $clone ( keys %clonestore ) {
my @frags = @{$clonestore{$clone}};
for( my $i = 0; $i <= $#frags; $i++ ) {
for( my $j = $i+1; $j <= $#frags; $j++ ) {
( $s1, $e1, $l1 ) = @{$frags[$i]};
( $s2, $e2, $l2 ) = @{$frags[$j]};
if(( $s2 >= $s1 && $s2 <=$e1 ) ||
( $s1 >= $s2 && $s1 <=$e2 )) {
print $l1,$l2,"\n";
$collisionCount++
}
}
}
}
}
find( \&append_agp, $agp_dir );
print STDERR ( "Reading ", $#filelist+1, " files.\n" );
$count = 0;
for $filename (@filelist) {
read_agp( $filename );
$count++;
if( $count % 100 == 0 ) {
# last;
print STDERR ( "$count read.\n" );
}
}
print( "Reverse count $reverse\n" );
print( "\nFound $collisionCount contradicting line pairs\n" );
print( "in $readLines read lines.\n" );
exit;
# Script to parse the golden path files ".agp"
# into ensembl overlap table
# takes three arguments.first is contig file
# this has rows containing tab delimited
# contig_name \t offset in embl \t length of the contig \t dna_id in ensembl
# second argument is the directory where the .agp files live.
# A find for all files ending in .agp is run over it.
# third argument is a file with invalid clonenames
# one per line
use File::Find;
$contig_file = shift;
$agp_dir =shift;
$bogus_clones_file = shift;
open( CF, $contig_file ) or die( "Couldnt open contigfile." );
print STDERR ( "Start reading contig.txt.\n" );
while( <CF> ) {
my ( $contig, $offset, $length, $dna_id ) = split;
( $clone ) = $contig =~ /^([^\.]+)\./;
$clone_offset_hash{$clone}->{$contig} = [ $offset, $length, $dna_id ];
}
close CF;
print STDERR ( "Contig.txt read.\n" );
open( BC, $bogus_clones_file ) or die( "Couldnt open $bogus_clones_file" );
print STDERR ( "Reading bogus clones\n" );
while( <BC> ) {
chomp;
$bogus_clones{$_} = 1;
}
close( BC );
sub append_agp {
/\.agp$/ &&
push( @filelist, $File::Find::name );
}
sub read_agp {
my $filename = shift;
local *FH;
open( FH, $filename ) or die( "Cant open file $filename." );
my ( $start, $end );
my $firstFrag = 1;
my $lastFrag = undef;
my $thisFrag = undef;
print ( "$filename\n" );
while( <FH> ) {
/^#/ && next;
chomp;
@list = split( /\t/, $_ );
if( $list[4] =~ /[^N]/ ) {
$readLines++;
( $clone ) = $list[5] =~ /^([^\.]+)\./;