Skip to content
Snippets Groups Projects
Commit df195ecb authored by Simon Potter's avatar Simon Potter
Browse files

parse final AGPs

parent 9d69cb05
No related branches found
No related tags found
No related merge requests found
use strict;
# finished chromosome AGPs -> ensembl assembly/chromosome tables
# author: simon potter, 04/03
# (c) ebi/grl 2003
# takes inputs:
# dump of ensembl raw contig names/internal IDs ('e_ctg.dat')
# AGP files in $agp_dir/human_*.agp
# outputs 3 files:
#
# static.dat - tab delimited assembly table
# chrom.dat - tab delimited chromosome table
# idlist.txt - list of contig names and internal IDs used
# "strategy":
#
# parse list of ensembl raw contig coords and store in uberhash;
# go through AGP linewise and look for matching raw contigs
#
# does _some_ checks:
# each raw contig used no more than once
# no gaps in AGP
# no unfinished sequence
# Note: the chromosome_id in assembly and chromosome tables starts
# from 1. Care is needed if loading these tables into a database
# where another assembly is already stored.
my (@agps);
my $gptype = "Human_finished";
my $raw = "e_ctg.dat";
my $agp_dir = "./fin_agp";
my %clone;
my %used_ctg_ids;
my @chrs = qw/
1 2 3 4 5 6 7 8 9 10
11 12 13 14 15 16 17 18 19 20
X Y
/;
my @chr_info;
my $chr_id = 1;
open CTG, "< $raw" or die "Can't open raw contig file $raw";
open ASS, "> static.dat" or die "Can't open static.dat for output";
open INF, "> idlist.txt" or die "Can't open idlist.txt for output";
open CHR, "> chr.dat" or die "Can't open chr.dat for output";
while (<CTG>) {
my ($name, $internal_id) = split;
my ($sv, $start, $end) = $name =~ /(.+)\.(\d+)\.(\d+)/;
die "Bad contig $name" unless $sv && $start && $end;
$clone{$sv} ||= [];
push @{$clone{$sv}}, {
name => $name,
id => $internal_id,
start => $start,
end => $end
};
}
close CTG;
foreach my $chr (@chrs) {
my $agp = "$agp_dir/human_$chr.agp";
my $global_end;
my $prev_end;
unless (-f $agp) {
print "Not found AGP for chr $chr\n";
next;
}
open AGP, "< $agp" or die "Canna open AGP file $agp";
while (<AGP>) {
my ($chr_name, $chr_start, $chr_end, $status) = (split)[0, 1, 2, 4];
die "Unexpected gap: $_" if ($prev_end && $prev_end + 1 != $chr_start);
$prev_end = $chr_end;
next if $status eq "N";
die "Unfinished seq! $_" unless $status eq "F";
die "Bad chr name" unless $chr_name eq "Human-chr$chr";
$global_end = $chr_end;
my ($sv, $clone_start, $clone_end, $clone_ori) = (split)[5, 6, 7, 8];
die "Invalid SV: $_" unless $sv =~ m{^\w+\.\d+$};
unless (exists $clone{$sv}) {
print STDERR "No clone found: $sv\n";
next;
}
die "Raw contig and chr coords don't match: $_"
unless ($clone_end - $clone_start) == ($chr_end - $chr_start);
if ( $clone_ori eq '+') { $clone_ori = 1; }
elsif ( $clone_ori eq '-') { $clone_ori = -1; }
else { die "Bad clone orientation $clone_ori: $_"; }
my $matched = 0;
foreach my $raw_ctg (@{$clone{$sv}}) {
my $start = $raw_ctg->{'start'};
my $end = $raw_ctg->{'end'};
# skip if this raw contig outside clone
next if ($clone_start < $start) || ($clone_start > $end);
if ($end < $clone_end) {
print STDERR $raw_ctg->{'name'}, " too short to fit into ",
"$clone_start:$clone_end\n";
next;
}
$matched = 1;
my $ctg_start = $clone_start - $start + 1;
my $ctg_end = $clone_end - $start + 1;
if (defined $used_ctg_ids{$raw_ctg->{'id'}}) {
print STDERR "Warning: ", $raw_ctg->{'name'},
" used more than once\n";
}
else {
$used_ctg_ids{$raw_ctg->{'id'}} = 1;
}
print ASS join("\t",
$chr_id,
$chr_start,
$chr_end,
$sv,
$clone_start,
$clone_end,
1,
# $nt_ctg,
# $nt_start,
# $nt_end,
# $nt_ori,
$raw_ctg->{'id'},
$ctg_start,
$ctg_end,
$clone_ori,
$gptype
), "\n";
print INF $raw_ctg->{'name'}, "\t", $raw_ctg->{'id'}, "\n";
}
if ($matched == 0) {
print STDERR "Can't fit clone $sv: $clone_start - $clone_end\n";
}
}
close AGP;
push @chr_info, [ $chr_id, $chr, $global_end ];
$chr_id++;
}
foreach my $c (@chr_info) {
print CHR join("\t",
$c->[0],
$c->[1],
0,
0,
0,
$c->[2],
), "\n";
}
close CHR;
__END__
human-*.agp:
Human-chr1 1 50000 1 N 50000
Human-chr1 50001 50616 2 F AP006221.1 36116 36731 -
Human-chr1 50617 217280 3 F AL627309.15 241 166904 +
ctg.dat (name, contig_id):
AB000381.1.1.35863 1
AB012723.1.1.40850 11
AB015355.1.1.43999 24
AB015752.1.1.116160 25
AB016897.1.1.331211 26
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment