From d124a848c29478ed048b147435dc90d9e70bd107 Mon Sep 17 00:00:00 2001 From: Susan Fairley <sf7@sanger.ac.uk> Date: Fri, 27 Aug 2010 10:02:06 +0000 Subject: [PATCH] Updated to deal with cases where not all of the patch scaffold is included in the toplevel. --- .../assembly_patches/assembly_patch_load.pl | 103 +++++++++++++++++- 1 file changed, 98 insertions(+), 5 deletions(-) diff --git a/misc-scripts/assembly_patches/assembly_patch_load.pl b/misc-scripts/assembly_patches/assembly_patch_load.pl index ad08519f28..e249994a85 100644 --- a/misc-scripts/assembly_patches/assembly_patch_load.pl +++ b/misc-scripts/assembly_patches/assembly_patch_load.pl @@ -142,6 +142,10 @@ my %key_to_index; my %name_to_seq_id; my %seq_id_to_start; my %name_to_type; +my %seq_id_to_alt_scaf_start; +my %seq_id_to_alt_scaf_stop; +my %seq_id_to_stop; +my %seq_id_to_strand; while (<TYPE>) { chomp; @@ -200,6 +204,18 @@ while(<TXT>){ $name_to_seq_id{$alt_name} = $max_seq_region_id; $name_to_seq_id{$alt_acc} = $max_seq_region_id; $seq_id_to_start{$max_seq_region_id} = $arr[$key_to_index{'parent_start'}]; + $seq_id_to_alt_scaf_start{$max_seq_region_id} = $arr[$key_to_index{'alt_scaf_start'}]; + $seq_id_to_alt_scaf_stop{$max_seq_region_id} = $arr[$key_to_index{'alt_scaf_stop'}]; + $seq_id_to_stop{$max_seq_region_id} = $arr[$key_to_index{'parent_stop'}]; + if ($arr[$key_to_index{'ori'}] eq "+"){ + $seq_id_to_strand{$max_seq_region_id} = 1; + } + elsif($arr[$key_to_index{'ori'}] eq "-"){ + $seq_id_to_strand{$max_seq_region_id} = -1; + } + else{ + print "Problem with strand.\n"; + } my $exc_len = $arr[$key_to_index{'parent_stop'}]-$arr[$key_to_index{'parent_start'}]; my $new_length = $arr[$key_to_index{'alt_scaf_stop'}]-$arr[$key_to_index{'alt_scaf_start'}]; @@ -230,8 +246,8 @@ while(<TXT>){ print SQL "insert into assembly_exception (assembly_exception_id, seq_region_id, seq_region_start, seq_region_end, exc_type, exc_seq_region_id, exc_seq_region_start, exc_seq_region_end, ori)\n"; print SQL "\tvalues($max_assembly_exception_id, $max_seq_region_id, ", - ($arr[$key_to_index{'alt_scaf_start'}]+$arr[$key_to_index{'parent_start'}])-1, ",", - ($arr[$key_to_index{'alt_scaf_stop'}]+$arr[$key_to_index{'parent_start'}])-1, ",", + $arr[$key_to_index{'parent_start'}], ",", #($arr[$key_to_index{'alt_scaf_start'}]+$arr[$key_to_index{'parent_start'}])-1, ",", + ($arr[$key_to_index{'alt_scaf_stop'}]+$arr[$key_to_index{'parent_start'}])-$arr[$key_to_index{'alt_scaf_start'}], ",", # ($new_length+$arr[$key_to_index{'parent_start'}], ",", "$hap_type, ", $seq_region_id,", ", @@ -272,7 +288,8 @@ while(<MAPPER>){ $get_seq_id_sth->fetch; if(!defined($cmp_seq_id)){ print "Could not get seq_region_id for $contig trying pfetch\n"; - my $out = system("mfetch -d embl -v fasta $contig > ./$contig.fa"); + #prev mfetch -d embl -v fasta $contig > ./$contig.fa + my $out = system("pfetch $contig > ./$contig.fa"); my $contig_seq =""; open(FA,"<./$contig.fa") || die "Could not open file ./$contig.fa\n"; while (<FA>){ @@ -299,8 +316,84 @@ while(<MAPPER>){ die "Could not get seq id for $contig\n\n"; } my $seq_id = $name_to_seq_id{$acc}; - print SQL "insert into assembly (asm_seq_region_id, cmp_seq_region_id, asm_start, asm_end, cmp_start , cmp_end, ori) \n"; - print SQL " values(".$seq_id.", ".$cmp_seq_id.", ".(($seq_id_to_start{$seq_id}+$p_start)-1).", ".(($seq_id_to_start{$seq_id}+$p_end)-1).", $c_start, $c_end, $strand);\n\n"; + + my $asm_start; + my $asm_end; + my $cmp_start; + my $cmp_end; + + $asm_start = ($p_start-($seq_id_to_alt_scaf_start{$seq_id}-1))+($seq_id_to_start{$seq_id}-1); + $cmp_start = $c_start; + $asm_end = ($seq_id_to_start{$seq_id} - $seq_id_to_alt_scaf_start{$seq_id})+$p_end; + $cmp_end = $c_end; + + #for each combination of strand orientations + #alter the chromosome level starts and ends if needed + #then deal with the cases where a contig extends beyond the part of the scaffold included + #in the chromosome/toplevel + #Do leftmost, relative to chrom and then rightmost. + #Cases where the contig is completely off the toplevel are caught at the end + + #if the contig is negative + if($strand == -1 and $seq_id_to_strand{$seq_id} == 1){ + if($seq_id_to_alt_scaf_start{$seq_id}>$p_start){ + $asm_start = $seq_id_to_start{$seq_id}; + $cmp_end = $c_start + ($p_end-$seq_id_to_alt_scaf_start{$seq_id}); + } + if($seq_id_to_alt_scaf_stop{$seq_id}<$p_end){ + $asm_end = $seq_id_to_start{$seq_id} + ($seq_id_to_alt_scaf_stop{$seq_id}-$seq_id_to_alt_scaf_start{$seq_id}); + $cmp_start = $c_end - ($seq_id_to_alt_scaf_stop{$seq_id} - $p_start); + } + }#if the scaffold is negative + elsif($strand == 1 and $seq_id_to_strand{$seq_id} == -1){ + + $asm_start = ($seq_id_to_alt_scaf_stop{$seq_id} - $p_end) + $seq_id_to_start{$seq_id}; + $asm_end = ($seq_id_to_alt_scaf_stop{$seq_id} - $p_start) + $seq_id_to_start{$seq_id}; + + if($seq_id_to_alt_scaf_stop{$seq_id}<$p_end){ + $asm_start = $seq_id_to_start{$seq_id}; + $cmp_end = $c_start + ($p_start - $seq_id_to_alt_scaf_start{$seq_id}); + } + if($seq_id_to_alt_scaf_start{$seq_id}>$p_start){ + $asm_end = $seq_id_to_start{$seq_id} + ($seq_id_to_alt_scaf_stop{$seq_id}-$seq_id_to_alt_scaf_start{$seq_id}); + $cmp_start = $c_end - ($p_end-$seq_id_to_alt_scaf_start{$seq_id}); + } + $strand = -1; + }#if they're both negative + elsif($strand == -1 and $seq_id_to_strand{$seq_id} == -1){ + + $asm_start = ($seq_id_to_alt_scaf_stop{$seq_id} - $p_end) + $seq_id_to_start{$seq_id}; + $asm_end = ($seq_id_to_alt_scaf_stop{$seq_id} - $p_start) + $seq_id_to_start{$seq_id}; + + if($seq_id_to_alt_scaf_stop{$seq_id}<$p_end){ + $asm_start = $seq_id_to_start{$seq_id}; + $cmp_start = $c_end - ($seq_id_to_alt_scaf_stop{$seq_id} - $p_start); + } + if($seq_id_to_alt_scaf_start{$seq_id}>$p_start){ + $asm_end = $seq_id_to_start{$seq_id} + ($seq_id_to_alt_scaf_stop{$seq_id}-$seq_id_to_alt_scaf_start{$seq_id}); + $cmp_end = $c_start + ($p_end - $seq_id_to_alt_scaf_start{$seq_id}); + } + $strand = 1; + }#otherwise they should both be positive + else{ + if($seq_id_to_alt_scaf_start{$seq_id}>$p_start){ + $asm_start = $seq_id_to_start{$seq_id}; + $cmp_start = $c_end - ($p_end-$seq_id_to_alt_scaf_start{$seq_id}); + } + if($seq_id_to_alt_scaf_stop{$seq_id}<$p_end){ + $cmp_end = ($seq_id_to_alt_scaf_stop{$seq_id}-$p_start) + $c_start; + $asm_end = $seq_id_to_start{$seq_id} + ($seq_id_to_alt_scaf_stop{$seq_id}-$seq_id_to_alt_scaf_start{$seq_id}); + } + } + + #if the contig isn't part of the toplevel/chromosome + if(($p_start>$seq_id_to_alt_scaf_stop{$seq_id}) or ($p_end<$seq_id_to_alt_scaf_start{$seq_id})){ + print "WARNING: Contig $contig is not part of chromosome level sequence from $acc.\n"; + } + else{ + print SQL "insert into assembly (asm_seq_region_id, cmp_seq_region_id, asm_start, asm_end, cmp_start , cmp_end, ori) \n"; + print SQL " values(".$seq_id.", ".$cmp_seq_id.", ".$asm_start.", ".$asm_end.", $cmp_start, $cmp_end, $strand);\n\n"; + } } else{ print "GAP of $contig length\n"; -- GitLab