From 193a37239031fa3156175641d65d633d900942ff Mon Sep 17 00:00:00 2001 From: Amonida Zadissa <amonida@ebi.ac.uk> Date: Thu, 2 Sep 2010 15:51:43 +0000 Subject: [PATCH] Incorporating the alt_start_tail and alt_stop_tail in the length calculations, implemented by Steve Searle. --- .../assembly_patches/assembly_patch_load.pl | 191 +++++++----------- 1 file changed, 68 insertions(+), 123 deletions(-) diff --git a/misc-scripts/assembly_patches/assembly_patch_load.pl b/misc-scripts/assembly_patches/assembly_patch_load.pl index e249994a85..fe548f858f 100644 --- a/misc-scripts/assembly_patches/assembly_patch_load.pl +++ b/misc-scripts/assembly_patches/assembly_patch_load.pl @@ -90,13 +90,6 @@ my $get_seq_region_id_sth = $dba->dbc->prepare("select seq_region_id, length fro my $get_seq_id_sth = $dba->dbc->prepare("select sr.seq_region_id from seq_region sr , coord_system cs where sr.name like ? and cs.coord_system_id = sr.coord_system_id and cs.attrib like '%sequence%'"); -#my $insert_seq_region_sth = $dba->dbc->prepare("insert into seq_region (seq_region_id, name, coord_system_id, length) values(?, ?, ?, ?)") || die "Could not prepare seq_region insert sql"; - -#my $insert_dna_sth = $dba->dbc->prepare("insert into dna (seq_region_id, sequence) values (?, ?)") -# || die "Could not prepare insert dna sql"; - -#my $insert_attrib_sth = $dba->dbc->prepare("insert into seq_region_attrib (seq_region_id, attrib_type_id, value) values (?, ?, 1);"; - my $coord_sth = $dba->dbc->prepare("select coord_system_id from coord_system where name like ? order by rank limit 1"); if(!defined($chr_coord_id) or !defined($contig_coord_id)){ @@ -129,7 +122,6 @@ print "toplevel $toplevel, non_ref $non_ref, patch_novel $patch_novel, patch_fix # Store contigs as seq_region + dna. -#open(FASTA, "<".$fasta_file) || die "Could not open file $fasta_file"; open(MAPPER,"<".$mapping_file) || die "Could not open file $mapping_file"; open(TXT,"<".$txt_file) || die "Could not open file $txt_file"; open(TYPE,"<".$patchtype_file) || die "Could not open file $patchtype_file"; @@ -147,6 +139,8 @@ my %seq_id_to_alt_scaf_stop; my %seq_id_to_stop; my %seq_id_to_strand; +my $time = time(); + while (<TYPE>) { chomp; next if (/^#/); @@ -158,6 +152,13 @@ while (<TYPE>) { } +#alt_asm_name prim_asm_name alt_scaf_name alt_scaf_acc parent_type parent_name parent_acc region_name ori alt_scaf_start alt_scaf_stop parent_start parent_stop alt_start_tail alt_stop_tail +#PATCHES Primary Assembly HG104_HG975_PATCH GL383535.1 CHROMOSOME 8 CM000670.1 EPPK1_SPATC1 + 1 429806 144743526 145146062 0 0 +#PATCHES Primary Assembly HG991_PATCH GL383525.1 CHROMOSOME 3 CM000665.1 SLC25A26 + 27267 65063 66270271 66308065 27266 0 +#PATCHES Primary Assembly HG987_PATCH GL383561.1 CHROMOSOME 17 CM000679.1 REGION27 + 1 314281 21250948 21566608 0 92682 +#PATCHES Primary Assembly HSCHR12_1_CTG2 GL383549.1 CHROMOSOME 12 CM000674.1 REGION21 - 1 120804 28148967 28263711 0 0 + + while(<TXT>){ if(/^#/){ chomp; @@ -167,14 +168,14 @@ while(<TXT>){ $key_to_index{$name} = $i; $i++; } - foreach my $name (qw(alt_scaf_name alt_scaf_acc parent_type parent_name ori alt_scaf_start alt_scaf_stop parent_start parent_stop)){ + foreach my $name (qw(alt_scaf_name alt_scaf_acc parent_type parent_name ori alt_scaf_start alt_scaf_stop parent_start parent_stop alt_start_tail alt_stop_tail)){ if(!defined($key_to_index{$name})){ - print STDERR "PROBLEM could not find index for $name\n"; + print STDERR "PROBLEM could not find index for $name\n"; } else{ - print $name."\t".$key_to_index{$name}."\n"; + print $name."\t".$key_to_index{$name}."\n"; } - } + } } else{ my @arr = split; @@ -188,7 +189,7 @@ while(<TXT>){ $coord_sth->fetch(); if(!defined($coord_sys)){ - die "COuld not get coord_system_id for ".$arr[$key_to_index{'parent_type'}]; + die "Could not get coord_system_id for ".$arr[$key_to_index{'parent_type'}]; } my ($seq_region_id, $seq_length); @@ -196,17 +197,14 @@ while(<TXT>){ $get_seq_region_id_sth->bind_columns(\$seq_region_id,\$seq_length); $get_seq_region_id_sth->fetch(); if(!defined($seq_region_id)){ - die "COuld not get seq_region_id for ".$arr[$key_to_index{'parent_name'}]. " on coord system $coord_sys"; + die "Could not get seq_region_id for ".$arr[$key_to_index{'parent_name'}]. " on coord system $coord_sys"; } -# my $length = ($arr[$key_to_index{'alt_scaf_stop'}] - $arr[$key_to_index{'alt_scaf_start'}]) +1; $name_to_seq_id{$alt_name} = $max_seq_region_id; $name_to_seq_id{$alt_acc} = $max_seq_region_id; $seq_id_to_start{$max_seq_region_id} = $arr[$key_to_index{'parent_start'}]; - $seq_id_to_alt_scaf_start{$max_seq_region_id} = $arr[$key_to_index{'alt_scaf_start'}]; - $seq_id_to_alt_scaf_stop{$max_seq_region_id} = $arr[$key_to_index{'alt_scaf_stop'}]; - $seq_id_to_stop{$max_seq_region_id} = $arr[$key_to_index{'parent_stop'}]; + if ($arr[$key_to_index{'ori'}] eq "+"){ $seq_id_to_strand{$max_seq_region_id} = 1; } @@ -217,10 +215,12 @@ while(<TXT>){ print "Problem with strand.\n"; } - my $exc_len = $arr[$key_to_index{'parent_stop'}]-$arr[$key_to_index{'parent_start'}]; - my $new_length = $arr[$key_to_index{'alt_scaf_stop'}]-$arr[$key_to_index{'alt_scaf_start'}]; + my $exc_len = $arr[$key_to_index{'parent_stop'}] - $arr[$key_to_index{'parent_start'}] + 1; + my $new_length = $arr[$key_to_index{'alt_scaf_stop'}] - $arr[$key_to_index{'alt_scaf_start'}] + 1 + + $arr[$key_to_index{alt_start_tail}] + $arr[$key_to_index{alt_stop_tail}]; my $length = $seq_length + ($new_length-$exc_len); + $seq_id_to_stop{$max_seq_region_id} = $arr[$key_to_index{'parent_start'}] + $new_length -1; print SQL "insert into seq_region (seq_region_id, name, coord_system_id, length)\n"; print SQL "\tvalues ($max_seq_region_id, '$alt_name', $coord_sys, $length);\n\n"; @@ -232,11 +232,11 @@ while(<TXT>){ my $hap_type; if (exists $name_to_type{$alt_name} && defined $name_to_type{$alt_name}) { if ($name_to_type{$alt_name} =~ /fix/i) { - print SQL "insert into seq_region_attrib (seq_region_id, attrib_type_id, value) values ($max_seq_region_id, $patch_fix, now());\n"; - $hap_type = "'PATCH_FIX'"; + print SQL "insert into seq_region_attrib (seq_region_id, attrib_type_id, value) values ($max_seq_region_id, $patch_fix, $time);\n"; + $hap_type = "'PATCH_FIX'"; } elsif ($name_to_type{$alt_name} =~ /novel/i) { - print SQL "insert into seq_region_attrib (seq_region_id, attrib_type_id, value) values ($max_seq_region_id, $patch_novel, now());\n"; - $hap_type = "'PATCH_NOVEL'"; + print SQL "insert into seq_region_attrib (seq_region_id, attrib_type_id, value) values ($max_seq_region_id, $patch_novel, $time);\n"; + $hap_type = "'PATCH_NOVEL'"; } else { throw("Patch type ".$name_to_type{$alt_name}." for $alt_name not recognised"); } @@ -246,14 +246,13 @@ while(<TXT>){ print SQL "insert into assembly_exception (assembly_exception_id, seq_region_id, seq_region_start, seq_region_end, exc_type, exc_seq_region_id, exc_seq_region_start, exc_seq_region_end, ori)\n"; print SQL "\tvalues($max_assembly_exception_id, $max_seq_region_id, ", - $arr[$key_to_index{'parent_start'}], ",", #($arr[$key_to_index{'alt_scaf_start'}]+$arr[$key_to_index{'parent_start'}])-1, ",", - ($arr[$key_to_index{'alt_scaf_stop'}]+$arr[$key_to_index{'parent_start'}])-$arr[$key_to_index{'alt_scaf_start'}], ",", -# ($new_length+$arr[$key_to_index{'parent_start'}], ",", - "$hap_type, ", - $seq_region_id,", ", - $arr[$key_to_index{'parent_start'}], ", ", - $arr[$key_to_index{'parent_stop'}], ",", - " 1);\n\n"; + $arr[$key_to_index{'parent_start'}], ",", + ($new_length+$arr[$key_to_index{'parent_start'}]-1), ",", + "$hap_type, ", + $seq_region_id,", ", + $arr[$key_to_index{'parent_start'}], ", ", + $arr[$key_to_index{'parent_stop'}], ",", + " 1);\n\n"; $max_seq_region_id++; $max_assembly_exception_id++; @@ -287,28 +286,27 @@ while(<MAPPER>){ $get_seq_id_sth->bind_columns(\$cmp_seq_id); $get_seq_id_sth->fetch; if(!defined($cmp_seq_id)){ - print "Could not get seq_region_id for $contig trying pfetch\n"; - #prev mfetch -d embl -v fasta $contig > ./$contig.fa - my $out = system("pfetch $contig > ./$contig.fa"); - my $contig_seq =""; - open(FA,"<./$contig.fa") || die "Could not open file ./$contig.fa\n"; - while (<FA>){ - if(/^>/){ -# $contig_name =~ /^>(S+)/; -# print $contig_name."\n" - } - else{ - chomp; - $contig_seq .= $_; - } - } - close FA; - system("rm ./$contig.fa"); - load_seq_region($contig, \$contig_seq); - $cmp_seq_id = $name_to_seq_id{$contig}; + print "Could not get seq_region_id for $contig trying pfetch\n"; + my $out = system("pfetch $contig > ./$contig.fa"); + my $contig_seq =""; + open(FA,"<./$contig.fa") || die "Could not open file ./$contig.fa\n"; + while (<FA>){ + if(/^>/){ +# $contig_name =~ /^>(S+)/; +# print $contig_name."\n" + } + else{ + chomp; + $contig_seq .= $_; + } + } + close FA; + system("rm ./$contig.fa"); + load_seq_region($contig, \$contig_seq); + $cmp_seq_id = $name_to_seq_id{$contig}; } else{ - print "$contig already stored :-)\n"; + print "$contig already stored :-)\n"; } } @@ -322,88 +320,39 @@ while(<MAPPER>){ my $cmp_start; my $cmp_end; - $asm_start = ($p_start-($seq_id_to_alt_scaf_start{$seq_id}-1))+($seq_id_to_start{$seq_id}-1); + $asm_start = $p_start + $seq_id_to_start{$seq_id} - 1; $cmp_start = $c_start; - $asm_end = ($seq_id_to_start{$seq_id} - $seq_id_to_alt_scaf_start{$seq_id})+$p_end; + $asm_end = $seq_id_to_start{$seq_id} + $p_end - 1; $cmp_end = $c_end; - #for each combination of strand orientations + #for negative chromosome strand orientations #alter the chromosome level starts and ends if needed - #then deal with the cases where a contig extends beyond the part of the scaffold included - #in the chromosome/toplevel - #Do leftmost, relative to chrom and then rightmost. - #Cases where the contig is completely off the toplevel are caught at the end - - #if the contig is negative - if($strand == -1 and $seq_id_to_strand{$seq_id} == 1){ - if($seq_id_to_alt_scaf_start{$seq_id}>$p_start){ - $asm_start = $seq_id_to_start{$seq_id}; - $cmp_end = $c_start + ($p_end-$seq_id_to_alt_scaf_start{$seq_id}); - } - if($seq_id_to_alt_scaf_stop{$seq_id}<$p_end){ - $asm_end = $seq_id_to_start{$seq_id} + ($seq_id_to_alt_scaf_stop{$seq_id}-$seq_id_to_alt_scaf_start{$seq_id}); - $cmp_start = $c_end - ($seq_id_to_alt_scaf_stop{$seq_id} - $p_start); - } - }#if the scaffold is negative - elsif($strand == 1 and $seq_id_to_strand{$seq_id} == -1){ - - $asm_start = ($seq_id_to_alt_scaf_stop{$seq_id} - $p_end) + $seq_id_to_start{$seq_id}; - $asm_end = ($seq_id_to_alt_scaf_stop{$seq_id} - $p_start) + $seq_id_to_start{$seq_id}; - - if($seq_id_to_alt_scaf_stop{$seq_id}<$p_end){ - $asm_start = $seq_id_to_start{$seq_id}; - $cmp_end = $c_start + ($p_start - $seq_id_to_alt_scaf_start{$seq_id}); - } - if($seq_id_to_alt_scaf_start{$seq_id}>$p_start){ - $asm_end = $seq_id_to_start{$seq_id} + ($seq_id_to_alt_scaf_stop{$seq_id}-$seq_id_to_alt_scaf_start{$seq_id}); - $cmp_start = $c_end - ($p_end-$seq_id_to_alt_scaf_start{$seq_id}); - } - $strand = -1; - }#if they're both negative - elsif($strand == -1 and $seq_id_to_strand{$seq_id} == -1){ - $asm_start = ($seq_id_to_alt_scaf_stop{$seq_id} - $p_end) + $seq_id_to_start{$seq_id}; - $asm_end = ($seq_id_to_alt_scaf_stop{$seq_id} - $p_start) + $seq_id_to_start{$seq_id}; + if ($strand == 1 and $seq_id_to_strand{$seq_id} == -1){ - if($seq_id_to_alt_scaf_stop{$seq_id}<$p_end){ - $asm_start = $seq_id_to_start{$seq_id}; - $cmp_start = $c_end - ($seq_id_to_alt_scaf_stop{$seq_id} - $p_start); - } - if($seq_id_to_alt_scaf_start{$seq_id}>$p_start){ - $asm_end = $seq_id_to_start{$seq_id} + ($seq_id_to_alt_scaf_stop{$seq_id}-$seq_id_to_alt_scaf_start{$seq_id}); - $cmp_end = $c_start + ($p_end - $seq_id_to_alt_scaf_start{$seq_id}); - } - $strand = 1; - }#otherwise they should both be positive - else{ - if($seq_id_to_alt_scaf_start{$seq_id}>$p_start){ - $asm_start = $seq_id_to_start{$seq_id}; - $cmp_start = $c_end - ($p_end-$seq_id_to_alt_scaf_start{$seq_id}); - } - if($seq_id_to_alt_scaf_stop{$seq_id}<$p_end){ - $cmp_end = ($seq_id_to_alt_scaf_stop{$seq_id}-$p_start) + $c_start; - $asm_end = $seq_id_to_start{$seq_id} + ($seq_id_to_alt_scaf_stop{$seq_id}-$seq_id_to_alt_scaf_start{$seq_id}); - } - } + $asm_start = $seq_id_to_stop{$seq_id} - ($p_end -1); + $asm_end = $seq_id_to_stop{$seq_id} - ($p_start -1); + $strand = -1; + } elsif ($strand == -1 and $seq_id_to_strand{$seq_id} == -1){ - #if the contig isn't part of the toplevel/chromosome - if(($p_start>$seq_id_to_alt_scaf_stop{$seq_id}) or ($p_end<$seq_id_to_alt_scaf_start{$seq_id})){ - print "WARNING: Contig $contig is not part of chromosome level sequence from $acc.\n"; - } - else{ - print SQL "insert into assembly (asm_seq_region_id, cmp_seq_region_id, asm_start, asm_end, cmp_start , cmp_end, ori) \n"; - print SQL " values(".$seq_id.", ".$cmp_seq_id.", ".$asm_start.", ".$asm_end.", $cmp_start, $cmp_end, $strand);\n\n"; + $asm_start = $seq_id_to_stop{$seq_id} - ($p_end -1); + $asm_end = $seq_id_to_stop{$seq_id} - ($p_start -1); + $strand = 1; + } else { + # Positive seq_id_to_strand - DO NOTHING } + + print SQL "insert into assembly (asm_seq_region_id, cmp_seq_region_id, asm_start, asm_end, cmp_start , cmp_end, ori) \n"; + print SQL " values(".$seq_id.", ".$cmp_seq_id.", ".$asm_start.", ".$asm_end.", $cmp_start, $cmp_end, $strand);\n\n"; } else{ print "GAP of $contig length\n"; } - -} +} close SQL; -sub load_seq_region{ +sub load_seq_region { my $name = shift; my $seq = shift; @@ -426,11 +375,7 @@ sub load_seq_region{ print "adding new seq region $name length of sequence = ".length($$seq)."\n"; print SQL "insert into seq_region (seq_region_id, name, coord_system_id, length) values ($max_seq_region_id, '$name', $contig_coord_id, ".length($$seq).");\n"; -# $insert_seq_region_sth->execute($max_seq_region_id, $name, $contig_coord_id, length($$seq)) -# || die "Could not insert seq region for $name"; print SQL "insert into dna (seq_region_id, sequence) values ($max_seq_region_id,'".$$seq."');\n\n"; -# $insert_dna_sth->execute($max_seq_region_id, $seq) -# || die "Could not add seq for $name"; $name_to_seq_id{$name} = $max_seq_region_id; $max_seq_region_id++; -- GitLab